# Imports

In [110]:
# Linear Algebra, Data Manipulation
import numpy as np
import pandas as pd

# Plots and Graphs
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

# Clean Overviews
import nltk
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

## Getting the Data

In [111]:
movies = pd.read_csv("./datasets/tmdb_5000_movies.csv")
credits = pd.read_csv("./datasets/tmdb_5000_credits.csv")

In [112]:
movies.shape, credits.shape

((4803, 20), (4803, 4))

In [113]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [114]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Merging Two DataFrame together on the "id" column

In [115]:
movies = movies.merge(credits, on="title")
movies.shape

(4809, 23)

In [116]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Keeping only the Necessary Columns

In [117]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [118]:
columns_to_keep = ["id", "title", "overview", "keywords", "genres", "cast", "crew"]

movies = movies[columns_to_keep]
movies.head()

Unnamed: 0,id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Dealing with Missing and Duplicate Data

In [119]:
movies.isnull().sum()

id          0
title       0
overview    3
keywords    0
genres      0
cast        0
crew        0
dtype: int64

In [120]:
# Filling null Overvies with an empty string
movies.overview.fillna(" ", inplace=True)

In [121]:
movies.duplicated().sum()
# We do not have any duplicate records in our data

0

## Function to clean keywords, genres and cast column

In [122]:
import json

def string_list_to_list(data):
    data = json.loads(data)
    items = []
    for i in data:
        items.append(i["name"].replace(" ", "").lower())
    return items

### Cleaning genres

In [123]:
movies["genres"] = movies.genres.apply(lambda x: string_list_to_list(x)) # Run Only Once Other wise will give error
movies["genres"]

0       [action, adventure, fantasy, sciencefiction]
1                       [adventure, fantasy, action]
2                         [action, adventure, crime]
3                   [action, crime, drama, thriller]
4                [action, adventure, sciencefiction]
                            ...                     
4804                       [action, crime, thriller]
4805                               [comedy, romance]
4806               [comedy, drama, romance, tvmovie]
4807                                              []
4808                                   [documentary]
Name: genres, Length: 4809, dtype: object

### Cleaning keywords

In [124]:
movies["keywords"] = movies.keywords.apply(lambda x: string_list_to_list(x)) # Run Only Once Other wise will give error
movies["keywords"]

0       [cultureclash, future, spacewar, spacecolony, ...
1       [ocean, drugabuse, exoticisland, eastindiatrad...
2       [spy, basedonnovel, secretagent, sequel, mi6, ...
3       [dccomics, crimefighter, terrorist, secretiden...
4       [basedonnovel, mars, medallion, spacetravel, p...
                              ...                        
4804    [unitedstates–mexicobarrier, legs, arms, paper...
4805                                                   []
4806    [date, loveatfirstsight, narration, investigat...
4807                                                   []
4808             [obsession, camcorder, crush, dreamgirl]
Name: keywords, Length: 4809, dtype: object

### Cleaning cast

In [125]:
movies["cast"] = movies.cast.apply(lambda x: string_list_to_list(x)[:3]) # Run Only Once Other wise will give error
movies["cast"]

0        [samworthington, zoesaldana, sigourneyweaver]
1           [johnnydepp, orlandobloom, keiraknightley]
2            [danielcraig, christophwaltz, léaseydoux]
3            [christianbale, michaelcaine, garyoldman]
4          [taylorkitsch, lynncollins, samanthamorton]
                             ...                      
4804    [carlosgallardo, jaimedehoyos, petermarquardt]
4805         [edwardburns, kerrybishé, marshadietlein]
4806           [ericmabius, kristinbooth, crystallowe]
4807            [danielhenney, elizacoupe, billpaxton]
4808    [drewbarrymore, brianherzlinger, coreyfeldman]
Name: cast, Length: 4809, dtype: object

## Function to clean crew column and get Director Name

In [126]:
def get_director(data):
    data = json.loads(data)
    director = []
    for i in data:
        if i["job"] == "Director":
            director.append(i["name"].replace(" ", "").lower())
            break
    return director

In [127]:
movies["crew"] = movies.crew.apply(lambda x: get_director(x)) # Run Only Once Other wise will give error
movies["crew"]

0           [jamescameron]
1          [goreverbinski]
2              [sammendes]
3       [christophernolan]
4          [andrewstanton]
               ...        
4804     [robertrodriguez]
4805         [edwardburns]
4806          [scottsmith]
4807          [danielhsia]
4808     [brianherzlinger]
Name: crew, Length: 4809, dtype: object

## Cleaning and Removing Stop words from Overview

In [128]:
movies["overview"] = movies.overview.apply(lambda x: str(x).split())

stops = set(stopwords.words('english'))

overviews = []
for i in movies.overview.values:
    L = []
    for j in i:
        j = PorterStemmer().stem(j.strip(",").replace(" ", "").lower())
        if j not in stops:
            L.append(j)
    overviews.append(L)
    
movies["overview"] = overviews
movies["overview"]

0       [22nd, centuri, parapleg, marin, dispatch, moo...
1       [captain, barbossa, long, believ, dead, ha, co...
2       [cryptic, messag, bond’, past, send, trail, un...
3       [follow, death, district, attorney, harvey, de...
4       [john, carter, war-weari, former, militari, ca...
                              ...                        
4804    [el, mariachi, want, play, hi, guitar, carri, ...
4805    [newlyw, couple', honeymoon, upend, arriv, res...
4806    ["sign, seal, delivered", introduc, dedic, qua...
4807    [ambiti, new, york, attorney, sam, sent, shang...
4808    [ever, sinc, second, grade, first, saw, e.t., ...
Name: overview, Length: 4809, dtype: object

In [129]:
movies["title"]

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4804                                 El Mariachi
4805                                   Newlyweds
4806                   Signed, Sealed, Delivered
4807                            Shanghai Calling
4808                           My Date with Drew
Name: title, Length: 4809, dtype: object

In [130]:
# Viewing cleaned data
movies.head()

Unnamed: 0,id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"[22nd, centuri, parapleg, marin, dispatch, moo...","[cultureclash, future, spacewar, spacecolony, ...","[action, adventure, fantasy, sciencefiction]","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believ, dead, ha, co...","[ocean, drugabuse, exoticisland, eastindiatrad...","[adventure, fantasy, action]","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]
2,206647,Spectre,"[cryptic, messag, bond’, past, send, trail, un...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[action, adventure, crime]","[danielcraig, christophwaltz, léaseydoux]",[sammendes]
3,49026,The Dark Knight Rises,"[follow, death, district, attorney, harvey, de...","[dccomics, crimefighter, terrorist, secretiden...","[action, crime, drama, thriller]","[christianbale, michaelcaine, garyoldman]",[christophernolan]
4,49529,John Carter,"[john, carter, war-weari, former, militari, ca...","[basedonnovel, mars, medallion, spacetravel, p...","[action, adventure, sciencefiction]","[taylorkitsch, lynncollins, samanthamorton]",[andrewstanton]


## Creating tags column

In [131]:
movies["tags"] = movies.overview + movies.keywords + movies.genres + movies.cast + movies.crew
movies.head()

Unnamed: 0,id,title,overview,keywords,genres,cast,crew,tags
0,19995,Avatar,"[22nd, centuri, parapleg, marin, dispatch, moo...","[cultureclash, future, spacewar, spacecolony, ...","[action, adventure, fantasy, sciencefiction]","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron],"[22nd, centuri, parapleg, marin, dispatch, moo..."
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believ, dead, ha, co...","[ocean, drugabuse, exoticisland, eastindiatrad...","[adventure, fantasy, action]","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski],"[captain, barbossa, long, believ, dead, ha, co..."
2,206647,Spectre,"[cryptic, messag, bond’, past, send, trail, un...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[action, adventure, crime]","[danielcraig, christophwaltz, léaseydoux]",[sammendes],"[cryptic, messag, bond’, past, send, trail, un..."
3,49026,The Dark Knight Rises,"[follow, death, district, attorney, harvey, de...","[dccomics, crimefighter, terrorist, secretiden...","[action, crime, drama, thriller]","[christianbale, michaelcaine, garyoldman]",[christophernolan],"[follow, death, district, attorney, harvey, de..."
4,49529,John Carter,"[john, carter, war-weari, former, militari, ca...","[basedonnovel, mars, medallion, spacetravel, p...","[action, adventure, sciencefiction]","[taylorkitsch, lynncollins, samanthamorton]",[andrewstanton],"[john, carter, war-weari, former, militari, ca..."


In [132]:
new_movies_df = movies[["id", "title", "tags"]] # Getting only the necessary columns
new_movies_df["tags"] = new_movies_df.tags.apply(lambda x: " ".join(x)) # Converting tags list to tags Paragraph
new_movies_df # Viewing the final data

Unnamed: 0,id,title,tags
0,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believ dead ha come back...
2,206647,Spectre,cryptic messag bond’ past send trail uncov sin...
3,49026,The Dark Knight Rises,follow death district attorney harvey dent bat...
4,49529,John Carter,john carter war-weari former militari captain ...
...,...,...,...
4804,9367,El Mariachi,el mariachi want play hi guitar carri famili t...
4805,72766,Newlyweds,newlyw couple' honeymoon upend arriv respect s...
4806,231617,"Signed, Sealed, Delivered","""sign seal delivered"" introduc dedic quartet c..."
4807,126186,Shanghai Calling,ambiti new york attorney sam sent shanghai ass...


## Vectorization

In [133]:
# Limiting the maximum words to 5000 and removing the english stopwords
# We have removed stop words seperately so specifying the stop_words="english" will bring no change
cv = CountVectorizer(max_features=5000, stop_words="english")

In [134]:
vectors = cv.fit_transform(new_movies_df["tags"]).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [135]:
vectors.shape

(4809, 5000)

In [136]:
# print(cv.get_feature_names()) # Uncomment and run to view all the 5000 words vectorized

#### Now we will calculate the Cosine Distance i.e., the angle between vectors to find similarities and dis-similarities between them.
#### Distance ∝ 1/Similarity

In [137]:
# Calculating Cosine Distance
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.08838835, 0.05986843, ..., 0.02268046, 0.        ,
        0.        ],
       [0.08838835, 1.        , 0.06350006, ..., 0.02405626, 0.        ,
        0.02665009],
       [0.05986843, 0.06350006, 1.        , ..., 0.04888237, 0.        ,
        0.        ],
       ...,
       [0.02268046, 0.02405626, 0.04888237, ..., 1.        , 0.03888079,
        0.0410305 ],
       [0.        , 0.        , 0.        , ..., 0.03888079, 1.        ,
        0.0861461 ],
       [0.        , 0.02665009, 0.        , ..., 0.0410305 , 0.0861461 ,
        1.        ]])

In [138]:
similarity.shape

(4809, 4809)

In [139]:
similarity[0] # Gives the distance of 1st movie from all the other movies

# 1 = completely similar
# 0 = not at all similar
# so greater the value greater the similarity

array([1.        , 0.08838835, 0.05986843, ..., 0.02268046, 0.        ,
       0.        ])

# Build the Recommender

In [174]:
def recommend(movie):
    movie_index = 0  
    # Try to find a perfect match 
    for index, title in enumerate(new_movies_df.title):
        if movie == title:
            movie_index = index
            break
     
    similar_movies = sorted(
        list(enumerate(similarity[movie_index])), reverse=True, key=lambda x: x[1]
    )[1: 6]
    
    for i in similar_movies:
        data = new_movies_df[["id", "title"]].iloc[i[0]]
        print([data[0], data[1]])

In [175]:
recommend("avatar")

[440, 'Aliens vs Predator: Requiem']
[602, 'Independence Day']
[679, 'Aliens']
[7450, 'Titan A.E.']
[34851, 'Predators']


In [176]:
recommend("dark knight")

[440, 'Aliens vs Predator: Requiem']
[602, 'Independence Day']
[679, 'Aliens']
[7450, 'Titan A.E.']
[34851, 'Predators']


In [177]:
# Saving the movies data into binary file to use in webapp
new_movies_df.to_pickle("./webapp/movies.pkl")

In [165]:
pd.DataFrame(similarity).to_pickle("./webapp/similarity_matrix.pkl")