In [57]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [58]:
df=pd.read_csv('movies_metadata.csv')

In [59]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [60]:
# Select features for movie recommendation system
features_to_keep = [ 'title', 'overview', 'genres','tagline','vote_average','popularity']

# keeping the useful columns only
df_optimized = df[features_to_keep].copy()

In [61]:
df_optimized.head(1)

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7.7,21.946943


In [62]:
df_optimized.isnull().sum()

title               6
overview          954
genres              0
tagline         25054
vote_average        6
popularity          5
dtype: int64

In [63]:
# Remove rows with missing critical values
df_optimized = df_optimized.dropna(subset=["title"])
# fill overview with empty spaces
df_optimized['genres']=df_optimized['genres'].fillna("")
df_optimized['overview']=df_optimized['overview'].fillna("")
df_optimized['tagline']=df_optimized['tagline'].fillna("")

In [64]:
import ast
df_optimized['genres'] = df_optimized['genres'].apply(lambda x :" ".join([i['name'] for i in ast.literal_eval(x)]))

In [65]:
df_optimized['tags']=df_optimized['genres']+" "+df_optimized['overview']+" "+df_optimized['tagline']

# NLP WORK

In [66]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


In [67]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anirban/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/anirban/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [68]:
df_optimized= df_optimized.drop_duplicates(subset=['title'])

In [69]:
stop_words=set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [70]:
def word_normalizer(txt):
    text=str(txt).lower()
    
    text=re.sub(r'[^a-zA-Z\s]','',text)
    
    words=text.split()
    
    words=[word for word in words if word not in stop_words]
    
    words=[lemmatizer.lemmatize(word) for word in words]
    
    return " ".join(words)
    

In [71]:
gs=df_optimized['tags'].apply(word_normalizer)

In [72]:
df_optimized = df_optimized.reset_index(drop = True)
indices = pd.Series(df_optimized.index,index = df_optimized['title']).drop_duplicates()
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Caged Heat 3000                42272
Subdue                         42273
Century of Birthing            42274
Satan Triumphant               42275
Queerama                       42276
Length: 42277, dtype: int64

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [74]:
tfidf=TfidfVectorizer(max_features= 500000,ngram_range=(1,2),stop_words="english")

tfidf_matrix=tfidf.fit_transform(df_optimized['tags'])

## cosin similarity

In [75]:
from sklearn.metrics.pairwise import cosine_similarity

In [76]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2057430 stored elements and shape (42277, 500000)>

In [77]:
def find_similarity(title,n=10):
    if title not in indices:
        print("film not found data frame")
    else:
        idx=indices[title] 
        similarity=cosine_similarity(tfidf_matrix[idx],tfidf_matrix).flatten()
        idx_of_sims=similarity.argsort()[::-1][1:n+1]
        name=df_optimized['title'].iloc[idx_of_sims].to_list()
        return name

In [78]:
find_similarity('Avatar')

['Avatar 2',
 'Hellraiser: Bloodline',
 'Désiré',
 'France société anonyme',
 'Nightmare City 2035',
 'The Inhabited Island',
 'The War of the Robots',
 'Lara Croft Tomb Raider: The Cradle of Life',
 'Stand by Me Doraemon',
 'Bloodbrothers']

In [81]:
import pickle 

pickle.dump(df_optimized,open("df_optimized.pkl","wb"))
pickle.dump(tfidf_matrix,open("tfidf_matrix.pkl","wb"))
pickle.dump(indices,open("indices.pkl","wb"))
pickle.dump(tfidf,open("tfidf.pkl","wb"))

In [80]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Caged Heat 3000                42272
Subdue                         42273
Century of Birthing            42274
Satan Triumphant               42275
Queerama                       42276
Length: 42277, dtype: int64