In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise.model_selection import KFold
from surprise.model_selection.validation import cross_validate
from sklearn.feature_extraction.text import CountVectorizer
from surprise import SVD,Reader,Dataset
from ast import literal_eval


#function to convert the datatype into int
def clean_ids(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` when that is not possible.

    Parameters
    ----------
    x : any value accepted by ``int()`` (e.g. a numeric string or a float).

    Returns
    -------
    int or float
        ``int(x)`` on success, ``np.nan`` when ``int(x)`` raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        return int(x)
    # Only conversion failures are treated as "missing"; a bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan
    
    
#function to split the strings separated by ','
def listing(x):
    """Split ``x`` on commas, or return ``np.nan`` when ``x`` cannot be split.

    Parameters
    ----------
    x : expected to be a ``str``.

    Returns
    -------
    list of str or float
        ``x.split(",")`` for strings; ``np.nan`` when ``x`` has no usable
        ``split`` method (``AttributeError``) or rejects a ``str`` separator
        (``TypeError``).
    """
    try:
        return x.split(",")
    # Only "not splittable" is mapped to NaN; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (AttributeError, TypeError):
        return np.nan

    
#loading the dataframe from all the datasets
#low_memory=False makes pandas infer each column's dtype from the whole file
#instead of chunk by chunk.
#NOTE(review): the stray "datadf = pd.read_csv(...)" line in this cell's output
#looks like the tail of a pandas DtypeWarning raised by this read - confirm.
datadf = pd.read_csv('../dataset/movies_metadata.csv', low_memory=False)
linkdf = pd.read_csv('../dataset/links_small.csv')
ratedf = pd.read_csv('../dataset/ratings_small.csv')
creddf = pd.read_csv('../dataset/credits.csv')
keydf = pd.read_csv('../dataset/keywords.csv')


#replace missing overviews with '' and then split every overview on ','
datadf['overview'] = datadf['overview'].fillna('').apply(listing)

#keep only the first row for every repeated id
keydf = keydf.drop_duplicates(subset="id", keep='first')
creddf = creddf.drop_duplicates(subset="id", keep='first')

#keep only the first row for every repeated title
datadf = datadf.drop_duplicates(subset="title", keep='first')

#cast the id columns of the keyword and credit frames to int
for frame in (keydf, creddf):
    frame['id'] = frame['id'].astype('int')

#metadata ids: coerce with clean_ids, drop rows whose id did not parse, cast to int
datadf['id'] = datadf['id'].apply(clean_ids)
datadf = datadf.dropna(subset=['id']).astype({'id': 'int'})

#link tmdbIds get the same three-step treatment
linkdf['tmdbId'] = linkdf['tmdbId'].apply(clean_ids)
linkdf = linkdf.dropna(subset=['tmdbId']).astype({'tmdbId': 'int'})

#copy id into a tmdbId column so the metadata can be joined to the links,
#then join credits and keywords on id (default inner joins throughout)
datadf = (
    datadf
    .assign(tmdbId=datadf['id'])
    .merge(linkdf, on='tmdbId')
    .merge(creddf, on='id')
    .merge(keydf, on='id')
)

#these columns are read as strings; literal_eval turns each cell into a Python object
features = ['cast', 'crew', 'keywords', 'genres']
parsed = {name: datadf[name].apply(literal_eval) for name in features}
datadf = datadf.assign(**parsed)
    
#extracting the director's name
def crewdirector(x):
    """Return the ``'name'`` of the first entry in ``x`` whose ``'job'`` is ``'Director'``.

    ``x`` is iterated in order and each entry is indexed with ``'job'`` and
    ``'name'``. ``np.nan`` is returned when no entry matches.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)
#one director name (or NaN) per movie, taken from its crew list
datadf = datadf.assign(director=datadf['crew'].apply(crewdirector))

#extracting the leading names (the first two by default)
def generate_list(x, limit=2):
    """Return the ``'name'`` of the first ``limit`` entries of ``x``.

    Parameters
    ----------
    x : list of dict
        Each entry is indexed with ``'name'``. Any non-list value yields ``[]``.
    limit : int or None, default 2
        Maximum number of names to keep; ``None`` keeps all of them.

    Returns
    -------
    list
        At most ``limit`` names, in their original order.

    NOTE(review): the comment that used to sit above this function said
    "top three names", but the code has always kept two. The default preserves
    the existing two-name behaviour; pass ``limit=3`` if three was intended.
    """
    if not isinstance(x, list):
        return []
    # Slicing already copes with lists shorter than ``limit``.
    return [key['name'] for key in x][:limit]

#reduce cast and keywords with generate_list; keep at most three genre names
for column in ('cast', 'keywords'):
    datadf[column] = datadf[column].apply(generate_list)
datadf['genres'] = datadf['genres'].apply(
    lambda x: [i['name'] for i in x][:3] if isinstance(x, list) else []
)



def cleanspaces(x):
    """Remove spaces from ``x`` and lowercase it.

    A list is processed item by item and returned as a new list, a string is
    returned as a single cleaned string, and any other value becomes ``''``.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''
        
#strip spaces and lowercase the four text features
for column in ('cast', 'director', 'genres', 'keywords'):
    datadf[column] = datadf[column].apply(cleanspaces)
        
#merge the different features
def mergee(x):
    """Build one space-separated string from a row's text features.

    ``x`` is indexed with ``'keywords'``, ``'cast'``, ``'genres'`` and
    ``'overview'`` (each joined with single spaces) and ``'director'`` (used
    as is). The pieces are concatenated in the order keywords, cast,
    director, genres, overview.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
        ' '.join(x['overview']),
    ]
    return ' '.join(pieces)

#creating new column merged: one text blob per movie, built row by row with mergee
datadf['merged'] = datadf.apply(mergee, axis=1)


#making the similarity matrix
#TF-IDF over the merged text with English stop words removed; linear_kernel
#returns the dot product between every pair of rows of matrix. With
#TfidfVectorizer's default L2 row normalisation these are cosine similarities.
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(datadf['merged'])
simmatrix = linear_kernel(matrix, matrix)

#reset_index() is called without drop=True, so the previous index is kept as
#an 'index' column and is written to the CSV below as well.
#indices maps a title to its row position in datadf (and so in simmatrix).
datadf = datadf.reset_index()
indices = pd.Series(datadf.index, index=datadf['title'])

#making the dataset of smaller movies
datadf.to_csv('../dataset/movies_small.csv',index=False)

#building SVD based collaborative filter
#NOTE(review): Reader() keeps its default rating scale - confirm it covers the
#range of ratings present in ratings_small.csv.
reader = Reader()
data = Dataset.load_from_df(ratedf[['userId', 'movieId', 'rating']], reader)

#5-fold cross-validation. kf is passed to cross_validate so the splitter
#configured here is the one that is used; the earlier bare kf.split(data) call
#only created a generator that was never consumed.
#NOTE(review): no random_state is set on KFold or SVD, so results presumably
#vary between runs - confirm whether reproducibility is required.
kf = KFold(n_splits=5)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=kf)

#refit on every available rating to obtain the model used for predictions
trainset = data.build_full_trainset()
svd.fit(trainset)

#linkdf already holds links_small.csv with tmdbId cleaned and cast to int and is
#not modified after that, so it is reused here instead of re-reading and
#re-cleaning the same file
links2df = linkdf.merge(datadf[['title', 'tmdbId']], on='tmdbId')

#lookup tables over the link ids: links2df is keyed by title and links3df by
#tmdbId (set_index replaces the title index, so links3df has no title column)
links2df = links2df.set_index('title')
links3df = links2df.set_index('tmdbId')

#making the final recommendation function
def recfunction(user_id, title, n_candidates=24, top_n=10):
    """Recommend movies similar to ``title``, ranked by the rating predicted for ``user_id``.

    The ``n_candidates`` movies with the highest similarity to ``title`` in
    ``simmatrix`` are selected, each is scored with ``svd.predict`` for
    ``user_id``, and the ``top_n`` highest estimates are returned.

    Parameters
    ----------
    user_id : value passed to ``svd.predict`` as the user id.
    title : str
        Movie title, looked up in ``indices``; an unknown title raises
        ``KeyError``.
    n_candidates : int, default 24
        Number of most-similar movies to score.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Runtime``, ``Vote Average``, ``TMDb Id`` and
        ``Estimated Prediction``, sorted by the estimate in descending order.
    """
    indx = indices[title]
    scores = simmatrix[indx]
    # Exclude the query movie by position rather than assuming it sorts first:
    # a movie with an identical (or empty) feature string ties with it, and the
    # stable sort would then leave the query itself inside the candidate list.
    others = (i for i in range(len(scores)) if i != indx)
    movie_indices = sorted(others, key=lambda i: scores[i], reverse=True)[:n_candidates]
    # .copy() so the 'est' column is added to an independent frame, not to a
    # selection taken from datadf.
    pred_movies = datadf.iloc[movie_indices][['title', 'runtime', 'vote_average', 'tmdbId']].copy()
    # svd was fitted on movieId, so each tmdbId is translated through links3df.
    pred_movies['est'] = pred_movies['tmdbId'].apply(
        lambda x: svd.predict(user_id, links3df.loc[x, 'movieId']).est
    )
    pred_movies = pred_movies.sort_values('est', ascending=False)
    pred_movies.columns = ['Title', 'Runtime', 'Vote Average', 'TMDb Id', 'Estimated Prediction']
    return pred_movies.head(top_n)


  datadf = pd.read_csv('../dataset/movies_metadata.csv')


In [26]:
#sample call: recommendations for user 2 based on 'The Maze Runner'
recfunction(user_id=2, title='The Maze Runner')

Unnamed: 0,Title,Runtime,Vote Average,TMDb Id,Estimated Prediction
1520,Labyrinth,101.0,7.1,13597,3.85734
1502,A Man for All Seasons,120.0,7.5,874,3.792735
744,Rebecca,130.0,7.7,223,3.759375
6815,1984,90.0,7.0,1984,3.641828
5345,Angels with Dirty Faces,97.0,7.6,13696,3.611614
7171,The Book of Eli,118.0,6.6,20504,3.608152
7068,9,79.0,6.6,12244,3.606947
7386,1990: The Bronx Warriors,89.0,4.2,61755,3.54996
3871,Waydowntown,84.0,6.0,13915,3.541817
8501,Maggie,95.0,5.2,287424,3.525414


In [23]:
#sample call: recommendations for user 500 based on 'The Conjuring 2'
recfunction(user_id=500, title='The Conjuring 2')

Unnamed: 0,Title,Runtime,Vote Average,TMDb Id,Estimated Prediction
4337,My Neighbor Totoro,86.0,8.0,8392,3.811077
7601,Clapham Junction,99.0,5.2,39517,3.349612
1615,The Purple Rose of Cairo,82.0,7.3,10849,3.320961
7487,Insidious,103.0,6.8,49018,3.301616
838,Secrets & Lies,142.0,7.1,11159,3.277896
3037,The Spiral Staircase,83.0,6.6,27452,3.135256
8599,Ashby,102.0,6.2,330112,3.085202
7491,Henry's Crime,108.0,5.8,53172,3.076349
3650,The Others,101.0,7.4,1933,3.05586
8464,Furious 7,137.0,7.3,168259,3.055793


In [7]:
import pickle
#write through a context manager so the file handle is closed once the dump finishes
with open('movies.pkl', 'wb') as fh:
    pickle.dump(datadf, fh)

In [8]:
#userdf is bound to the same DataFrame object as ratedf; no copy is made here
userdf = ratedf

In [9]:
#one row per user: keep the first row for each userId.
#userdf is rebound to the result instead of using inplace=True, because userdf
#and ratedf are the same object and an in-place drop would also strip ratedf
#down to one rating per user.
userdf = userdf.drop_duplicates(subset="userId", keep='first')


In [10]:
#write through a context manager so the file handle is closed once the dump finishes
with open('users.pkl', 'wb') as fh:
    pickle.dump(userdf, fh)

In [11]:
#write through a context manager so the file handle is closed once the dump finishes
with open('links.pkl', 'wb') as fh:
    pickle.dump(links2df, fh)

In [12]:
#write through a context manager so the file handle is closed once the dump finishes
with open('links1.pkl', 'wb') as fh:
    pickle.dump(links3df, fh)

In [13]:
#write through a context manager so the file handle is closed once the dump finishes
with open('indices.pkl', 'wb') as fh:
    pickle.dump(indices, fh)

In [14]:
#write through a context manager so the file handle is closed once the dump finishes
with open('matrix.pkl', 'wb') as fh:
    pickle.dump(simmatrix, fh)

In [15]:
#write through a context manager so the file handle is closed once the dump finishes
with open('svd1.pkl', 'wb') as fh:
    pickle.dump(svd, fh)