In [4]:
import numpy as np
import pandas as pd
import ast

movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [5]:
movies= movies.merge(credits,on='title')

In [6]:
movies = movies[['movie_id','title','genres', 'overview', 'keywords', 'cast', 'crew' ]]

In [7]:
movies.duplicated().sum()
movies.isnull().sum()
movies.dropna(inplace=True)

In [8]:
movies.iloc[21].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'

In [9]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [10]:
movies['genres'] = movies['genres'].apply(convert)

In [11]:
movies['keywords'] = movies['keywords'].apply(convert)

In [12]:
movies['cast'] = movies['cast'].apply(convert3)

In [13]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [14]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [15]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])

In [16]:
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

In [17]:
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])

In [18]:
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [19]:
movies['tags'] = movies ['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_df = movies [['movie_id', 'title', 'tags']]

In [21]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [22]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [24]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [25]:
cv.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950',
 '1950s',
 '1960s',
 '1970s',
 '1980',
 '1980s',
 '1985',
 '1990s',
 '1999',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 '70s',
 'aaron',
 'aaroneckhart',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accomplish',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'acting',
 'action',
 'actionhero',
 'actions',
 'activist',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adams',
 'adamsandler',
 'adamshankman',
 'adaptation',
 'adapted',
 'addict',
 'addicted',
 'addiction',
 'adolescence',
 'adolescent'

In [26]:
import nltk

In [27]:
from nltk.stem.porter import PorterStemmer

In [28]:
ps = PorterStemmer()

In [29]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [30]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
similarity = cosine_similarity(vectors)

In [33]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:6]
    
    for i in movies_list:
        print (new_df.iloc[i[0]].title)

In [34]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem


In [35]:
import pickle

In [40]:
pickle.dump(new_df,open("movies.pkl","wb"))

In [41]:
pickle.dump(similarity,open("similarity.pkl","wb"))

In [42]:
pickle.dump(new_df.to_dict(),open("movies_dict.pkl","wb"))