In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import pickle

In [None]:
# Read the TMDB credits table and the movie metadata table from the Colab
# working directory.
credits_data = pd.read_csv("/content/tmdb_5000_credits.csv")
movies_data = pd.read_csv("/content/tmdb_5000_movies.csv")

# Join the two tables on their shared "title" column.
# NOTE(review): titles are presumably not guaranteed to be unique, in which
# case this join emits extra rows for repeated titles -- confirm against the data.
movies_data = pd.merge(movies_data, credits_data, on="title")

In [None]:
# Summarise the merged frame: column names, non-null counts and dtypes.
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1964 entries, 0 to 1963
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                1964 non-null   int64  
 1   genres                1964 non-null   object 
 2   homepage              851 non-null    object 
 3   id                    1964 non-null   int64  
 4   keywords              1964 non-null   object 
 5   original_language     1964 non-null   object 
 6   original_title        1964 non-null   object 
 7   overview              1964 non-null   object 
 8   popularity            1964 non-null   float64
 9   production_companies  1964 non-null   object 
 10  production_countries  1964 non-null   object 
 11  release_date          1964 non-null   object 
 12  revenue               1964 non-null   int64  
 13  runtime               1964 non-null   float64
 14  spoken_languages      1964 non-null   object 
 15  status               

In [None]:
# Columns kept for building the content-based tags.
good_cols = ['id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']
# Additional columns of interest; not used anywhere else in this cell.
num_good_cols = ['release_date', 'revenue']

# Work on an independent copy, drop rows with any missing value, then renumber
# the rows 0..n-1. Later cells use the frame's index labels as row positions
# in the similarity matrix, so the labels must not keep the gaps that
# dropna() leaves behind.
movie_data = movies_data[good_cols].copy().dropna().reset_index(drop=True)

# Peek at one raw `crew` value to see the stringified list-of-dicts format
# that the parsing helpers below have to handle.
movie_data['crew'].iloc[1]

'[{"credit_id": "52fe4232c3a36847f800b579", "department": "Camera", "gender": 2, "id": 120, "job": "Director of Photography", "name": "Dariusz Wolski"}, {"credit_id": "52fe4232c3a36847f800b4fd", "department": "Directing", "gender": 2, "id": 1704, "job": "Director", "name": "Gore Verbinski"}, {"credit_id": "52fe4232c3a36847f800b54f", "department": "Production", "gender": 2, "id": 770, "job": "Producer", "name": "Jerry Bruckheimer"}, {"credit_id": "52fe4232c3a36847f800b503", "department": "Writing", "gender": 2, "id": 1705, "job": "Screenplay", "name": "Ted Elliott"}, {"credit_id": "52fe4232c3a36847f800b509", "department": "Writing", "gender": 2, "id": 1706, "job": "Screenplay", "name": "Terry Rossio"}, {"credit_id": "52fe4232c3a36847f800b57f", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "52fe4232c3a36847f800b585", "department": "Editing", "gender": 2, "id": 1722, "job": "Editor", "name": "Craig Wood"}, {"credit_id": "52f

In [None]:
import ast
def helper(obj):
    """Return the ``name`` of every entry in a stringified list of dicts.

    ``obj`` is the text of a Python-literal list of dictionaries; it is parsed
    with ``ast.literal_eval`` and the names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

def helper_cast(obj):
    """Return the ``name`` of at most the first three entries of ``obj``.

    ``obj`` is the text of a Python-literal list of dictionaries, parsed with
    ``ast.literal_eval``. Entries after the third are never inspected.
    """
    entries = ast.literal_eval(obj)
    # zip() stops as soon as range(3) is exhausted, which caps the result at
    # three names without touching any later entry.
    return [entry['name'] for _, entry in zip(range(3), entries)]

def fetch_names(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    ``obj`` is the text of a Python-literal list of dictionaries, parsed with
    ``ast.literal_eval``. The first entry whose ``job`` equals ``'Director'``
    wins; if there is none the result is an empty list.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

def return_names(obj):
    """Join the strings in ``obj`` into one string separated by single spaces."""
    separator = " "
    return separator.join(obj)



In [None]:
# Splits free text into a list of whitespace-separated words.
split_words = lambda text: text.split()

# Which converter turns each raw column into a list of tokens.
column_parsers = {
    "genres": helper,
    "keywords": helper,
    "crew": fetch_names,
    "cast": helper_cast,
    "overview": split_words,
    "title": split_words,
}

# Each column is converted independently, so one loop covers them all.
for column, parser in column_parsers.items():
    movie_data[column] = movie_data[column].apply(parser)

In [None]:
# Remove the spaces inside every token of the list-valued columns, so that a
# multi-word value becomes a single token.
for column in ("genres", "keywords", "cast", "overview", "crew"):
    movie_data[column] = movie_data[column].apply(
        lambda tokens: [token.replace(" ", "") for token in tokens]
    )



In [None]:
# Concatenate the per-movie token lists, in this order, into one `tags` list.
tag_columns = ["title", "genres", "keywords", "cast", "crew"]
tags = movie_data[tag_columns[0]]
for column in tag_columns[1:]:
    tags = tags + movie_data[column]
movie_data["tags"] = tags

In [None]:
# `title` was split into words to take part in the tags; turn it back into a
# single space-separated string.
movie_data["title"] = movie_data["title"].apply(" ".join)

In [None]:
# Take an explicit copy of the three columns the recommender needs. Assigning
# into a bare column-subset of `movie_data` is what triggers pandas'
# SettingWithCopyWarning; an independent copy makes the assignment unambiguous.
df = movie_data[['id', 'title', 'tags']].copy()

# Collapse each movie's list of tag tokens into one space-separated string.
df["tags"] = df["tags"].apply(" ".join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = df["tags"].apply(lambda x: " ".join(x))


In [None]:
# Bag-of-words model over the tag strings: English stop words are dropped and
# the vocabulary is capped at 5000 features.
cv = CountVectorizer(stop_words='english', max_features=5000)

# Fit on the tags, then densify the count matrix into a NumPy array.
tag_counts = cv.fit_transform(df['tags'])
vectors = tag_counts.toarray()

In [None]:
ps = PorterStemmer()


def stem_text(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


# assign() builds a new frame instead of writing into a possible slice of
# `movie_data`, which avoids pandas' SettingWithCopyWarning.
df = df.assign(tags=df['tags'].apply(stem_text))

# `vectors` was fitted in the previous cell, before the tags were stemmed, so
# without this step stemming would never reach the similarity model. Re-fit the
# existing vectorizer on the stemmed tags so the vectors match `df['tags']`.
vectors = cv.fit_transform(df['tags']).toarray()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem_text)


In [None]:
# Pairwise cosine similarity between the rows of `vectors`; `sim[i]` is later
# read as the similarity scores of row i against every row.
sim = cosine_similarity(vectors)

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df["title"]``.

    Raises
    ------
    IndexError
        If no row of ``df`` has that title.

    Notes
    -----
    Reads the module-level ``df`` (titles) and ``sim`` (similarity matrix).
    Rows are addressed by position throughout, because ``sim`` is a plain
    array and knows nothing about the DataFrame's index labels.
    """
    # Positional row numbers of every exact title match.
    positions = [pos for pos, title in enumerate(df["title"]) if title == movie]
    if not positions:
        raise IndexError(f"movie title not found: {movie!r}")
    query_pos = positions[0]

    # Rank all rows by similarity to the query, best first.
    ranked = sorted(enumerate(sim[query_pos]), key=lambda pair: pair[1], reverse=True)

    # Skip the query row explicitly rather than assuming it is ranked first
    # (another row can tie with it at the top score).
    top_positions = [pos for pos, _ in ranked if pos != query_pos][:5]

    for pos in top_positions:
        print(df["title"].iloc[pos])


In [None]:
# Persist the artefacts. Each file is opened in a `with` block so the handle
# is flushed and closed even if pickling fails part-way.

# The movie table (id, title, tags) as a DataFrame.
with open("movies.pkl", 'wb') as movies_file:
    pickle.dump(df, movies_file)

# The same table as a plain dict.
with open("movie_dict.pkl", 'wb') as movie_dict_file:
    pickle.dump(df.to_dict(), movie_dict_file)

# The pairwise similarity matrix.
with open("similarity.pkl", 'wb') as similarity_file:
    pickle.dump(sim, similarity_file)