In [1]:
import ast
import pickle
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def convert_txt_string(obj):
    data = []
    for i in ast.literal_eval(obj):
        data.append(i["name"])
    return data

In [3]:
def getCast(obj):
    data = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            data.append(i["name"])
            counter += 1
        else:
            break
    return data

In [4]:
def getDirector(obj):
    data = []
    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            data.append(i["name"])
    return data

In [5]:
def stem(text):
    ps = PorterStemmer()
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [27]:
def recommend(movie):
    movie_index = df[df["title"] == movie.lower()].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:15]
    data = []
    for i in movie_list:
        dict = {
            "id" : df.iloc[i[0]].movie_id,
            "title" : df.iloc[i[0]].title.title()
        }
        data.append(dict)
    return data

In [7]:
def wrangle(set1, set2):
    credits = pd.read_csv(set1)
    movies = pd.read_csv(set2)
    df = movies.merge(credits, left_on="title",right_on="title")
    df.drop(columns=["id", "budget", "homepage", "original_language", "original_title","popularity", "production_companies", "production_countries", "release_date", "revenue", "runtime", "spoken_languages", "status", "tagline", "vote_average", "vote_count"], inplace=True)

    df.dropna(inplace=True)
    
    df["genres"] = df["genres"].apply(convert_txt_string)
    df["genres"] = df["genres"].apply(lambda x: [i.replace(" ", "_")  for i in x ])
    
    df["keywords"] = df["keywords"].apply(convert_txt_string)
    df["keywords"] = df["keywords"].apply(lambda x: [i.replace(" ", "_")  for i in x ])
    
    df["cast"] = df["cast"].apply(getCast)
    df["cast"] = df["cast"].apply(lambda x: [i.replace(" ", "_")  for i in x ])
    
    df["crew"] = df["crew"].apply(getDirector)
    df["crew"] = df["crew"].apply(lambda x: [i.replace(" ", "_")  for i in x ])
    
    df["overview"] = df["overview"].apply(lambda x: x.split())

    df["tags"] = df["overview"] + df["genres"] + df["keywords"] + df["cast"] + df["crew"]

    new_df = df[["movie_id", "title", "tags"]]

    new_df["title"] = new_df["title"].apply(lambda x: x.lower()) 
    new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x)) 
    new_df["tags"] = new_df["tags"].apply(stem) 

    return new_df

In [8]:
df = wrangle(set1="tmdb_5000_credits.csv", set2="tmdb_5000_movies.csv")
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["title"] = new_df["title"].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(stem)


Unnamed: 0,movie_id,title,tags
0,19995,avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,pirates of the caribbean: at world's end,"captain barbossa, long believ to be dead, ha c..."
2,206647,spectre,a cryptic messag from bond’ past send him on a...
3,49026,the dark knight rises,follow the death of district attorney harvey d...
4,49529,john carter,"john carter is a war-weary, former militari ca..."


In [9]:
cv = CountVectorizer(max_features=5000, stop_words="english")

In [10]:
vectors = cv.fit_transform(df["tags"]).toarray()

In [11]:
vectors.shape

(4806, 5000)

In [12]:
similarity = cosine_similarity(vectors)

In [13]:
with open("movies_similarity.pkl", "wb") as f:
    pickle.dump(similarity, f)

In [28]:
recommend("deadpool")

[{'id': 102899, 'title': 'Ant-Man'},
 {'id': 10138, 'title': 'Iron Man 2'},
 {'id': 37931, 'title': 'Macgruber'},
 {'id': 11090, 'title': 'The Animal'},
 {'id': 1930, 'title': 'The Amazing Spider-Man'},
 {'id': 10202, 'title': 'Bedtime Stories'},
 {'id': 38055, 'title': 'Megamind'},
 {'id': 250349, 'title': 'Barbecue'},
 {'id': 27936, 'title': 'Micmacs'},
 {'id': 98369, 'title': 'Blue Like Jazz'},
 {'id': 16441, 'title': 'The Beastmaster'},
 {'id': 76170, 'title': 'The Wolverine'},
 {'id': 152747, 'title': 'All Is Lost'},
 {'id': 361505, 'title': 'Me You And Five Bucks'}]