# Movie Recommendation System

* Demographic Filtering
* Content-Based Filtering
* Collaborative Filtering

## Demographic Filtering

In [1]:
import pandas as pd
import numpy as np

# Load the two TMDB 5000 CSV files (credits and movie metadata); the paths
# are relative, so both files must sit in the current working directory.
data_frame_1 = pd.read_csv("tmdb_5000_credits.csv")
data_frame_2 = pd.read_csv("tmdb_5000_movies.csv")

In [2]:
# Relabel the credits columns positionally so the key column is called "id",
# which is the column the merge with the movies table joins on.
# NOTE(review): assumes the credits CSV has exactly these four columns in this
# order — confirm against the file.
data_frame_1.columns = ["id", "title", "cast", "crew"]

In [3]:
# Attach the cast and crew columns to the movie metadata by joining on "id".
# Only "id", "cast" and "crew" are taken from the credits table; its "title"
# column is deliberately left out of the join.
data_frame_2 = data_frame_2.merge(data_frame_1[["id", "cast", "crew"]], on = "id")

In [4]:
# C: mean of vote_average over all movies; used as the prior rating in
#    weighted_rating.
# m: 90th percentile of vote_count; used both as the qualification cut-off
#    and as the prior's weight in weighted_rating.
C = data_frame_2["vote_average"].mean()
m = data_frame_2["vote_count"].quantile(0.9)

In [5]:
# Keep only the movies with at least m votes. The filtered rows are copied so
# that columns added to q_movies later never touch data_frame_2.
q_movies = data_frame_2[data_frame_2["vote_count"] >= m].copy()

In [6]:
def weighted_rating(x, m = m, C = C):
    """Blend a movie's own average rating with the prior rating C.

    Args:
        x: Row-like object providing "vote_count" and "vote_average".
        m: Weight (in votes) given to the prior; defaults to the module-level
            ``m`` as it was when this function was defined.
        C: Prior rating; defaults to the module-level ``C`` as it was when
            this function was defined.

    Returns:
        ``v / (v + m) * R + m / (m + v) * C`` where ``v`` is the row's vote
        count and ``R`` its average vote.
    """
    votes = x["vote_count"]
    rating = x["vote_average"]
    # The two weights sum to one: more votes shift the score towards the
    # movie's own rating, fewer votes pull it towards C.
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (own_weight * rating) + (prior_weight * C)

In [7]:
# Compute the weighted rating row by row (axis = 1 passes each row to the
# function) and store it in a new "score" column.
q_movies["score"] = q_movies.apply(weighted_rating, axis = 1)

In [8]:
# Rank the qualifying movies from highest to lowest score.
q_movies = q_movies.sort_values("score", ascending = False)

## Content-Based Filtering (Story)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop
# words before weighting terms.
tfidf = TfidfVectorizer(stop_words = "english")

In [11]:
# Replace missing overviews with an empty string so every row handed to the
# vectorizer is a string.
data_frame_2["overview"] = data_frame_2["overview"].fillna("")

In [12]:
# Learn the vocabulary from the overviews and build the TF-IDF matrix, one
# row per movie in data_frame_2 order.
tfidf_matrix = tfidf.fit_transform(data_frame_2["overview"])

In [13]:
from sklearn.metrics.pairwise import linear_kernel

In [14]:
# Pairwise dot products between all overview vectors. With TfidfVectorizer's
# default L2 row normalisation the dot product equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
# Reverse lookup: movie title -> row label in data_frame_2.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed a repeated title. De-duplicate on the
# index instead, keeping the first row for any title that occurs more than
# once, so that indices[title] always yields a single label.
indices = pd.Series(data_frame_2.index, index = data_frame_2["title"])
indices = indices[~indices.index.duplicated(keep = "first")]

In [16]:
def get_recommendations(title, cosine_sim = cosine_sim):
    """Return the titles of the ten movies most similar to ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` Series.
        cosine_sim: Pairwise similarity matrix indexed by row position of
            ``data_frame_2``; defaults to the module-level ``cosine_sim`` as
            it was when this function was defined.

    Returns:
        A pandas Series with up to ten titles, most similar first. The query
        movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series of
    # positions; use the first one so the row lookup below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the query movie by position instead of assuming it sorts first:
    # ties (identical or all-zero feature vectors) can put another movie ahead
    # of it, in which case slicing off element 0 would drop the wrong entry.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key = lambda pair: pair[1], reverse = True)
    movie_indices = [i for i, _ in sim_scores[:10]]

    return data_frame_2["title"].iloc[movie_indices]

## Content-Based Filtering (Cast, Crew, Keywords, etc.)

In [17]:
from ast import literal_eval
# These columns are read from CSV as strings; literal_eval parses each string
# as a Python literal so later cells can work with the parsed objects.
# NOTE(review): literal_eval raises on non-string values such as NaN — this
# assumes none of these columns has missing entries; confirm.
features = ["cast", "crew", "keywords", "genres"]
for feature in features:
    data_frame_2[feature] = data_frame_2[feature].apply(literal_eval)

In [18]:
def get_director(input_data):
    """Return the name of the first crew member whose job is "Director".

    Args:
        input_data: Iterable of dict-like crew entries, each providing a
            "job" key (and a "name" key for the matching entry).

    Returns:
        The matching entry's "name", or ``np.nan`` when no entry has the
        "Director" job.
    """
    # The generator is lazy, so scanning stops at the first director found.
    director_names = (
        member["name"] for member in input_data if member["job"] == "Director"
    )
    return next(director_names, np.nan)

In [19]:
# Extract each movie's director from its crew list; movies without a
# "Director" entry get NaN.
data_frame_2["director"] = data_frame_2["crew"].apply(get_director)

In [20]:
def get_list(input_list):
    """Return the "name" of at most the first three entries of a list.

    Args:
        input_list: Expected to be a list of dict-like entries that each
            provide a "name" key. Any non-list value is treated as missing.

    Returns:
        A list with up to three names in their original order, or an empty
        list when ``input_list`` is not a list.
    """
    if not isinstance(input_list, list):
        return []

    # Collect every name first, then keep the leading three.
    all_names = [entry["name"] for entry in input_list]
    return all_names[:3]

In [21]:
# Reduce each of these columns from a list of dicts to a list of at most
# three names per movie.
features = ["cast", "keywords", "genres"]
for feature in features:
    data_frame_2[feature] = data_frame_2[feature].apply(get_list)

In [22]:
def clean_data(input_val):
    """Lower-case a string (or each string in a list) and strip its spaces.

    Args:
        input_val: A list of strings, a single string, or any other value.

    Returns:
        A list of cleaned strings for list input, a cleaned string for string
        input, and an empty string for anything else (for example NaN).
    """
    if isinstance(input_val, list):
        return [item.replace(" ", "").lower() for item in input_val]
    if isinstance(input_val, str):
        return input_val.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing.
    return ""

In [23]:
# Normalise the metadata columns: lower-case and remove spaces so that a
# multi-word name becomes a single token. Missing directors (NaN) become "".
features = ["cast", "keywords", "director", "genres"]
for feature in features:
    data_frame_2[feature] = data_frame_2[feature].apply(clean_data)

In [24]:
def create_soup(input_val):
    """Join a movie's keywords, cast, director and genres into one string.

    Args:
        input_val: Row-like object where "keywords", "cast" and "genres" are
            iterables of strings and "director" is a string.

    Returns:
        The four fields separated by single spaces, in the order keywords,
        cast, director, genres.
    """
    parts = [
        " ".join(input_val["keywords"]),
        " ".join(input_val["cast"]),
        input_val["director"],
        " ".join(input_val["genres"]),
    ]
    return " ".join(parts)
# Build the combined metadata string for every movie (axis = 1 passes each
# row to create_soup).
data_frame_2["soup"] = data_frame_2.apply(create_soup, axis = 1)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
# Plain term counts (not TF-IDF) over the metadata soup, with English stop
# words removed; one row per movie in data_frame_2 order.
count = CountVectorizer(stop_words = "english")
count_matrix = count.fit_transform(data_frame_2["soup"])

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the count vectors of all movies.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [27]:
# reset_index() installs a fresh 0..n-1 index (the previous index is kept as
# an "index" column) so index labels match row positions in cosine_sim2.
data_frame_2 = data_frame_2.reset_index()
# Rebuild the title -> row position lookup. Keep only the first row for a
# repeated title so that indices[title] always yields a single position;
# otherwise the lookup returns a Series and get_recommendations breaks.
indices = pd.Series(data_frame_2.index, index = data_frame_2["title"])
indices = indices[~indices.index.duplicated(keep = "first")]

In [28]:
# Example query: recommendations for "Amélie" using the metadata-based
# similarity matrix instead of the default overview-based one.
get_recommendations("Amélie", cosine_sim2)

861       A Very Long Engagement
3512       The Spanish Apartment
4247       Me You and Five Bucks
653               This Means War
1348                       Aloha
1366       The Devil Wears Prada
1579           Midnight in Paris
2217    Everyone Says I Love You
3763     The Rules of Attraction
226              How Do You Know
Name: title, dtype: object

In [29]:
import pickle

In [30]:
# Independent copy holding only the id and title columns, for export below.
movies = data_frame_2[["id", "title"]].copy()
# Preview the first five rows.
movies.head(5)

Unnamed: 0,id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [31]:
# Persist the id/title table. The context manager closes (and flushes) the
# file even if pickling raises, instead of leaking the handle.
with open("movies.pickle", "wb") as movies_file:
    pickle.dump(movies, movies_file)

In [32]:
# Persist the metadata-based similarity matrix. The context manager closes
# (and flushes) the file even if pickling raises, instead of leaking the
# handle.
with open("cosine_sim.pickle", "wb") as cosine_sim_file:
    pickle.dump(cosine_sim2, cosine_sim_file)