# Film Recommender with Two Models

In [1]:
import pickle

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Exploratory Data Analysis

In [2]:
# Read the film metadata CSV (path is relative to the working directory) and preview the first rows
movies = pd.read_csv(filepath_or_buffer='./dataset.csv')
movies.head(n=5)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
# Inspect the column labels of the loaded frame before choosing which ones to keep
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [4]:
# Build one text field per film from its genres and its overview.
# A missing genre or overview would turn the whole concatenation into NaN
# (and into the literal text 'nan' once cast to str later on), so missing
# pieces are treated as empty text and stray surrounding spaces are removed.
movies = movies.assign(
    content=(movies['genre'].fillna('') + ' ' + movies['overview'].fillna('')).str.strip()
)
# Keep only the columns the recommender needs
movies = movies[['id', 'title', 'content']]
movies.head()

Unnamed: 0,id,title,content
0,278,The Shawshank Redemption,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime In the continuing saga of the Corl..."


## spaCy Model




In [5]:
# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is used below to turn each film text into a vector
nlp = spacy.load('en_core_web_lg')

In [6]:
# apply to each description
# astype(str) guarantees every entry handed to spaCy is a string.
content = movies['content'].astype(str).tolist()
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp() once per string and yields the same documents.
vectors = [doc.vector for doc in nlp.pipe(content)]

In [7]:
# Gather the per-film vectors into a single array so the similarity function can consume them
vectors = np.asarray(vectors)
np.shape(vectors)

(10000, 300)

In [8]:
# Cosine similarity between every pair of film vectors
similarity = cosine_similarity(vectors)
print(np.shape(similarity))
similarity

(10000, 10000)


array([[1.0000001 , 0.9416373 , 0.9583355 , ..., 0.88816786, 0.91998696,
        0.91120994],
       [0.9416373 , 0.99999994, 0.9430751 , ..., 0.8757428 , 0.8874782 ,
        0.9049433 ],
       [0.9583355 , 0.9430751 , 1.0000001 , ..., 0.90550375, 0.9321621 ,
        0.91120696],
       ...,
       [0.88816786, 0.8757428 , 0.90550375, ..., 0.99999976, 0.8759167 ,
        0.86058116],
       [0.91998696, 0.8874782 , 0.9321621 , ..., 0.8759167 , 0.99999976,
        0.89578146],
       [0.91120994, 0.9049433 , 0.91120696, ..., 0.86058116, 0.89578146,
        0.9999999 ]], dtype=float32)

In [9]:
# test with The Godfather
# Resolve the film's row by title instead of relying on a hard-coded position.
query_title = 'The Godfather'
title_hits = np.flatnonzero(movies['title'].to_numpy() == query_title)
if title_hits.size == 0:
    raise ValueError(f'{query_title!r} not found in movies')
query_row = title_hits[0]
# Stable sort on the negated scores: highest similarity first, ties kept in row order.
# The query film itself is included in the ranking.
top_rows = np.argsort(-similarity[query_row], kind='stable')[:5]
for row in top_rows:
    print(movies['title'].iloc[row])

The Godfather
Traffic
Brazil
Wild Strawberries
Beyond Borders


In [11]:
# store the spaCy similarity matrix as a pickle file
# The context manager closes the file handle even if dumping fails.
with open('spacy_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(similarity, matrix_file)

## Count Vectoriser

In [12]:
# instantiate and fit the Count Vectoriser
CV = CountVectorizer(max_features=10000, stop_words='english')
# fit_transform returns a sparse document-term matrix. Keep it sparse:
# cosine_similarity accepts sparse input, so densifying it with .toarray()
# only costs memory without changing the resulting similarity matrix.
vector = CV.fit_transform(content)
vector.shape

(10000, 10000)

In [13]:
# Pairwise cosine similarity between the rows of the Count Vectoriser matrix `vector`
similarity_cv = cosine_similarity(vector)

In [14]:
# test again for The Godfather
# Resolve the film's row by title instead of relying on a hard-coded position.
query_title = 'The Godfather'
title_hits = np.flatnonzero(movies['title'].to_numpy() == query_title)
if title_hits.size == 0:
    raise ValueError(f'{query_title!r} not found in movies')
query_row = title_hits[0]
# Stable sort on the negated scores: highest similarity first, ties kept in row order.
# The query film itself is included in the ranking.
top_rows = np.argsort(-similarity_cv[query_row], kind='stable')[:5]
for row in top_rows:
    print(movies['title'].iloc[row])

The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City


In [15]:
pickle.dump(similarity_cv, open('cv_matrix.pkl', 'wb'))