# Building a content-based recommender using tf-idf

## Import libraries and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer #obtaining tdf vectors
from itertools import combinations #finding combination of genres for a given movie
from sklearn.metrics.pairwise import cosine_similarity #To compute the cosine similarities between all tf-idf vectors


df = pd.read_csv("../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv",index_col=0)


## drop duplicates and keep only movieId, title and genres

In [2]:
df['pasteIDandMovie'] = df['title']+str(df['movieId'])
df = df.drop_duplicates(subset=['pasteIDandMovie'])

movies = df[['movieId', 'title', 'genres']].sort_values(by=['movieId']).reset_index(drop=True)


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Transform the genres into a meaningful representation of numbers using TfidVectorizer

In [4]:
tf = TfidfVectorizer(stop_words = 'english', analyzer = 'word')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [5]:
##to get an impression of what the result looks like, uncomment the line below
#pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=movies.title).sample(10, axis=1).sample(10, axis=0)


## Use cosine similarity to find similar vectors

In [6]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])


print('Shape:', cosine_sim_df.shape)
#find a sample of the result below
cosine_sim_df.sample(5, axis=1).round(2) 

Shape: (9719, 9719)


title,Half a Loaf of Kung Fu (Dian zhi gong fu gan chian chan) (1980),The Wolfpack (2015),Paint Your Wagon (1969),"Adventures of Sharkboy and Lavagirl 3-D, The (2005)","Law of Desire (Ley del deseo, La) (1987)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.16,0.0,0.07,0.74,0.13
Jumanji (1995),0.00,0.0,0.00,0.91,0.00
Grumpier Old Men (1995),0.34,0.0,0.16,0.00,0.88
Waiting to Exhale (1995),0.30,0.0,0.14,0.00,1.00
Father of the Bride Part II (1995),0.59,0.0,0.28,0.00,0.50
...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.54,0.0,0.09,0.49,0.16
No Game No Life: Zero (2017),0.21,0.0,0.10,0.35,0.18
Flint (2017),0.00,0.0,0.00,0.00,0.47
Bungo Stray Dogs: Dead Apple (2018),0.47,0.0,0.00,0.24,0.00


## find the highest tf-idf score for a given movie, function declaration


Moved this function in the CB_TFIDF_CosineSimilarity.py script

In [17]:

def genre_recommendation(query_title):
    """
    Recommends movies based on a similarity dataframe
    Parameters
    ----------
    query_title : Movie title (string)

    """
    items= movies[['title', 'genres']]
    #select column with the input movie title, and change it to numpy array 
    #resulting array of indices indicates the positions of the elements that would be in the first i positions
    sel = cosine_sim_df.loc[:,query_title].to_numpy().argpartition(range(-1,-100,-1)) 
    #resulting subset of column names is ordered in descending order of the corresponding values in the title column. 
    #This subset is then assigned to the variable ct    
    ct = cosine_sim_df.columns[sel[-1:-(100+2):-1]]
    #drop columns title from input and merge the df with the original dataframe. show only first i results. 
    ct = ct.drop(query_title, errors='ignore')
    
    xx = pd.DataFrame(ct).merge(items).head(100)
    
    #add similarity score to xx
    xx['Similarity Score'] = cosine_sim_df.loc[query_title, xx['title']].values
    
    return xx



### find your movie on what the recommendation should be based on, and paste the name in the function below

In [15]:
# for example, find a harry potter movie
#movies[movies["title"].str.contains('Harry')]

## Recommendation Example

In [9]:
similar_movies=genre_recommendation('Harry Potter and the Order of the Phoenix (2007)')

In [16]:
similar_movies.head(10)

Unnamed: 0,title,genres,Similarity Score
0,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy|IMAX,0.967592
1,Alice in Wonderland (2010),Adventure|Fantasy|IMAX,0.967592
2,Jack the Giant Slayer (2013),Adventure|Fantasy|IMAX,0.967592
3,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX,0.967592
4,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX,0.967592
5,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,0.936705
6,"Twilight Saga: Breaking Dawn - Part 2, The (2012)",Adventure|Drama|Fantasy|Romance|IMAX,0.93064
7,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,0.904
8,Oz the Great and Powerful (2013),Action|Adventure|Fantasy|IMAX,0.902588
9,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,0.902588


## Save your model

In [13]:
#import pickle
#filename = 'Model_tfidf_cosine_sim.sav'
#pickle.dump(cosine_sim, open(filename, 'wb'))