# Content-Based Recommender System Using TF-IDF

In [1]:
import pandas as pd # tabular data: CSV loading and DataFrame manipulation
import numpy as np # numerical arrays and math routines
import matplotlib.pyplot as plt # plotting



In [2]:
# Load the three tables. Every file is tab-separated and Latin-1 encoded;
# only the columns listed in `usecols` are read.
users = pd.read_csv(
    'users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
)
movies = pd.read_csv(
    'movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
)
ratings = pd.read_csv(
    'ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'rating'],
)


# Users Dataset

In [3]:
# (number of rows, number of columns) of the users table
users.shape

(6040, 5)

In [4]:
# Preview the first five rows of the users table
print(users.head())

   user_id gender zipcode  age_desc              occ_desc
0        1      F   48067  Under 18          K-12 student
1        2      M   70072       56+         self-employed
2        3      M   55117     25-34             scientist
3        4      M   02460     45-49  executive/managerial
4        5      M   55455     25-34                writer


In [5]:
# Show five random users; the fixed seed keeps this preview identical on every re-run
users.sample(5, random_state=42)

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
4507,4508,M,15701,25-34,writer
1476,1477,F,90601,45-49,academic/educator
322,323,M,53716,45-49,programmer
419,420,M,55406,35-44,academic/educator
3165,3166,M,32607,18-24,college/grad student


In [6]:
# Summarise column dtypes, non-null counts and memory usage.
# DataFrame.info() prints the report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   6040 non-null   int64 
 1   gender    6040 non-null   object
 2   zipcode   6040 non-null   object
 3   age_desc  6040 non-null   object
 4   occ_desc  6040 non-null   object
dtypes: int64(1), object(4)
memory usage: 236.1+ KB
None


# Movies Dataset

In [7]:
# (number of rows, number of columns) of the movies table
movies.shape

(3883, 3)

In [8]:
# Preview the first five rows of the movies table
print(movies.head())

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [9]:
# Show five random movies; the fixed seed keeps this preview identical on every re-run
movies.sample(5, random_state=42)

Unnamed: 0,movie_id,title,genres
1716,1772,Blues Brothers 2000 (1998),Action|Comedy|Musical
2306,2375,"Money Pit, The (1986)",Comedy
2454,2523,Rollercoaster (1977),Drama|Thriller
378,382,Wolf (1994),Drama|Horror
2012,2081,"Little Mermaid, The (1989)",Animation|Children's|Comedy|Musical|Romance


In [10]:
# Summarise column dtypes, non-null counts and memory usage.
# DataFrame.info() prints the report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genres    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None


# Ratings Dataset

In [11]:
# (number of rows, number of columns) of the ratings table
ratings.shape

(1000209, 3)

In [12]:
# Preview the first five rows of the ratings table
print(ratings.head())

   user_id  movie_id  rating
0        1      1193       5
1        1       661       3
2        1       914       3
3        1      3408       4
4        1      2355       5


In [13]:
# Show five random ratings; the fixed seed keeps this preview identical on every re-run
ratings.sample(5, random_state=42)

Unnamed: 0,user_id,movie_id,rating
559804,3441,2266,1
687930,4116,2064,3
531784,3283,32,4
210485,1285,3019,5
630655,3816,585,3


# Implementation 

### I'm going to build a content-based recommendation engine that computes the similarity between movies from their genres. Given a particular movie, it suggests the movies whose genres are most similar to it. To do so, I will use the file movies.csv.

In [14]:
# Turn the pipe-delimited genre string (e.g. "Comedy|Drama") into a list of genre names.
movies['genres'] = movies['genres'].str.split(pat='|')
print(movies['genres'].head())

# Replace missing values with "" and cast every entry to a plain string, so each
# list is stored as its text representation (e.g. "['Comedy', 'Drama']").
movies['genres'] = movies['genres'].fillna('').astype(str)
print(movies['genres'].head())

0     [Animation, Children's, Comedy]
1    [Adventure, Children's, Fantasy]
2                   [Comedy, Romance]
3                     [Comedy, Drama]
4                            [Comedy]
Name: genres, dtype: object
0     ['Animation', "Children's", 'Comedy']
1    ['Adventure', "Children's", 'Fantasy']
2                     ['Comedy', 'Romance']
3                       ['Comedy', 'Drama']
4                                ['Comedy']
Name: genres, dtype: object


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from itertools import combinations

In [18]:
# Build word-level TF-IDF features from the genre strings.
# The token pattern keeps hyphenated genre names (see 'sci-fi' and 'film-noir'
# in the printed vocabulary) as single tokens.
tf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r"(?u)\b\w[\w-]*\w\b",
    ngram_range=(1, 1),
    min_df=1,
    stop_words='english',
)

# One row per movie, one column per genre token.
tfidf_matrix = tf.fit_transform(movies['genres'])
print(tfidf_matrix.shape)

# Vocabulary learned by the vectorizer, in column order.
feature_names = tf.get_feature_names_out()
print(feature_names)

(3883, 18)
['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'film-noir' 'horror' 'musical' 'mystery'
 'romance' 'sci-fi' 'thriller' 'war' 'western']


### Compute the cosine similarity between every pair of movies' TF-IDF genre vectors

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows (movies) of the TF-IDF genre matrix
cosine_sim = cosine_similarity(tfidf_matrix)
# Label both axes with the movie titles so similarities can be looked up by name
cosine_sim_df = pd.DataFrame(cosine_sim, index = movies['title'], columns = movies['title'])
print('Shape:', cosine_sim_df.shape)
# Preview five random columns; the fixed seed keeps the preview identical on every re-run
cosine_sim_df.sample(5, axis=1, random_state=42).round(2)

Shape: (3883, 3883)


title,Boys Life 2 (1997),Now and Then (1995),"Mask of Zorro, The (1998)",I Saw What You Did (1965),Midnight Express (1978)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.00,0.00,0.00,0.00,0.00
Jumanji (1995),0.00,0.00,0.32,0.00,0.00
Grumpier Old Men (1995),0.00,0.00,0.45,0.00,0.00
Waiting to Exhale (1995),0.66,0.66,0.00,0.00,0.66
Father of the Bride Part II (1995),0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...
Meet the Parents (2000),0.00,0.00,0.00,0.00,0.00
Requiem for a Dream (2000),1.00,1.00,0.00,0.00,1.00
Tigerland (2000),1.00,1.00,0.00,0.00,1.00
Two Family House (2000),1.00,1.00,0.00,0.00,1.00


In [20]:
# Look up the catalogue row of the query movie to see the genres it will be matched on
movies[movies.title.eq('2001: A Space Odyssey (1968)')]

Unnamed: 0,movie_id,title,genres
912,924,2001: A Space Odyssey (1968),"['Drama', 'Mystery', 'Sci-Fi', 'Thriller']"


In [21]:


def genre_recommendations(i , M , items , k=10 ):
    """Return up to ``k`` items most similar to item ``i``, best match first.

    Parameters
    ----------
    i : label
        Column label of the query item in ``M`` (a movie title in this notebook).
    M : pandas.DataFrame
        Similarity matrix. Its rows must be in the same order as its columns,
        because row positions of the best scores are mapped back to labels
        through ``M.columns``.
    items : pandas.DataFrame
        Item metadata; it is merged with the frame built from the selected
        column labels on the column(s) they share.
    k : int, default 10
        Maximum number of recommendations; negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        At most ``k`` rows of the merge result, ordered by decreasing
        similarity, with the query item ``i`` itself removed.

    Raises
    ------
    KeyError
        If ``i`` is not a column of ``M``.
    """
    scores = M.loc[:, i].to_numpy()
    k = max(k, 0)
    # Rank one candidate more than requested: the query item normally appears
    # among its own best matches and is dropped below. Clamp to the number of
    # available items so a large k cannot push the pivots out of bounds.
    n_top = min(k + 1, scores.shape[0])
    # Using every tail position -1 .. -n_top as a pivot leaves the n_top
    # highest scores in sorted order at the end of the index array.
    pivots = list(range(-1, -(n_top + 1), -1))
    order = scores.argpartition(pivots)
    # Take that sorted tail and reverse it so the best match comes first.
    closest = M.columns[order[-n_top:][::-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)


In [22]:
# Genre-based recommendations for the query movie (k is left at its default of 10)
genre_recommendations('2001: A Space Odyssey (1968)', cosine_sim_df, movies[['title', 'genres']])

Unnamed: 0,title,genres
0,"X-Files: Fight the Future, The (1998)","['Mystery', 'Sci-Fi', 'Thriller']"
1,Event Horizon (1997),"['Action', 'Mystery', 'Sci-Fi', 'Thriller']"
2,2010 (1984),"['Mystery', 'Sci-Fi']"
3,Stalker (1979),"['Mystery', 'Sci-Fi']"
4,"Talented Mr. Ripley, The (1999)","['Drama', 'Mystery', 'Thriller']"
5,"Client, The (1994)","['Drama', 'Mystery', 'Thriller']"
6,Absolute Power (1997),"['Mystery', 'Thriller']"
7,Eyes of Laura Mars (1978),"['Mystery', 'Thriller']"
8,Vertigo (1958),"['Mystery', 'Thriller']"
9,Just Cause (1995),"['Mystery', 'Thriller']"


In [23]:
# Look up the catalogue row of the second query movie to see the genres it will be matched on
movies[movies.title.eq('Contact (1997)')]

Unnamed: 0,movie_id,title,genres
1543,1584,Contact (1997),"['Drama', 'Sci-Fi']"


In [24]:
# Genre-based recommendations for the second query movie (k is left at its default of 10)
genre_recommendations('Contact (1997)', cosine_sim_df, movies[['title', 'genres']])

Unnamed: 0,title,genres
0,Nineteen Eighty-Four (1984),"['Drama', 'Sci-Fi']"
1,Until the End of the World (Bis ans Ende der W...,"['Drama', 'Sci-Fi']"
2,Close Encounters of the Third Kind (1977),"['Drama', 'Sci-Fi']"
3,"Goodbye, 20th Century (Zbogum na dvadesetiot v...","['Drama', 'Sci-Fi']"
4,Solaris (Solyaris) (1972),"['Drama', 'Sci-Fi']"
5,Powder (1995),"['Drama', 'Sci-Fi']"
6,"Brother from Another Planet, The (1984)","['Drama', 'Sci-Fi']"
7,Conceiving Ada (1997),"['Drama', 'Sci-Fi']"
8,"Day the Earth Stood Still, The (1951)","['Drama', 'Sci-Fi']"
9,Twelve Monkeys (1995),"['Drama', 'Sci-Fi']"
