## IMPORT LIBRARIES

In [1]:
import pandas as pd

## READ CSV

In [2]:
# Lookup tables: every row of `movies` is kept and completed with the matching `links` columns.
links = pd.read_csv('./src/links.csv')
movies = pd.read_csv('./src/movies.csv')
movies_title = movies.merge(links, how='left')

# TMDB metadata, without the two columns that are not used later in this notebook.
content = pd.read_csv('./src/TMDB_content.csv').drop(columns=['poster_path', 'year'])

In [3]:
# Preview the TMDB content table after the unused columns were dropped.
content

Unnamed: 0,tmdb_id,title,genres,keywords,cast,director,watch_providers
0,614696,#Alive,"Horror, Action, Adventure, Thriller","escape, alone, survival, drone, zombie, apartm...","Yoo Ah-in, Park Shin-hye, Lee Hyun-wook, Jin S...",Cho Il,"8, 1796"
1,19913,(500) Days of Summer,"Comedy, Drama, Romance","jealousy, gallery, fight, date, architect, int...","Joseph Gordon-Levitt, Zooey Deschanel, Geoffre...",Marc Webb,337
2,333371,10 Cloverfield Lane,"Thriller, Science Fiction, Drama, Horror","kidnapping, paranoia, bunker, basement, surviv...","Mary Elizabeth Winstead, John Goodman, John Ga...",Dan Trachtenberg,"381, 531, 582, 1853"
3,4951,10 Things I Hate About You,"Comedy, Romance, Drama","high school, deception, based on play or music...","Heath Ledger, Julia Stiles, Joseph Gordon-Levi...",Gil Junger,337
4,389,12 Angry Men,Drama,"death penalty, anonymity, court case, court, j...","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",Sidney Lumet,588
...,...,...,...,...,...,...,...
2995,1946,eXistenZ,"Action, Thriller, Science Fiction","hacker, video game, virtual reality, pop star,...","Jennifer Jason Leigh, Jude Law, Ian Holm, Will...",David Cronenberg,381
2996,437586,mid90s,"Drama, Comedy","california, skateboarding, dysfunctional famil...","Sunny Suljic, Katherine Waterston, Lucas Hedge...",Jonah Hill,
2997,381283,mother!,"Drama, Thriller, Horror","husband wife relationship, nightmare, nihilism...","Jennifer Lawrence, Javier Bardem, Ed Harris, M...",Darren Aronofsky,"381, 531, 582, 1853"
2998,537116,"tick, tick... BOOM!","Drama, Music","new york city, composer, artist, musical, base...","Andrew Garfield, Alexandra Shipp, Robin de Jes...",Lin-Manuel Miranda,"8, 1796"


## CONTENT-BASED FILTERING RECOMMENDER

### PREPROCESSING KEYWORDS
1. Remove keywords that occur only once (keep occurrence count > 1)

2. Transform keyword to its lemma form

3. Lowercase and remove spaces

In [4]:
# One row per (movie, keyword) pair, built on a new frame so `content` stays untouched.
# NOTE(review): splitting on ',' keeps the space that follows each comma, so the same
# keyword can be counted under two spellings (e.g. ' canoe' vs 'canoe') — confirm whether
# the counts should be taken on stripped keywords.
df = content.assign(keywords=content['keywords'].str.split(',')).explode('keywords')

In [5]:
# Number of occurrences of each raw keyword, most frequent first.
k = df['keywords'].value_counts()
k

keywords
 based on novel or book    332
 duringcreditsstinger      240
 murder                    202
 sequel                    196
 california                171
                          ... 
 minefield                   1
 corsican                    1
 twilight                    1
 streetwise                  1
 free fall                   1
Name: count, Length: 10589, dtype: int64

In [6]:
# Keep only the keywords seen more than once; filter_keywords() looks keywords up in `k`.
k = k.loc[k.gt(1)]
k

keywords
 based on novel or book    332
 duringcreditsstinger      240
 murder                    202
 sequel                    196
 california                171
                          ... 
 around the world            2
 canoe                       2
 candle                      2
secret identity              2
 covid-19                    2
Name: count, Length: 4995, dtype: int64

In [7]:
def filter_keywords(x):
    ''' Keep, in their original order, the keywords of `x` that are labels of the global count table `k` '''
    return [word for word in x if word in k]

In [8]:
import spacy
# spaCy pipeline that to_lemma() calls on every keyword.
nlp = spacy.load('en_core_web_sm')

def to_lemma(text):
    ''' Return `text` with each token replaced by its lemma, tokens joined by single spaces '''
    return ' '.join(tok.lemma_ for tok in nlp(text))

In [9]:
def _clean_keywords(raw):
    ''' Split a raw keyword string, drop keywords absent from `k`, lemmatise, then remove spaces and lowercase '''
    kept = filter_keywords(str(raw).split(','))
    return [to_lemma(word).replace(' ', '').lower() for word in kept]

# Replaces the raw comma-separated string of each movie with its list of cleaned keywords.
content['keywords'] = content['keywords'].apply(_clean_keywords)

In [10]:
# Flatten the per-movie keyword lists into one long Series (one cleaned keyword per row,
# labelled with the row label of its movie) and count the occurrences of each keyword.
# explode() does the flattening directly instead of building one pd.Series per row;
# dropna() removes the NaN placeholder that explode() emits for an empty keyword list.
k_clean = content['keywords'].explode().dropna()
k_clean.name = 'keyword'
k_clean.value_counts()

keyword
baseonnovelorbook       496
duringcreditsstinger    244
sequel                  209
murder                  202
newyorkcity             201
                       ... 
universe                  2
earth                     2
warstrategy               2
slowmotion                2
platoniclove              2
Name: count, Length: 4649, dtype: int64

Results:
1. Keywords have been replaced by their lemma form (e.g. 'based on novel or book' became 'baseonnovelorbook')

2. Keywords that appeared only once have been removed

### PREPROCESSING GENRES AND CAST

In [11]:
def _split_tokens(value):
    ''' Turn a comma-separated string into a list of lowercase tokens without spaces.

    Missing values give an empty list rather than the literal token 'nan', which would
    otherwise be shared by (and raise the similarity of) every movie lacking the field.
    Empty fragments are skipped. A value that is already a list is returned unchanged so
    that re-running the cell does not corrupt the column.
    '''
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return [part.replace(' ', '').lower() for part in str(value).split(',') if part.strip()]

# Genres and cast receive the same normalisation.
content['genres'] = content['genres'].apply(_split_tokens)
content['cast'] = content['cast'].apply(_split_tokens)

In [12]:
# We keep at most the first 3 listed actors of each movie as its main cast.
# The slice is applied to every row: a cast list shorter than 3 is returned whole, so no
# length test is needed. (len(content['cast']) is the number of movies in the table, not
# the size of a cast list, and therefore cannot be used to decide whether to truncate.)
content['main_cast'] = content['cast'].apply(lambda x: x[:3])

### GROUP ALL CONTENT DATA

In [13]:
# Concatenate the three token lists of each movie, then flatten them into one
# space-separated string (the "soup") for the vectorizer.
all_tokens = content['genres'] + content['keywords'] + content['main_cast']
content['soup'] = all_tokens.map(' '.join)

## COUNT VECTORIZER (BAG-OF-WORDS)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Transform each soup into a vector of raw token counts (bag-of-words, no IDF weighting).
# NOTE(review): stop_words='english' presumably also discards soup tokens that happen to be
# English stop words, and the default tokenizer may split tokens on punctuation
# (e.g. 'covid-19') — confirm this is intended for an already tokenised soup.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(content['soup'])

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of movies; with a single argument
# the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim.shape

(3000, 3000)

In [17]:
# Inspect the pairwise similarity matrix.
cosine_sim

array([[1.        , 0.        , 0.11846978, ..., 0.10012523, 0.        ,
        0.11470787],
       [0.        , 1.        , 0.05773503, ..., 0.048795  , 0.0559017 ,
        0.        ],
       [0.11846978, 0.05773503, 1.        , ..., 0.16903085, 0.06454972,
        0.        ],
       ...,
       [0.10012523, 0.048795  , 0.16903085, ..., 1.        , 0.05455447,
        0.        ],
       [0.        , 0.0559017 , 0.06454972, ..., 0.05455447, 1.        ,
        0.        ],
       [0.11470787, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

## GET RECOMMENDATIONS

In [80]:
# Map each tmdb_id to the row label of its movie in `content`.
# Duplicated tmdb_ids are reduced to their first occurrence by de-duplicating the *index*:
# Series.drop_duplicates() compares the values (the row labels, which are already unique)
# and would remove nothing, so a lookup on a duplicated id would return a Series instead
# of a single row label.
movie_ids = pd.Series(content.index, index=content['tmdb_id'])
movie_ids = movie_ids[~movie_ids.index.duplicated(keep='first')]

def get_recommendations(movie_id, cosine_sim=cosine_sim, top_n=30):
    ''' Return the movies most similar to `movie_id`, best match first.

    Parameters
    ----------
    movie_id : tmdb_id of the reference movie; must be a label of `movie_ids`.
    cosine_sim : square pairwise similarity matrix whose rows and columns follow
        the row order of `content`.
    top_n : maximum number of recommendations returned (default 30).

    Returns
    -------
    DataFrame with the columns 'tmdb_id' and 'score', sorted by decreasing score.
    The row of the reference movie itself is never part of the result.

    Raises
    ------
    KeyError if `movie_id` is not a label of `movie_ids`.
    '''

    # Get the index of the movie that matches the ID. Selecting with a list always
    # returns a Series, so an ID present several times resolves to its first row
    # instead of yielding several indices.
    idx = int(movie_ids.loc[[movie_id]].iloc[0])

    # Get the pairwise similarity scores of all *other* movies with that movie.
    # The reference row is excluded explicitly: it is not guaranteed to sort first
    # (another movie can tie at the top score, and a movie whose soup is empty has
    # a self-similarity of 0), so skipping "the first result" could drop the wrong row.
    similarity_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Sort the movies based on the similarity scores (stable: ties keep row order)
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    # Keep the `top_n` most similar movies
    similarity_scores = similarity_scores[:top_n]

    # Get the movie indices, scores and tmdb_id
    movie_indices = [i[0] for i in similarity_scores]
    movie_scores = [i[1] for i in similarity_scores]
    movie_tmdb_id = content['tmdb_id'].iloc[movie_indices]

    # Put results into dataframe
    df = pd.DataFrame(data = {'tmdb_id': movie_tmdb_id,
                              'score': movie_scores
                            })

    # Return the most similar movies, renumbered from 0
    return df.reset_index(drop=True)

### (TEST)

In [133]:
# Smoke test: recommendations for a single tmdb_id.
get_recommendations(22794)

Unnamed: 0,tmdb_id,score
0,109451,0.76277
1,360920,0.418121
2,13930,0.40452
3,116149,0.369274
4,381719,0.363636
5,9928,0.363636
6,431693,0.348155
7,481084,0.348155
8,364689,0.345857
9,482321,0.341882


In [104]:
# Same recommendations as a list of {'tmdb_id': ..., 'score': ...} records,
# the format stored for each movie in the final table.
get_recommendations(22794).to_dict(orient='records')

[{'tmdb_id': 109451, 'score': 0.762770071396474},
 {'tmdb_id': 360920, 'score': 0.41812100500354543},
 {'tmdb_id': 13930, 'score': 0.40451991747794525},
 {'tmdb_id': 116149, 'score': 0.36927447293799825},
 {'tmdb_id': 381719, 'score': 0.36363636363636365},
 {'tmdb_id': 9928, 'score': 0.36363636363636365},
 {'tmdb_id': 431693, 'score': 0.3481553119113957},
 {'tmdb_id': 481084, 'score': 0.3481553119113957},
 {'tmdb_id': 364689, 'score': 0.3458572319330373},
 {'tmdb_id': 482321, 'score': 0.3418817293789138},
 {'tmdb_id': 587807, 'score': 0.3418817293789138},
 {'tmdb_id': 227973, 'score': 0.3344968040028363},
 {'tmdb_id': 33217, 'score': 0.3223291856101521},
 {'tmdb_id': 454640, 'score': 0.3223291856101521},
 {'tmdb_id': 459151, 'score': 0.3223291856101521},
 {'tmdb_id': 51739, 'score': 0.3223291856101521},
 {'tmdb_id': 65759, 'score': 0.3198010745334156},
 {'tmdb_id': 486589, 'score': 0.3198010745334156},
 {'tmdb_id': 550205, 'score': 0.3198010745334156},
 {'tmdb_id': 166428, 'score': 0.3

## GATHER ALL SCORES INTO A DATAFRAME

In [121]:
# For every movie of the catalogue, store [tmdb_id, list of recommendation records].
all_movies = content['tmdb_id'].values
all_scores = [
    [tmdb_id, get_recommendations(tmdb_id).to_dict(orient='records')]
    for tmdb_id in all_movies
]

In [131]:
# One row per movie: its tmdb_id and, in 'score_cb', its list of recommendation records.
df_final = pd.DataFrame(all_scores, columns=['tmdb_id', 'score_cb'])
df_final

Unnamed: 0,tmdb_id,score_cb
0,614696,"[{'tmdb_id': 581392, 'score': 0.38949041885226..."
1,19913,"[{'tmdb_id': 8488, 'score': 0.3380617018914066..."
2,333371,"[{'tmdb_id': 335866, 'score': 0.33562431103976..."
3,4951,"[{'tmdb_id': 625450, 'score': 0.40824829046386..."
4,389,"[{'tmdb_id': 11975, 'score': 0.258198889747161..."
...,...,...
2995,1946,"[{'tmdb_id': 97, 'score': 0.40089186286863654}..."
2996,437586,"[{'tmdb_id': 308639, 'score': 0.44444444444444..."
2997,381283,"[{'tmdb_id': 301804, 'score': 0.26318067798390..."
2998,537116,"[{'tmdb_id': 316029, 'score': 0.27735009811261..."


## SAVE TO CSV

In [132]:
# Write the per-movie recommendations to disk.
# NOTE(review): the row index is written as an extra unnamed column and the lists in
# 'score_cb' are stored as their text representation — confirm that the consumer of this
# file expects both.
df_final.to_csv('./src/TMDB_content_based.csv')