# CAEAFLIX - RECOMMANDATION SYSTEM 

IMPORT LIBRARIES AND DATASET

In [3]:
#We import usefull libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('popular')
import spacy

In [4]:
#Dataset to use
df = pd.read_csv('caea_4models')

PREPARATION OF THE SELECTED FEATURES OF THE MODEL

In [6]:
#Detect NaN
def isNaN(num):
    return num != num

#Function that will return the result of the NLP analysis
def importantwords(x):
    if isNaN(x) == False:
        text_words = nltk.word_tokenize(x.lower())
        text_clean = []
        for word in text_words:
            if word not in nltk.corpus.stopwords.words("english") and word.isalpha():
                text_clean.append(word)

        nlp = spacy.load('en_core_web_sm')
        text_clean = nlp(",".join(text_clean))
        text_lemm = []
        for word in text_clean :
            text_lemm.append(word.lemma_)

        for word in text_lemm:
            if word in '-–,..' :
                text_lemm.remove(word)

        return list(set(text_lemm))

#we create and clean the column
df['movies_overview_cleaned'] = df['movies_overview'].apply(lambda x: importantwords(x))
df['movies_overview_cleaned'] = df['movies_overview_cleaned'].astype(str).str[1:-1]

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

In [61]:
#2
#Separate the Anime movies from the rest
index_anime = df[df['genres'].str.contains('Animation')].index.to_list()
not_indexanime = df[~df['genres'].str.contains('Animation')].index.to_list()
for index in index_anime:
    df.loc[index,'genre_2'] = 'ANIMEANIMEANIMEANIMEANIMEANIMEANIMEANIMEANIMEANIME'
for index in not_indexanime:
    df.loc[index,'genre_2'] = 'OTHER'
df.genre_2 = df.genre_2.apply(lambda x: [x]).astype(str).str[1:-1]

In [94]:
#3
#Clean the features
features = ['movies_keywords','genres','director','actors','actress','companies','genre_2']
for feature in features:
    df[feature] = df[feature].str.replace("[","")
    df[feature] = df[feature].str.replace("]","")
    df[feature] = df[feature].fillna('')

  df[feature] = df[feature].str.replace("[","")
  df[feature] = df[feature].str.replace("]","")


MODEL 1 - GENERAL RECOMMANDATION

In [102]:
#We combine each selected features of each rows together in a new column
def combined_features(row):
    return row['movies_keywords']+" "+row['genres']+" "+row['companies']+" "+row['actors']+" "+row['actress']+" "+row['director']+" "+row['primaryTitle']
df["combined_features"] = df.apply(combined_features, axis =1)

In [12]:
#LAUNCH THE MODEL
#We save an NLP model to count each words presence and create binary lists 
cv = CountVectorizer()

#We create matrix of binary lists and compare each movie with them 
count_matrix = cv.fit_transform(df["combined_features"])

#We create the recommandation system function
movie_user_likes = "The Dark Knight Rises"

#We find the index with the title entered by the consumer
def get_index_from_title(primaryTitle):
    return df[df.primaryTitle == primaryTitle].index.values[0]
movie_index = get_index_from_title(movie_user_likes)

#We use the index to create a list of similar movies with cosine_sim function which will compare the binary lists
cosine_sim = cosine_similarity(count_matrix)
similar_movies = list(enumerate(cosine_sim[movie_index]))

#We sort by the best scores
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

#We display the results
def get_title_from_index(index):
    return df[df.index == index][["primaryTitle",'movies_overview']].values[0]
i=0

#We don't want to display the nearest neigbour which is actually the movie the consumer typed
for movie in sorted_similar_movies[1:]:
    print(get_title_from_index(movie[0]))
    i=i+1
    #We fix the number of recommandations we want to display
    print()
    if i>15:
        break

['The Dark Knight'
 'Batman raises the stakes in his war on crime. With the help of Lt. Jim Gordon and District Attorney Harvey Dent, Batman sets out to dismantle the remaining criminal organizations that plague the streets. The partnership proves to be effective, but they soon find themselves prey to a reign of chaos unleashed by a rising criminal mastermind known to the terrified citizens of Gotham as the Joker.']

['Batman Begins'
 'Driven by tragedy, billionaire Bruce Wayne dedicates his life to uncovering and defeating the corruption that plagues his home, Gotham City.  Unable to work within the system, he instead creates a new identity, a symbol of fear for the criminal underworld - The Batman.']

['The Batman'
 'In his second year of fighting crime, Batman uncovers corruption in Gotham City that connects to his own family while facing a serial killer known as the Riddler.']

['Kick-Ass 2'
 'After Kick-Ass’ insane bravery inspires a new wave of self-made masked crusaders, he join

RECOMMANDATION DIRECTORS

In [417]:
#We combine each selected features of each rows together in a new column
def combined_features_2(row):
    return row['director']
df["combined_features_2"] = df.apply(combined_features_2, axis =1)

In [24]:
#LAUNCH THE MODEL
#We save an NLP model to count each words presence and create binary lists 
cv = CountVectorizer()

#We create matrix of binary lists and compare each movie with them 
count_matrix_2 = cv.fit_transform(df["combined_features_2"])

#We create the recommandation system function
movie_user_likes = "The Dark Knight Rises"

#We find the index with the title entered by the consumer
def get_index_from_title(primaryTitle):
    return df[df.primaryTitle == primaryTitle].index.values[0]
movie_index = get_index_from_title(movie_user_likes)

#We use the index to create a list of similar movies with cosine_sim function which will compare the binary lists
cosine_sim = cosine_similarity(count_matrix_2)
similar_movies = list(enumerate(cosine_sim[movie_index]))

#We sort by the best scores
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

#We display the results
def get_title_from_index(index):
    return df[df.index == index][["primaryTitle",'movies_overview','director']].values[0]
i=0

#We don't want to display the nearest neigbour which is actually the movie the consumer typed
for movie in sorted_similar_movies[1:]:
    print(get_title_from_index(movie[0]))
    i=i+1
    #We fix the number of recommandations we want to display
    print()
    if i>15:
        break

['Memento'
 "Leonard Shelby is tracking down the man who raped and murdered his wife. The difficulty of locating his wife's killer, however, is compounded by the fact that he suffers from a rare, untreatable form of short-term memory loss. Although he can recall details of life before his accident, Leonard cannot remember what happened fifteen minutes ago, where he's going, or why."
 "'Christopher Nolan'"]

['Insomnia'
 "Two Los Angeles homicide detectives are dispatched to a northern town where the sun doesn't set to investigate the methodical murder of a local teen."
 "'Christopher Nolan'"]

['Batman Begins'
 'Driven by tragedy, billionaire Bruce Wayne dedicates his life to uncovering and defeating the corruption that plagues his home, Gotham City.  Unable to work within the system, he instead creates a new identity, a symbol of fear for the criminal underworld - The Batman.'
 "'Christopher Nolan'"]

['The Dark Knight'
 'Batman raises the stakes in his war on crime. With the help of 

RECOMMANDATION COMPANIES

In [29]:
#We combine each features of each rows together in a new column
def combined_features_3(row):
    return row['companies']
df["combined_features_3"] = df.apply(combined_features_3, axis =1)

In [30]:
#LAUNCH THE MODEL
#We save an NLP model to count each words presence and create binary lists 
cv = CountVectorizer()

#We create matrix of binary lists and compare each movie with them 
count_matrix_3 = cv.fit_transform(df["combined_features_3"])

#We create the recommandation system function
movie_user_likes = "The Dark Knight Rises"

#We find the index with the title entered by the consumer
def get_index_from_title(primaryTitle):
    return df[df.primaryTitle == primaryTitle].index.values[0]
movie_index = get_index_from_title(movie_user_likes)

#We use the index to create a list of similar movies with cosine_sim function which will compare the binary lists
cosine_sim = cosine_similarity(count_matrix_3)
similar_movies = list(enumerate(cosine_sim[movie_index]))

#We sort by the best scores
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

#We display the results
def get_title_from_index(index):
    return df[df.index == index][["primaryTitle",'movies_overview','companies']].values[0]
i=0

#We don't want to display the nearest neigbour which is actually the movie the consumer typed
for movie in sorted_similar_movies[1:]:
    print(get_title_from_index(movie[0]))
    i=i+1
    #We fix the number of recommandations we want to display
    print()
    if i>15:
        break

ValueError: np.nan is an invalid document, expected byte or unicode string.

RECOMMANDATION ACTORS / ACTRESS

In [432]:
#We combine each features of each rows together in a new column
def combined_features_4(row):
    return row['actors']+" "+row['actress']
df["combined_features_4"] = df.apply(combined_features_4, axis =1)

In [32]:
#LAUNCH THE MODEL
#We save an NLP model to count each words presence and create binary lists 
cv = CountVectorizer()

#We create matrix of binary lists and compare each movie with them 
count_matrix_4 = cv.fit_transform(df["combined_features_4"])

#We create the recommandation system function
movie_user_likes = "The Dark Knight Rises"

#We find the index with the title entered by the consumer
def get_index_from_title(primaryTitle):
    return df[df.primaryTitle == primaryTitle].index.values[0]
movie_index = get_index_from_title(movie_user_likes)

#We use the index to create a list of similar movies with cosine_sim function which will compare the binary lists
cosine_sim = cosine_similarity(count_matrix_4)
similar_movies = list(enumerate(cosine_sim[movie_index]))

#We sort by the best scores
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)

#We display the results
def get_title_from_index(index):
    return df[df.index == index][["primaryTitle",'movies_overview','actors','actress']].values[0]
i=0

#We don't want to display the nearest neigbour which is actually the movie the consumer typed
for movie in sorted_similar_movies[1:]:
    print(get_title_from_index(movie[0]))
    i=i+1
    #We fix the number of recommandations we want to display
    print()
    if i>6:
        break

['Child 44'
 'Set in Stalin-era Soviet Union, a disgraced MGB agent is dispatched to investigate a series of child murders -- a case that begins to connect with the very top of party leadership.'
 "'Gary Oldman', 'Tom Hardy', 'Joel Kinnaman'" "'Noomi Rapace'"]

['Tinker Tailor Soldier Spy'
 'In the bleak days of the Cold War, espionage veteran George Smiley is forced from semi-retirement to uncover a Soviet mole within his former colleagues at the heart of MI6.'
 "'Colin Firth', 'Colin Firth', 'Gary Oldman', 'Gary Oldman', 'Mark Strong', 'Mark Strong', 'Tom Hardy', 'Tom Hardy'"
 "'unknown'"]

['Murder in the First'
 'Inspired by a true story. A petty criminal sent to Alcatraz in the 1930s is caught attempting to make an escape. As punishment he is put in solitary confinement. The maximum stay is supposed to be 19 days, but Henri spends years alone, cold and in complete darkness, only to emerge a madman and soon to be a murderer. The story follows a rookie lawyer attempting to prove tha