<a href="https://colab.research.google.com/github/Jerremiah/project-set/blob/main/movie_recommdation_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fuzzywuzzy

In [None]:
import json
import pandas as pd
#___________________________
def load_tmdb_movies(path):
    """Read the TMDB movies CSV at ``path`` and decode its packed columns.

    ``release_date`` is parsed and reduced to its calendar date; every
    JSON-encoded text column is decoded with ``json.loads``.
    Returns the resulting DataFrame.
    """
    movies = pd.read_csv(path)

    def _to_date(stamp):
        return stamp.date()

    movies['release_date'] = pd.to_datetime(movies['release_date']).apply(_to_date)
    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        movies[name] = movies[name].map(json.loads)
    return movies
#___________________________
def load_tmdb_credits(path):
    """Read the TMDB credits CSV at ``path`` and decode ``cast`` and ``crew``.

    Both columns are stored as JSON text and are decoded with ``json.loads``.
    Returns the resulting DataFrame.
    """
    credits = pd.read_csv(path)
    for name in ('cast', 'crew'):
        credits[name] = credits[name].map(json.loads)
    return credits
#___________________
# Column names that are not produced by convert_to_original_format.
# NOTE(review): presumably the columns of the older IMDB-style dataset that
# have no TMDB counterpart; the list is not referenced elsewhere in this notebook.
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews']
#____________________________________
# Rename map applied to the TMDB movie frame in convert_to_original_format:
# key = TMDB column name, value = name used in the rest of the notebook.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users'}
#_____________________________________________________
# NOTE(review): not used by any code visible in this notebook.
IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}
#_____________________________________________________
def safe_access(container, index_values):
    """Index into ``container`` step by step, returning NaN on a failed lookup.

    Parameters
    ----------
    container : indexable object (list, dict, nested mix of both)
    index_values : iterable of indices/keys applied one after another

    Returns
    -------
    The value reached after applying every index, or ``float('nan')`` when
    any step raises ``IndexError`` or ``KeyError``.
    """
    result = container
    try:
        for idx in index_values:
            result = result[idx]
    except (IndexError, KeyError):
        # `except IndexError or KeyError` evaluates to `except IndexError`,
        # so a missing dict key used to propagate; a tuple catches both.
        # float('nan') is the same value as numpy's nan and avoids the
        # `pd.np` alias, which no longer exists in current pandas.
        return float('nan')
    return result
#_____________________________________________________
def get_director(crew_data):
    """Return the name of the first crew entry whose job is 'Director'.

    Falls back to whatever ``safe_access`` returns for a missing index
    when no such entry exists.
    """
    director_names = []
    for member in crew_data:
        if member['job'] == 'Director':
            director_names.append(member['name'])
    return safe_access(director_names, [0])
#_____________________________________________________
def pipe_flatten_names(keywords):
    """Join the 'name' field of every entry in ``keywords`` with '|'."""
    names = (entry['name'] for entry in keywords)
    return '|'.join(names)
#_____________________________________________________
def convert_to_original_format(movies, credits):
    """Build a flat, IMDB-style movie table from the TMDB movies/credits frames.

    Parameters
    ----------
    movies : DataFrame whose JSON columns have already been decoded to lists of dicts
    credits : DataFrame with decoded ``cast`` and ``crew`` columns; its rows are
        matched to ``movies`` through the index

    Returns
    -------
    A new DataFrame (``movies`` is not modified) with renamed columns plus
    ``title_year``, ``country``, ``language``, ``director_name``,
    ``actor_1_name`` .. ``actor_3_name``, and with ``genres`` and
    ``plot_keywords`` flattened to '|'-separated strings.
    """
    tmdb_movies = movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    # overrides the 'language' column produced by the rename of 'original_language'
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    # Cast lists are 0-indexed: actor_1 is the first entry. The previous
    # indices 1/2/3 silently skipped the first cast member.
    for position in range(3):
        column = 'actor_{}_name'.format(position + 1)
        tmdb_movies[column] = credits['cast'].apply(
            lambda x, position=position: safe_access(x, [position, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

1. Exploration

1.1 Keywords
1.2 Filling factor: missing values
1.3 Number of films per year
1.4 Genres
2. Cleaning

2.1 Cleaning of the keywords
2.1.1 Grouping by roots
2.1.2 Groups of synonyms
2.2 Correlations
2.3 Missing values
2.3.1 Setting missing title years
2.3.2 Extracting keywords from the title
2.3.3 Imputing from regressions
3. Recommendation Engine

3.1 Basic functioning of the engine
3.1.1 Similarity
3.1.2 Popularity
3.2 Definition of the recommendation engine functions
3.3 Making meaningful recommendations
3.4 Example of recommendation: test-case
4. Conclusion: possible improvements and points to address

5. All code comes from the article below:

  [film-recommendation-engine](https://www.kaggle.com/fabiendaniel/film-recommendation-engine)

In [None]:
#Load package we need
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math, nltk, warnings
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet');nltk.download('averaged_perceptron_tagger')
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from wordcloud import WordCloud, STOPWORDS
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
#link to the google drive we choose
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Move into the Drive folder that holds the movie data and collect the
# full paths of every entry whose name contains "tmdb_5000".
import os
main_path = "/content/drive//MyDrive/adventure_time/movie data"
os.chdir(main_path)
os.listdir()  # list everything in the data directory

data = [d for d in os.listdir() if "tmdb_5000" in d]

# Paths of the selected entries (archives and/or extracted files)
import zipfile
path_file_to_zip = [os.path.join(main_path,d) for d in data]

# One-off extraction of the archives, kept for reference:
# for i in path_file_to_zip:
#   with zipfile.ZipFile(i,'r') as zip_file:
#     print(zip_file.namelist())
#     zip_file.extractall(os.getcwd())



# Import and manipulate raw data

In [None]:
# Import data
file_to_read = [f for f in path_file_to_zip if f.endswith(".csv")]

# NOTE(review): positions 0 and 1 follow the order returned by os.listdir(),
# which is not guaranteed -- confirm that [0] is the movies file and [1] the
# credits file, or select them by name.
df_movie = pd.read_csv(file_to_read[0])
df_credit = pd.read_csv(file_to_read[1])

# Some columns are stored as JSON text and must be decoded into Python objects
json_columns = ['genres', 'keywords', 'production_countries','production_companies', 'spoken_languages']
for i in json_columns:
  print("column in the mvie data: {}".format(i))
  df_movie[i] = df_movie[i].apply(json.loads)
json_columns = ['cast', 'crew']
for i in json_columns:
  print("column in the credit data: ",i)
  df_credit[i] = df_credit[i].apply(json.loads)


df_initial = convert_to_original_format(df_movie, df_credit)
print('Shape:',df_initial.shape)
#__________________________________________
# info on variable types and filling factor
# Rows are stacked with pd.concat because DataFrame.append no longer exists
# in pandas 2.x.
null_counts = df_initial.isnull().sum()
tab_info = pd.concat([
    pd.DataFrame(df_initial.dtypes).T.rename(index={0:'column type'}),
    pd.DataFrame(null_counts).T.rename(index={0:'null values'}),
    pd.DataFrame(null_counts/df_initial.shape[0]*100).T.rename(index={0:'null values (%)'}),
])
print("\nData Info:{}".format("df_initial"))
tab_info

In [None]:
# Combine the movie and credit frames into one flat frame
initial_df = convert_to_original_format(df_movie,df_credit)

# Overview 1: one row per column with its dtype, its number of missing
# values and the share of missing values (in % of the number of movies)
df_type =initial_df.dtypes.reset_index(name="type").rename(columns={"index":"column"})
df_mis =initial_df.isnull().sum().reset_index(name="missing_num").rename(columns={"index":"column"})

df_str = df_type.join(df_mis.set_index("column"),on="column")
df_str['missing_proportion'] = df_str['missing_num']/df_movie.shape[0]*100
df_str.sort_values(by="missing_proportion",ascending=False) # Columns with the most missing values come first,
                               # e.g. "homepage" and "tagline"

# View data2
# df_info = pd.DataFrame(df_initial.dtypes).T.rename(index={0:"column type"})
# df_info = df_info.append(pd.DataFrame(df_initial.isnull().sum()).T.rename(index={0:"null number"}))
# df_info = df_info.append(pd.DataFrame(df_initial.isnull().sum()/df_initial.shape[0]*100).T.rename(index = {0:"null values (%)"}))
# df_info

In [None]:
# Build the set of distinct keywords found in the plot_keywords column
set_keywords = set()

for keyword_list in df_initial.plot_keywords.str.split("|").values:
  if not isinstance(keyword_list, list): continue  # missing value (NaN)
  set_keywords.update(keyword_list)

# discard() instead of remove(): no error when no row has an empty keyword string
set_keywords.discard("")

# Count how many times each keyword occurs.
# The column must be split on "|" here as well: iterating the raw string
# walks over single characters, so multi-character keywords were never counted.
tmp = dict.fromkeys(set_keywords, 0)
for keyword_list in df_initial['plot_keywords'].str.split("|"):
  if not isinstance(keyword_list, list): continue
  for keyword in keyword_list:
    if keyword in tmp: tmp[keyword] += 1


# Manipulate the **Keyword** data in the dataframe

In [None]:
# Occurrence count per keyword, starting every known keyword at zero
keyword_count = dict.fromkeys(set_keywords, 0)
for keyword_list in df_initial["plot_keywords"].str.split("|"):
  # rows without keywords come back from str.split as a float NaN
  if type(keyword_list) is float and pd.isnull(keyword_list):
    continue
  for keyword in keyword_list:
    if keyword in set_keywords and pd.notnull(keyword):
      keyword_count[keyword] += 1


In [None]:
# Turn the count dictionary into a list of [keyword, count] pairs,
# ordered from the most to the least frequent keyword
keyword_occurences = sorted(
    ([keyword, count] for keyword, count in keyword_count.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Show the top 5 rows in the dateframe: {}".format("keywords_occurences"))
pd.DataFrame(keyword_occurences,columns=["Keyword","count"]).head()




In [None]:
# wrap upper step into a function

def count_word(df, ref_col, liste):
    """Count how often each word of ``liste`` appears in a '|'-separated column.

    Parameters
    ----------
    df : DataFrame holding the column to scan
    ref_col : name of a string column whose values look like ``"a|b|c"``
    liste : iterable of the words to count (words not in it are ignored)

    Returns
    -------
    keyword_occurences : list of ``[word, count]`` pairs, most frequent first
    keyword_count : dict mapping every word of ``liste`` to its count
    """
    keyword_count = {s: 0 for s in liste}
    for liste_keywords in df[ref_col].str.split('|'):
        # str.split yields a list for strings and a missing-value marker
        # (NaN/None) otherwise; anything that is not a list is skipped.
        if not isinstance(liste_keywords, list):
            continue
        for s in liste_keywords:
            # dict lookup is O(1) even when ``liste`` was passed as a list
            if s in keyword_count:
                keyword_count[s] += 1
#__________________________________________________________________
    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = sorted(
        ([k, v] for k, v in keyword_count.items()),
        key=lambda x: x[1], reverse=True)
    return keyword_occurences, keyword_count

In [None]:
# Same keyword census as above, done in a single call to count_word;
# the second return value (the count dictionary) is not used here
keyword_occurences, dum = count_word(df_initial, 'plot_keywords', set_keywords)

In [None]:
#_____________________________________________
# Function that control the color of the words
# WARNING: the scope of variables is used to get the value of the "tone" variable
# I could not find the way to pass it as a parameter of "random_color_func()"
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour string for one word of the wordcloud.

    The hue comes from the module-level ``tone`` variable, the saturation is
    fixed, and the lightness is drawn from ``random_state``.
    """
    hue = int(360.0 * tone / 255.0)
    saturation = int(100.0 * 255.0 / 255.0)
    drawn = float(random_state.randint(70, 120))
    lightness = int(100.0 * drawn / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)
#_____________________________________________
# UPPER PANEL: WORDCLOUD
fig = plt.figure(1, figsize=(18,13))
ax1 = fig.add_subplot(2,1,1)
#_______________________________________________________
# Dictionary {keyword: count} used to produce the wordcloud,
# restricted to the 50 most frequent keywords
words = dict()
trunc_occurences = keyword_occurences[0:50]
for s in trunc_occurences:
    words[s[0]] = s[1]
tone = 55.0 # hue input read by random_color_func (module-level variable)
#________________________________________________________
wordcloud = WordCloud(width=1000,height=300, background_color='black',
                      max_words=1628,relative_scaling=1,
                      color_func = random_color_func,
                      normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
ax1.imshow(wordcloud, interpolation="bilinear")
ax1.axis('off')
#_____________________________________________
# LOWER PANEL: HISTOGRAMS
# One bar per keyword, in the same (descending) order as trunc_occurences
ax2 = fig.add_subplot(2,1,2)
y_axis = [i[1] for i in trunc_occurences]
x_axis = [k for k,i in enumerate(trunc_occurences)]
x_label = [i[0] for i in trunc_occurences]
plt.xticks(rotation=85, fontsize = 15)
plt.yticks(fontsize = 15)
plt.xticks(x_axis, x_label)
plt.ylabel("Nb. of occurences", fontsize = 18, labelpad = 10)
ax2.bar(x_axis, y_axis, align = 'center', color='g')
#_______________________
plt.title("Keywords popularity",bbox={'facecolor':'k', 'pad':5},color='w',fontsize = 25)
plt.show()

# Divide title_year into proper interval

In [None]:
# Decade of each film counted from 1900, e.g. 1945 -> 40 and 2005 -> 100
df_initial['decade'] = df_initial['title_year'].apply(lambda x :((x-1900)//10)*10)


def get_stats(gr):
    """Return the min, max, count and mean of ``gr`` as a dictionary."""
    stats = {}
    stats['min'] = gr.min()
    stats['max'] = gr.max()
    stats['count'] = gr.count()
    stats['mean'] = gr.mean()
    return stats

# Per-decade statistics of title_year; get_stats returns a dict per group
test = df_initial['title_year'].groupby(df_initial['decade']).apply(get_stats)
display(pd.DataFrame(test).head(4))
test = test.unstack() # pivot the last index level (the statistic name) into columns
display(pd.DataFrame(test).head(4))

# How does the code below work?

In [None]:
def label(s):
    """Return the pie-chart label for decade offset ``s`` (years since 1900).

    Offsets below 50 get an empty label; offsets below 100 are shown as-is
    (``"70's"``), larger ones as a full year (``"2000's"``).
    """
    if s < 50:
        return ''
    if s < 100:
        val = s
    else:
        val = 1900 + s
    return "{}'s".format(int(val))

[label(s) for s in  test.index]



In [None]:
sns.set_context('poster',font_scale =0.85)

def label(s):
  """Return the slice label for decade offset ``s`` ('' below 50)."""
  val = (s + 1900, s)[s<100]
  chaine = '' if s < 50 else "{}'s".format(int(val))
  return chaine

plt.rc('font',weight = 'bold')
fig,ax = plt.subplots(figsize=(20,10))

labels = [label(s) for s in test.index]
sizes = test['count'].values
# One offset per slice, larger for decades with fewer than 100 films.
# The previous version produced exactly 11 offsets (ax.pie requires one per
# slice) and compared the loop index with 100, which is always true.
explode = [0.02 if size < 100 else 0.01 for size in sizes]
# Draw the pie chart; percentages of 1% or less are not printed
ax.pie(sizes, explode = explode, labels=labels,
       autopct = lambda x:'{:1.0f}%'.format(x) if x > 1 else '',
       shadow=False, startangle=0)
ax.axis('equal')
ax.set_title('% of films per decade',bbox={'facecolor':'k','pad':5},color='w',fontsize =16)
plt.show()

# Show the genres distribution

## 1.make a list of all unique genres

In [None]:
# Census of the genres, built the same way as the keyword census above:
# split every '|'-separated genre string and collect the distinct names
genre_set = set()
for genre_names in df_initial['genres'].str.split('|').values:
  genre_set.update(genre_names)

# drop the empty name (raises KeyError if it is absent)
genre_set.remove("")
display(genre_set)

## 2. Count the occurrences of each genre in the data

In [None]:
# Count, for every genre, the number of films that carry it.
# The genre strings are split once up front instead of once per genre.
genre_lists = df_initial['genres'].str.split('|').values
genres_count = dict()
genres_occurences = list()
for i in list(genre_set):
  genres_count[i] = sum(1 for j in genre_lists if i in j)
  genres_occurences.append([i,genres_count[i]])

# Sort the [genre, count] pairs by count, largest first
genres_occurences.sort(key=lambda x:x[1],reverse=True)
display(genres_occurences)

## 3.Define plot function - Word cloud

In [None]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    """Build the ``hsl(...)`` colour string used for a wordcloud word.

    Hue is derived from the module-level ``tone``; lightness is random,
    taken from ``random_state``.
    """
    hue = int(360.0 * tone / 255.0)
    saturation = int(100.0 * 255.0 / 255.0)
    drawn = float(random_state.randint(70, 120))
    lightness = int(100.0 * drawn / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

# Dictionary {genre: count} fed to the wordcloud (all genres are kept)
words = dict()
trunc_occurences = genres_occurences[0:len(genres_occurences)]
for s in trunc_occurences:
    words[s[0]] = s[1]
tone = 100 # hue input read by random_color_func (module-level variable)
f, ax = plt.subplots(figsize=(14, 6))
wordcloud = WordCloud(width=550,height=300, background_color='black',
                      max_words=1628,relative_scaling=0.7,
                      color_func = random_color_func,
                      normalize_plurals=False)
wordcloud.generate_from_frequencies(words)
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("The wordcloud of genres")
plt.axis('off')

print("Plot title: ","The wordcloud of genres")
plt.show()





In [None]:
df_initial

# Update genres column in dataframe 

In [None]:
# Replace each '|'-separated genre string by the list of its genre names
df_initial['genres'] = df_initial['genres'].str.split('|')

# Process keyword column:


## 1.Group Keywords by roots

- Use NLTK to do text normalization:

  use a [lemmatizer](https://www.datacamp.com/community/tutorials/stemming-lemmatization-python) to reduce each word to its root form

In [None]:
# Work on a copy so that the cleaning steps below do not modify df_initial
df_duplicated_cleaned =df_initial.copy()

In [None]:
# Group the plot keywords by their lemma (root form) using NLTK
lemmatizer = WordNetLemmatizer() # the lemmatizer gave better results than the Porter stemmer
keywords_roots = dict()   # root -> set of lower-cased keywords sharing that root
keywords_select = dict()
category_key = []

for raw_keywords in df_duplicated_cleaned['plot_keywords'].str.split("|"):
  # skip rows that contain an empty keyword
  if not all(raw_keywords):
    continue
  for lowered in (keyword.lower() for keyword in raw_keywords):
    root = lemmatizer.lemmatize(lowered)
    keywords_roots.setdefault(root, set()).add(lowered)


In [None]:
# Replace plot_keywords by the list of lemmatized keywords.
# Unlike the grouping loop above, the keywords are not lower-cased here.
df_duplicated_cleaned['plot_keywords']=df_duplicated_cleaned['plot_keywords'].str.split("|").apply(lambda x:[lemmatizer.lemmatize(s) for s in x])

In [None]:
def word_count(df,col):
    """Count the words found in a column whose cells are lists of words.

    Cells in which no word is truthy (empty list, only empty strings) are
    skipped. Returns ``(counts, ranking)`` where ``counts`` maps each word
    to its number of occurrences and ``ranking`` is the list of
    ``[word, count]`` pairs sorted by decreasing count.
    """
    tally = {}
    for entries in df[col]:
        if not any(entries):
            continue
        for token in entries:
            tally[token] = tally.get(token, 0) + 1

    ranking = [[token, total] for token, total in tally.items()]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return tally, ranking

In [None]:
# From here on keywords_roots maps each (lemmatized) keyword to its count,
# replacing the root -> variants dictionary built earlier
keywords_roots,keywords_counts = word_count(df_duplicated_cleaned,'plot_keywords')

print("the total amount of various keyword in the plot_keywords: {}".format(len(keywords_roots)))

## 2. Group keywords by synonyms

In [None]:
from nltk.corpus import wordnet as wn

# get the synomyms of the word 
#--------------------------------------------------------------
def get_synonymes(word):
  """Return the set of lemma names of all WordNet noun synsets of ``word``."""
  synsets = wn.synsets(word, pos=wn.NOUN)
  if not any(synsets):
    return set()

  lemma = set()
  for synset in synsets:
    lemma.update(synset.lemma_names())
  return lemma


#--------------------------------------------------------------
def detector(word):
    """List the synonyms of ``word`` that are known keywords.

    Returns the ``(synonym, count)`` pairs taken from the module-level
    ``keywords_roots``, sorted by count then name in descending order, or
    the string "no synonyme" when there is nothing to report.
    """
    lemma = get_synonymes(word)
    if not any(lemma):
      return "no synonyme"

    known = ((s, keywords_roots[s]) for s in lemma if s in keywords_roots)
    result = sorted(known, key=lambda pair: (pair[1], pair[0]), reverse=True)
    if any(result):
      return result
    return "no synonyme"
                                                          

In [None]:
 # Map every keyword to its best synonym (the first entry returned by
 # detector) whenever that synonym differs from the keyword itself.
 # detector() is evaluated once per keyword instead of up to three times.
 replacement_mot = dict()
 n=0
 for i in keywords_roots.keys():
   synonymes = detector(i)
   if synonymes == "no synonyme":
     continue
   best = synonymes[0][0]
   if i != best:
     replacement_mot[i] = best
     n+=1

 print("How many words should been replace by synonyme: {}".format(n))

In [None]:
# Replace each keyword by its synonym from replacement_mot (when it has one).
# The whole column is assigned at once: the former element-wise
# `df[col][n] = ...` is chained indexing, which is not guaranteed to write
# back to the frame, and it used a running counter as if it were an index label.
df_duplicated_cleaned['plot_keywords'] = df_duplicated_cleaned['plot_keywords'].apply(
    lambda keywords: [replacement_mot.get(s, s) for s in keywords])


In [None]:
# Recount the keywords now that synonyms have been merged
keywords_roots,keywords_counts  = word_count(df_duplicated_cleaned,'plot_keywords')
print("the total amount of various keyword in the plot_keywords: {}".format(len(keywords_roots)))

In [None]:
# Keywords that occur more than 3 times
above_threshold = [keyword for keyword, occurrences in keywords_roots.items()
                   if occurrences > 3]

print("number of keywords above threshold:",len(above_threshold))

In [None]:
# Keep only the keywords whose frequency is above the threshold.
# Membership is tested against a set: scanning the above_threshold list for
# every keyword of every film is needlessly quadratic.
kept_keywords = set(above_threshold)
tmp = [[s for s in i if s in kept_keywords]
       for i in df_duplicated_cleaned['plot_keywords']]

df_duplicated_cleaned['plot_keywords'] = tmp

In [None]:
#create a list of synonyme and it count in the data 
dic = dict()
for i in tmp:
  for s in i:
    if s in dic:
      dic[s] += 1
    else:
      dic[s] = 0

ls = list()
for k,val in dic.items():
  ls.append([k,val])

ls.sort(key=lambda x: x[1],reverse = True)

In [None]:
# Graph of keyword occurences
#----------------------------
font = {'family' : 'fantasy', 'weight' : 'normal', 'size'   : 15}
mpl.rc('font', **font)

keywords_counts.sort(key = lambda x:x[1], reverse = True)

# red curve: counts from keywords_counts (computed before the threshold filter)
y_axis = [i[1] for i in keywords_counts]
x_axis = [k for k,i in enumerate(keywords_counts)]

# blue curve: counts from ls (computed after the threshold filter)
new_y_axis = [i[1] for i in ls]
new_x_axis = [k for k,i in enumerate(ls)]

f, ax = plt.subplots(figsize=(9, 5))
ax.plot(x_axis, y_axis, 'r-', label='before cleaning')
ax.plot(new_x_axis, new_y_axis, 'b-', label='after cleaning')

# Now add the legend with some customizations.
legend = ax.legend(loc='upper right', shadow=True)
frame = legend.get_frame()
frame.set_facecolor('0.90')
# NOTE: this loop variable rebinds the module-level name `label`, which is
# also the name of the decade-label function defined in earlier cells
for label in legend.get_texts():
    label.set_fontsize('medium')

plt.ylim((0,25))
# horizontal line marking the "more than 3 occurrences" threshold
plt.axhline(y=3.5, linewidth=2, color = 'k')
plt.xlabel("keywords index", family='fantasy', fontsize = 15)
plt.ylabel("Nb. of occurences", family='fantasy', fontsize = 15)
plt.text(3500, 4.5, 'threshold for keyword delation', fontsize = 13)
plt.show()

# Plot correlation matrix

In [None]:
# Recompute the keyword counts after the cleaning steps.
# word_count returns (dict, sorted list), so keyword_count is rebound here to
# the sorted list of [keyword, count] pairs, not to a dictionary.
keywords_roots, keyword_count = word_count(df_duplicated_cleaned,"plot_keywords")

In [None]:
fig, ax = plt.subplots(figsize=(12,9))

# Calculate the correlation matrix
#__________________________________________________________
# Only numeric columns are passed to corr(): the frame also holds strings and
# lists, and recent pandas raises on them instead of silently dropping them.
corrmat = df_duplicated_cleaned.select_dtypes(include='number').corr(method ='pearson')

# Order the columns by their correlation with num_voted_users.
# nlargest returns the first n rows ordered by the given column, descending,
# so cols goes from the most to the least correlated column
# (num_voted_users itself comes first).
#__________________________________________________________
cols = corrmat.nlargest(17,'num_voted_users')['num_voted_users'].index

# Correlation coefficients between those columns, on complete rows only
#__________________________________________________________
cm = np.corrcoef(df_duplicated_cleaned[cols].dropna(how='any').values.T)

sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True,
                 fmt='.2f', annot_kws={'size': 15}, linewidth = 0.1, cmap = 'coolwarm',
                 yticklabels=cols.values, xticklabels=cols.values)

fig.text(0.5, 0.93, "Correlation coefficients", ha='center', fontsize = 18, family='fantasy')
plt.show()

# Handling Missing value

## Inspect the missing value proportion of each column

In [None]:
# One row per column: number of missing values and filling ratio (in %),
# ordered from the emptiest to the fullest column
total_rows = df_duplicated_cleaned.shape[0]
null_counts = df_duplicated_cleaned.isnull().sum()
missing_value_df = pd.DataFrame({
    'columns': null_counts.index,
    'num_missing_value': null_counts.values,
})
missing_value_df['filling_ratio'] = (1-(missing_value_df['num_missing_value']/total_rows))*100
missing_value_df = missing_value_df.sort_values(by = "filling_ratio").reset_index(drop=True)
missing_value_df

In [None]:
# Bar chart of the filling ratio of every column (emptiest column first)
y_axis = missing_value_df["filling_ratio"]
x_label = missing_value_df['columns']
x_axis = missing_value_df.index

fig = plt.figure(figsize=(11,4))
plt.xticks(rotation = 80,fontsize=14)
plt.yticks(fontsize=13)

plt.xticks(x_axis,x_label,family='fantasy',fontsize=14)
plt.ylabel('Filling ratio (%)', family='fantasy', fontsize = 16)
plt.bar(x_axis, y_axis);
plt.title("Filling ratio of reach columns",family="fantasy",fontsize=20)

# Draw two vertical lines that split the bars into three segments
N_threshold =5
plt.axvline(x=N_threshold+0.5,linewidth=2,color='r')
N_thresh = 17
plt.axvline(x=N_thresh-0.5, linewidth=2, color = 'g')

# Text boxes for the first and last segment; each shows the filling ratio
# of the bar located at the corresponding threshold position
plt.text(N_threshold-4.5, 30,"filling ratio \n < {}%".format(round(y_axis[N_threshold],1)),
         fontsize = 15, family = 'fantasy', bbox=dict(boxstyle="round",
                                                      ec=(1.0, 0.5, 0.5),
                                                      fc=(0.8, 0.5, 0.5)))

plt.text(N_thresh, 30, 'filling factor \n = {}%'.format(round(y_axis[N_thresh],1)),
         fontsize = 15, family = 'fantasy', bbox=dict(boxstyle="round",
                                                      ec=(1., 0.5, 0.5),
                                                      fc=(0.5, 0.8, 0.5)))



In [None]:
# Drop every row that has at least one missing value, then summarise the
# remaining missing values per column (all zero after dropna).
# Note that the name ends up bound to this summary table, not to the cleaned rows.
df_duplicated_missing_cleaned = df_duplicated_cleaned.copy()
df_duplicated_missing_cleaned = df_duplicated_missing_cleaned.dropna()
df_duplicated_missing_cleaned = df_duplicated_missing_cleaned.isnull().sum().reset_index()
df_duplicated_missing_cleaned.rename(columns={"index":'column',0:"missing value"},inplace = True)

In [None]:
df_duplicated_cleaned[df_duplicated_cleaned['homepage'].isnull() ==False]

# Setting missing title years

In [None]:
 def fill_year(df):
     """Fill missing ``title_year`` values from the usual years of the people involved.

     For every director/actor the mean ``title_year`` of their films is
     computed; a film without a year receives the (truncated) average of
     those means over its director and three actors. ``df`` is modified in
     place, the first replacements are printed, and ``df`` is returned.
     """
     col = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']
     #_____________________________________________________________
     # mean year of activity per person, one Series per role column
     usual_year = [df.groupby(c)['title_year'].mean() for c in col]
     #_____________________________________________
     # single dictionary person -> year; a person found in several role
     # columns gets the average of the values seen so far
     actor_year = dict()
     for yearly_mean in usual_year:
         for name, mean_year in yearly_mean.items():
             if name not in actor_year or pd.isnull(actor_year[name]):
                 actor_year[name] = mean_year
             elif pd.notnull(mean_year):
                 actor_year[name] = (actor_year[name] + mean_year) / 2

     #______________________________________
     # identification of missing title years
     missing_year_info = df[df['title_year'].isnull()]
     #___________________________
     # filling of missing values
     icount_replaced = 0
     for index, row in missing_year_info.iterrows():
         years = [actor_year.get(row[c], np.nan) for c in col if pd.notnull(row[c])]
         years = [year for year in years if pd.notnull(year)]
         if not years:
             continue
         sum_year = sum(years) / len(years)

         if int(sum_year) > 0:
             icount_replaced += 1
             # DataFrame.set_value no longer exists in pandas; .at is the
             # scalar setter that replaces it
             df.at[index, 'title_year'] = int(sum_year)
             if icount_replaced < 10:
                 print("{:<45} -> {:<20}".format(row['movie_title'],int(sum_year)))
     return df

In [None]:
# Work on a deep copy; fill_year modifies the frame it receives
df_filling = df_duplicated_cleaned.copy(deep=True)
# director and actors of the films that have no title_year
missing_year_info = df_filling[df_filling['title_year'].isnull()][[
            'director_name','actor_1_name', 'actor_2_name', 'actor_3_name']]
missing_year_info[:10]  # not displayed: only the last expression of a cell is shown
fill_year(df_filling)

In [None]:
def fill_year(df):
    """Fill missing ``title_year`` values in place; returns ``None``.

    For every director/actor the mean ``title_year`` of their films is
    computed; a film without a year receives the (truncated) average of
    those means over its director and three actors. The first replacements
    are printed.
    """
    col = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']
    #_____________________________________________________________
    # mean year of activity per person, one Series per role column
    usual_year = [df.groupby(c)['title_year'].mean() for c in col]
    #_____________________________________________
    # single dictionary person -> year; a person found in several role
    # columns gets the average of the values seen so far
    actor_year = dict()
    for yearly_mean in usual_year:
        for name, mean_year in yearly_mean.items():
            if name not in actor_year or pd.isnull(actor_year[name]):
                actor_year[name] = mean_year
            elif pd.notnull(mean_year):
                actor_year[name] = (actor_year[name] + mean_year) / 2

    #______________________________________
    # identification of missing title years
    missing_year_info = df[df['title_year'].isnull()]
    #___________________________
    # filling of missing values
    icount_replaced = 0
    for index, row in missing_year_info.iterrows():
        years = [actor_year.get(row[c], np.nan) for c in col if pd.notnull(row[c])]
        years = [year for year in years if pd.notnull(year)]
        if not years:
            continue
        sum_year = sum(years) / len(years)

        if int(sum_year) > 0:
            icount_replaced += 1
            # DataFrame.set_value no longer exists in pandas; .at is the
            # scalar setter that replaces it
            df.at[index, 'title_year'] = int(sum_year)
            if icount_replaced < 10:
                print("{:<45} -> {:<20}".format(row['movie_title'],int(sum_year)))
    return

In [None]:
fill_year(df_filling)

In [None]:
# Derive keywords from the title for films that have none.
# plot_keywords holds lists at this point, so "missing" means a null cell or
# an empty list (isnull() alone never matches a list).
icount = 0
has_no_keyword = df_filling['plot_keywords'].apply(
    lambda kws: not isinstance(kws, list) or not kws)
for index, row in df_filling[has_no_keyword].iterrows():
    icount += 1
    liste_mot = row['movie_title'].strip().split()
    new_keyword = []
    for s in liste_mot:
        lemma = get_synonymes(s)
        for t in list(lemma):
            # keep only synonyms that are known keywords; the name `keywords`
            # used here before is not defined anywhere in the notebook
            if t in keywords_roots:
                new_keyword.append(t)
    if new_keyword and icount < 15:
        print('{:<50} -> {:<30}'.format(row['movie_title'], str(new_keyword)))
    if new_keyword:
        # stored as a list, like every other cell of the column;
        # .at replaces DataFrame.set_value, which no longer exists
        df_filling.at[index, 'plot_keywords'] = new_keyword

In [None]:
# Heatmap restricted to the 9 columns most correlated with num_voted_users
# (reuses corrmat computed in the correlation-matrix cell above)
cols = corrmat.nlargest(9, 'num_voted_users')['num_voted_users'].index
# coefficients are computed on rows without any missing value in these columns
cm = np.corrcoef(df_duplicated_cleaned[cols].dropna(how='any').values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True,
                 fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Pairwise relation between gross and num_voted_users on complete rows
sns.set(font_scale=1.25)
cols = ['gross', 'num_voted_users']
# `height` is the current name of the facet-size argument (formerly `size`)
sns.pairplot(df_filling.dropna(how='any')[cols],diag_kind='kde', height = 2.5)
plt.show();

In [None]:
cols

In [None]:
def variable_linreg_imputation(df, col_to_predict, ref_col):
    """Impute missing ``col_to_predict`` values from a linear fit on ``ref_col``.

    A linear regression ``col_to_predict ~ ref_col`` is fitted on the rows
    where both columns are present, then used to fill ``col_to_predict``
    wherever it is missing and ``ref_col`` is available. ``df`` is modified
    in place; nothing is returned.
    """
    regr = linear_model.LinearRegression()
    train = df[[col_to_predict, ref_col]].dropna(how='any', axis=0)
    # column vectors of shape (n, 1)
    X = train[[ref_col]].to_numpy()
    Y = train[[col_to_predict]].to_numpy()
    regr.fit(X, Y)

    to_fill = df[col_to_predict].isnull() & df[ref_col].notnull()
    if to_fill.any():
        # predict() needs a 2-D array, so all rows are predicted in one call
        # rather than passing one scalar at a time; the assignment through
        # .loc replaces DataFrame.set_value, which no longer exists
        predicted = regr.predict(df.loc[to_fill, [ref_col]].to_numpy())
        df.loc[to_fill, col_to_predict] = predicted.ravel()

In [None]:
variable_linreg_imputation(df_filling, 'gross', 'num_voted_users')

In [None]:
# Filling factor (in %) of every column after the imputation steps,
# ordered from the emptiest to the fullest column
df = df_filling.copy(deep = True)
total_rows = len(df)
missing_df = (
    df.isnull()
      .sum(axis=0)
      .rename_axis('column_name')
      .reset_index(name='missing_count')
)
filled_rows = total_rows - missing_df['missing_count']
missing_df['filling_factor'] = filled_rows / total_rows * 100
missing_df = missing_df.sort_values('filling_factor').reset_index(drop = True)
missing_df

In [None]:
# Final working frame for the recommendation engine, with a fresh 0..n-1
# index so that positions and index labels coincide
df = df_filling.copy(deep=True)
df.reset_index(inplace = True, drop = True)

 # Extracting keywords from the title

# Imputing from regressions

# RECOMMENDATION ENGINE
---
We build the recommendation engine in two steps:
1. Determine N films with similar content
2. Choose the 5 most popular of those films to recommend

## Determine N films with content similarity 


In [None]:
def gaussian_filter(x, y, sigma):
    """Gaussian weight of the gap between ``x`` and ``y`` for width ``sigma``."""
    squared_gap = (x - y) ** 2
    return math.exp(-squared_gap / (2 * sigma ** 2))

In [None]:
def entry_variables(df,id_entry):
  """Collect the labels describing the film at position ``id_entry``.

  The result lists, in this order, the director name, the names of the
  three actors (missing names are left out) and every plot keyword.
  """
  col_labels = []

  # director
  director = df['director_name'].iloc[id_entry]
  if pd.notnull(director):
    col_labels.append(director)

  # actors 1 to 3
  actor_names = df[['actor_1_name','actor_2_name','actor_3_name']].iloc[id_entry].tolist()
  col_labels.extend(name for name in actor_names if pd.notnull(name))

  # keywords
  col_labels.extend(df['plot_keywords'].iloc[id_entry])

  return col_labels

In [None]:
def adjust_format(df, idx_col, nm_col):
    """Turn a list-valued column into a per-key table of indicator counts.

    Each list in ``nm_col`` is exploded to one row per element, the elements
    are one-hot encoded (column names are the raw element values, with no
    prefix), and the indicator rows are summed per ``idx_col`` value.

    Returns a DataFrame indexed by ``idx_col`` with one column per distinct
    element of ``nm_col``.
    """
    long_form = df.set_index(idx_col)[nm_col].explode()
    long_form = long_form.reset_index()
    indicators = pd.get_dummies(
        long_form,
        columns=[nm_col],
        prefix="",
        prefix_sep="",
    )
    return indicators.groupby(idx_col).sum()
  
# Gather the director and the three actor names of every film into one
# list-valued column so it can be exploded like genres and keywords.
df['all_staff']=df[['director_name','actor_1_name','actor_2_name','actor_3_name']].values.tolist()
# One indicator table per feature family, each indexed by movie title.
genre_list = adjust_format(df,'movie_title','genres')
keyword_list = adjust_format(df,'movie_title','plot_keywords')
staff_list = adjust_format(df,'movie_title','all_staff')
# staff_list = df[["movie_title",'director_name','actor_1_name','actor_2_name','actor_3_name']].set_index('movie_title')
# staff_list = pd.get_dummies(staff_list,columns=['director_name','actor_1_name','actor_2_name','actor_3_name'],prefix_sep="",prefix="").groupby("movie_title").sum()

# Combine the three tables column-wise on the movie-title index.
# NOTE(review): DataFrame.join raises when the tables share a column label and
# no suffix is given -- confirm no person, genre and keyword label collide.
all_list = staff_list.join(genre_list).join(keyword_list)


# [Document of K-NearestNeighbor](https://scikit-learn.org/stable/modules/neighbors.html#unsupervised-neighbors)
---
- Summary:
  1. `NearestNeighbors` is an unsupervised learner that implements nearest-neighbor search.
  2. dist: the distances from each point to its k nearest neighbors (sorted by distance in ascending order, nearest first).
  3. indices: the positions of the k nearest neighbors of each point (in the same ascending-distance order).


In [None]:
# Show the neighbours found for the chosen film.
# NOTE: `chosen_movie` and `indices` are assigned in cells that appear further
# down in this notebook, so those cells must have been run first.
display(chosen_movie)
# Titles (index labels of all_list) of the neighbours of the chosen film.
recommedation_list = all_list.iloc[indices[chosen_movie]].index.tolist()
# Rating, vote count and year of those titles; rows come back in df order.
movies_recommedation = df.loc[df['movie_title'].isin(recommedation_list),['movie_title', 'vote_average','num_voted_users', 'title_year']]
# all_list.iloc[[chosen_movie]]
# indices[chosen_movie]
# df.iloc[indices[chosen_movie]][['movie_title', 'vote_average','num_voted_users', 'title_year']]
# df.iloc[df['movie_title'].isin(recommedation_list),]
# # movies_recommedation
movies_recommedation

In [None]:
# Feature labels (director, actors, keywords) of the film at row 2 of df.
var = entry_variables(df, 2)

# Restrict the indicator table to those labels and look up, for every film,
# its 31 nearest films (Euclidean distance) in that reduced space.
X = all_list[var].to_numpy()
nbrs = NearestNeighbors(n_neighbors=31, algorithm='auto', metric='euclidean')
nbrs.fit(X)
dist, indices = nbrs.kneighbors(X)

In [None]:
def critere_selection(title_main, max_users, year1, year2, imdb_score, votes):
    """Score a candidate film for recommendation.

    The score is ``imdb_score**2 * facteur_1 * facteur_2`` where

    * ``facteur_1`` is a Gaussian weight (sigma = 20) of the gap between
      ``year1`` and ``year2``; it is 1 when either year is missing, so an
      unknown year neither rewards nor penalises the film;
    * ``facteur_2`` is a Gaussian weight of the gap between ``votes`` and
      ``max_users`` with sigma = ``max_users``; it is 0 when ``votes`` is
      missing.

    Parameters
    ----------
    title_main : unused; kept so existing callers keep working.
    max_users : largest vote count among the candidates.
    year1, year2 : the two release years to compare.
    imdb_score : rating of the candidate.
    votes : vote count of the candidate.

    Returns
    -------
    The numeric score described above.
    """
    # Both years are needed for the gap; a NaN in either one would otherwise
    # propagate through the Gaussian and turn the whole score into NaN.
    if pd.notnull(year1) and pd.notnull(year2):
        facteur_1 = gaussian_filter(year1, year2, 20)
    else:
        facteur_1 = 1

    sigma = max_users * 1.0

    if pd.notnull(votes):
        facteur_2 = gaussian_filter(votes, max_users, sigma)
    else:
        facteur_2 = 0

    return imdb_score**2 * facteur_1 * facteur_2

# Reference film: row 2 of df.
chosen_title = df["movie_title"].iloc[2]
# Position of the reference film in all_list; the rows of `indices` and
# `dist` are addressed by that position.
chosen_movie = all_list.index.tolist().index(chosen_title)
recommedation_list = all_list.iloc[indices[chosen_movie]].index.tolist()
movies_recommedation = df.loc[df['movie_title'].isin(recommedation_list),['movie_title', 'vote_average','num_voted_users', 'title_year']]

# dist[chosen_movie] is ordered like indices[chosen_movie], i.e. like
# recommedation_list -- not like the rows of movies_recommedation, which come
# back in df order.  Map each title to its distance so every film gets its own
# value (indexing dist by row number paired films with the wrong distances).
distance_by_title = dict(zip(recommedation_list, dist[chosen_movie]))

# Series.max() ignores missing vote counts.
max_users = movies_recommedation["num_voted_users"].max()
# Year of the reference film itself, not of whichever candidate happens to be
# first in df order.
year = df["title_year"].iloc[2]

# Build all rows first and create the frame once instead of concatenating a
# one-row DataFrame per film.
records = []
for film in movies_recommedation.itertuples(index=False):
    note = critere_selection(film.movie_title,
                             max_users,
                             year,
                             film.title_year,
                             film.vote_average,
                             film.num_voted_users)
    records.append({"movie_title": film.movie_title,
                    "popularity": note,
                    "simularity": distance_by_title[film.movie_title]})

sorted_list = pd.DataFrame(records, columns=["movie_title", "popularity", "simularity"])


  
  


In [None]:
# Scratch: list the (key, value) pairs of sorted_list and sort them by value,
# largest first.
# NOTE(review): sorted_list is built as a pandas DataFrame in this notebook,
# so .items() yields (column name, Series) pairs and the sort key is a whole
# Series; comparing Series inside sort() most likely raises -- confirm, or use
# sort_values on the DataFrame instead.
test1 = [(idx,val) for idx,val in sorted_list.items()]
test1.sort(key=lambda x:x[1],reverse=True)
# sorted_list.values()

In [None]:
# Show the candidates ranked by their 'popularity' score, highest first
# (the sorted frame is displayed, not stored).
sorted_list.sort_values(by="popularity",ascending=False)

In [None]:
# Spot check: vote count of the film at row 12 of df.
# df.iloc[[12]]['movie_title']
df.iloc[12]['num_voted_users']
# df.iloc[[12]]['title_year']
# df.iloc[[12]]['title_year']
# df.iloc[[12]['vte_average']]
# df.iloc[[12]]['num_voted_users']

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Look up the indicator row of a single film by its title: the movie-title
# index of all_list is turned back into a regular column and filtered on.
tmp = all_list.reset_index()
tmp.loc[tmp["movie_title"] == "Pirates of the Caribbean: Dead Man's Chest"]



In [None]:
# Feature labels (director, actors, keywords) of the film at row 12 of df.
# Note that this rebinds `var`, which is also used to build the
# nearest-neighbour input matrix.
var = entry_variables(df,12)
var

In [None]:
def add_variables(df, REF_VAR):
    """Add one 0/1 indicator column per reference label.

    For every label in ``REF_VAR`` a column of that name is created (or
    overwritten) in ``df``.  A row gets 1 when the label occurs in any of its
    'genres', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'
    or 'plot_keywords' values, and 0 otherwise.

    A cell may hold a '|'-separated string or a list/tuple/set of labels;
    null cells are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the six columns above; it is modified in place.
    REF_VAR : iterable of str
        Labels to turn into indicator columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    """
    # Set for O(1) membership tests inside the row loop.
    wanted = set(REF_VAR)

    # A scalar broadcast fills every row regardless of the index.  Assigning a
    # freshly built Series instead aligns on 0..n-1 and leaves NaN wherever
    # df's own index has other labels.
    for s in REF_VAR:
        df[s] = 0

    colonnes = ['genres', 'actor_1_name', 'actor_2_name',
                'actor_3_name', 'director_name', 'plot_keywords']
    for categorie in colonnes:
        for index, value in df[categorie].items():
            if isinstance(value, (list, tuple, set)):
                labels = value
            elif pd.isnull(value):
                continue
            else:
                labels = value.split('|')
            for s in labels:
                if s in wanted:
                    df.at[index, s] = 1
    return df

# Code testing area below
---
---

In [None]:
# Scratch cell: rename df_movie columns and try out the year / country extraction.
# Old column name -> new column name, applied to df_movie in place below.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users'}
df_movie.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES,inplace=True)
# Release year as an int, null dates left as they are; the result is not stored.
pd.to_datetime(df_movie.release_date).apply(lambda x:int(x.strftime("%Y")) if pd.notnull(x) else x)

# Name of the first production country of each film, via safe_access.
# NOTE(review): this writes to `tmdb_movies` while the rest of the cell works
# on `df_movie` -- confirm which frame is intended.
tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
df_movie.production_countries
# display(df_movie.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES).columns)
# display(df_movie.columns)


# Type of the `.year` attribute of the first parsed release date.
type(pd.to_datetime(df_movie.release_date)[0].year)

# display(type(df_movie.release_date[0]))

In [None]:
# type(df_movie.production_countries[0][0])

# Scratch: 'name' of the first production-country entry of every film,
# looked up through safe_access with the path [0, 'name'].
df_movie.production_countries.apply(lambda x: safe_access(x, [0, 'name']))
# pd.to_datetime(df_movie.release_date).apply(lambda x:int(x.year),)

In [None]:
# Scratch: print the "name" entry of the first film's production_countries value.
# NOTE(review): elsewhere in this notebook production_countries is indexed as
# [0]['name'] (position first, then key); indexing it directly with "name"
# presumably fails if the value is a list -- confirm.
for i in ["name"]:
  print(df_movie.iloc[0]["production_countries"][i])

# df_movie.iloc[0]["production_countries"]

In [None]:
# Scratch: reload the credits CSV and compare the first 'cast' cell as the
# raw string read from the file with the same cell after json.loads.
# display(df_credit['cast'])
df_credit = pd.read_csv(file_to_read[0])
# display(df_credit['cast'])
display(df_credit['cast'][0])
display(df_credit['cast'].apply(json.loads)[0])

In [None]:
# Overview of df_movie: dtype, number of missing values and percentage of
# missing values for every column.
df_type =df_movie.dtypes.reset_index(name="type").rename(columns={"index":"column"})
df_mis =df_movie.isnull().sum().reset_index(name="missing_num").rename(columns={"index":"column"})

# Attach the missing-value counts to the dtype table by column name.
df_str = df_type.join(df_mis.set_index("column"),on="column")
df_str['missing_proportion'] = df_str['missing_num']/df_movie.shape[0]*100
df_str.sort_values(by="missing_proportion",ascending=False) # Some columns have many missing values,
                               # such as "homepage" and "tagline".


In [None]:
# Empty mapping, to be filled later.
set_keyword = {}


In [None]:
# Scratch: bar chart of the genre occurrence counts. The bar plot does not
# work here; the reason is not known yet.
# NOTE(review): `fig` is not created in this cell, and the tick/label calls go
# through pyplot (`plt.xticks`, `plt.ylabel`, ...), which act on pyplot's
# current axes. If `fig` is no longer the current figure they would not touch
# `ax2`, and plt.show() would not display it -- possible cause, confirm.
# Full copy of the (label, count) pairs.
trunc_occurences = genres_occurences[0:len(genres_occurences)]

#_____________________________________________
# LOWER PANEL: HISTOGRAMS
ax2 = fig.add_subplot(2,1,2)
# Bar heights, bar positions and tick labels taken from the pairs.
y_axis = [i[1] for i in trunc_occurences]
x_axis = [k for k,i in enumerate(trunc_occurences)]
x_label = [i[0] for i in trunc_occurences]


# x_axis
# x_label
plt.xticks(rotation=85, fontsize = 15)
plt.yticks(fontsize = 15)
plt.xticks(x_axis, x_label)
plt.ylabel("Nb. of occurences", fontsize = 18, labelpad = 10)
ax2.bar(x_axis, y_axis, align = 'center', color='g')
# #_______________________
# plt.title("Keywords popularity",bbox={'facecolor':'k', 'pad':5},color='w',fontsize = 25)
plt.show()