# Data exploration and cleaning

In [None]:
# Libraries import
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Dataset import

# credits and keywords are indexed by their "id" column so they can later be
# merged with the movies table on that key.
credits = pd.read_csv('./archive/credits.csv', delimiter=',', index_col="id")
keywords = pd.read_csv('./archive/keywords.csv', delimiter=',', index_col="id")

# movies: load the metadata, discard the columns that are not used afterwards,
# then remove three rows by their index label.
# NOTE(review): presumably rows 19730, 29503 and 35587 are malformed records
# in the CSV -- confirm against the raw file.
movies = (
    pd.read_csv('./archive/movies_metadata.csv', delimiter=',')
    .drop(columns=['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video'])
    .drop(index=[19730, 29503, 35587])
)

In [None]:
# Preview the first rows of the movies table (DataFrame.head shows 5 rows by default).
movies.head()

In [None]:
# Preview the first rows of the credits table (DataFrame.head shows 5 rows by default).
credits.head()

Certaines colonnes contiennent des données au format JSON (listes de dictionnaires stockées sous forme de texte).

In [None]:
# Conversion of the JSON-like text cells into plain Python objects
def get_dictionary(s):
    """Parse a cell holding the text form of a Python literal.

    Parameters
    ----------
    s : str
        Raw cell content, e.g. "[{'id': 1, 'name': 'Drama'}]". Any other
        kind of value (such as a float NaN for a missing cell) is tolerated.

    Returns
    -------
    object
        The parsed literal (a list of dicts for the example above), or an
        empty dict ``{}`` when ``s`` cannot be parsed as a literal.
    """
    try:
        # literal_eval only accepts literals (strings, numbers, lists, dicts,
        # ...), so text coming from the CSV can never execute code, which
        # eval() would allow.
        d = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable or non-string input: fall back to an empty container so
        # that callers iterating over the result simply see no entries.
        d = {}
    return d

In [None]:
def join_json_field(column, field='name'):
    """Flatten a column of JSON-like cells into comma-separated text.

    Each cell is parsed with ``get_dictionary``; the value stored under
    ``field`` in every parsed entry is converted to ``str`` and the values
    are joined with ','. A cell that yields no entries becomes ''.

    Parameters
    ----------
    column : pandas.Series
        Column whose cells hold the text form of a list of dicts.
    field : str, default 'name'
        Key to extract from each dict.

    Returns
    -------
    pandas.Series
        Same index as ``column``, one comma-separated string per cell.
    """
    return column.map(
        lambda raw: ','.join(str(entry[field]) for entry in get_dictionary(raw))
    )


# Replace the JSON-like columns by the comma-separated names they contain.
keywords['keywords'] = join_json_field(keywords['keywords'])
for movie_column in ['genres', 'production_companies', 'spoken_languages', 'production_countries']:
    movies[movie_column] = join_json_field(movies[movie_column])

# New columns derived from the cast: character names and actor names.
credits['characters'] = join_json_field(credits['cast'], field='character')
credits['actors'] = join_json_field(credits['cast'])
credits['crew'] = join_json_field(credits['crew'])

# The raw cast column is no longer needed once both derived columns exist.
# drop() is used rather than a trailing pop() so the cell does not echo the
# whole removed column as its output.
credits = credits.drop(columns=['cast'])


In [None]:
# The id column is cast to int64 before merging; according to the original
# note, the dtype it is loaded with is not suitable for the merge.
movies['id'] = movies['id'].astype('int64')

# Join movies with keywords, then with credits, on the shared "id" key
# (default inner join).
df = (
    movies
    .merge(keywords, on='id')
    .merge(credits, on='id')
)

In [None]:
# Preview the first rows of the merged table (DataFrame.head shows 5 rows by default).
df.head()

In [None]:
# Summary statistics of the merged table.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Notre dataframe contient quelques valeurs nulles, notamment dans tagline, runtime et overview. Il va falloir faire un choix quant au traitement de ces données.

In [None]:
# Column names, non-null counts and dtypes of the merged table.
df.info()

Certaines colonnes ne sont pas bien typées : release_date, budget, popularity

In [None]:
# popularity and budget are converted to float64.
for numeric_column in ('popularity', 'budget'):
    df[numeric_column] = df[numeric_column].astype('float64')

# release_date is parsed into a datetime column.
df['release_date'] = pd.to_datetime(df['release_date'])

## Visualisation

### Colonne 1 : films pour adultes (adult)

In [None]:
# Count adult and non-adult titles once. The flag is compared as text so the
# counts are correct whether the column holds 'True'/'False' strings or real
# booleans (a boolean column compared to the string 'True' would count 0).
adult_flag = df['adult'].astype(str)
adult_count = int((adult_flag == 'True').sum())
non_adult_count = int((adult_flag == 'False').sum())

plt.figure(figsize=(8,4))

# Two bubbles with fixed sizes: the areas are decorative and are not
# proportional to the counts written inside them.
plt.scatter(x=[0.5, 1.5], y=[1,1], s=[6000,15000], color=['red', 'green'])
plt.xlim(0,2)
plt.ylim(0.9,1.2)

plt.title('Distribution of Adult and Non Adult Movies', fontsize=18, weight=600)
plt.text(0.5, 1, '{}\nMovies'.format(adult_count), va='center', ha='center', fontsize=18, weight=600, color='white')
plt.text(1.5, 1, '{}\nMovies'.format(non_adult_count), va='center', ha='center', fontsize=18, weight=600, color='white')
plt.text(0.5, 1.11, 'Adult', va='center', ha='center', fontsize=17, weight=500)
plt.text(1.5, 1.11, 'Non Adult', va='center', ha='center', fontsize=17, weight=500)

plt.axis('off')
plt.show()



La catégorie "films pour adultes" n'est pas très pertinente pour entraîner un modèle. On pourrait envisager de la retirer.

### Colonne 2 : Budget

In [None]:
# Keep only the rows whose budget is different from 0.
df_plot = df.loc[df['budget'].ne(0)]

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 4))
fig.suptitle('The Influence of Budget and Revenue\non Popularity of Movies', fontsize=18, weight=600, color='#333d29')

# One regression panel per feature, always with budget on the y axis.
# NOTE(review): the figure title mentions revenue and popularity only, while
# the panels plot budget against four features -- confirm the intended title.
for panel, feature in zip(axes, ['popularity', 'revenue', 'runtime', 'vote_average']):
    sns.regplot(
        data=df_plot,
        x=feature,
        y='budget',
        scatter_kws={"color": "#06837f", "alpha": 0.6},
        line_kws={"color": "#fdc100"},
        ax=panel,
    )

plt.tight_layout()

On pourrait continuer mais ce sera plus rapide avec une heatmap

### Colonnes numériques

In [None]:
# Remove the identifier column before correlating. This removes it from df
# itself (the state later cells see is unchanged); the membership check lets
# the cell be re-run without raising a KeyError.
if 'id' in df.columns:
    df.pop('id')

# Only numeric columns can be correlated: text and datetime columns make
# DataFrame.corr() raise on pandas >= 2.0, so they are filtered out explicitly.
df_corr = df.select_dtypes(include='number')

plt.figure(figsize=(12,10))
plt.title('Correlation of Movie Features\n', fontsize=18, weight=600)
sns.heatmap(df_corr.corr(), annot=True)
plt.show()

### Colonne : overview

In [None]:
# Overviews of the movies whose original language is English, without the
# missing entries.
overview = df.loc[df['original_language'] == 'en', 'overview'].dropna()

plt.figure(figsize=(10,10))
plt.title('The Most Common Word in Movie Overviews\n', fontsize=30, weight=600, color='#333d29')

# All overviews are concatenated into one text from which the cloud is built.
wc = WordCloud(
    width=1600,
    height=800,
    max_words=500,
    min_font_size=10,
    background_color="white",
).generate(' '.join(overview))
plt.imshow(wc)

In [None]:
# Original titles of the movies whose original language is English, without
# the missing entries.
title = df.loc[df['original_language'] == 'en', 'original_title'].dropna()

plt.figure(figsize=(10,10))
plt.title('The Most Common Word in titles\n', fontsize=30, weight=600, color='#333d29')

# All titles are concatenated into one text from which the cloud is built.
wc = WordCloud(
    width=1600,
    height=800,
    max_words=500,
    min_font_size=10,
    background_color="white",
).generate(' '.join(title))
plt.imshow(wc)

In [None]:
# Taglines of the movies whose original language is English, without the
# missing entries. Stored under its own name so that it does not overwrite
# the `title` series built for the titles word cloud.
tagline = df.loc[df['original_language'] == 'en', 'tagline'].dropna()

plt.figure(figsize=(10,10))
plt.title('The Most Common Word in tagline\n', fontsize=30, weight=600, color='#333d29')

# All taglines are concatenated into one text from which the cloud is built.
wc = WordCloud(
    width=1600,
    height=800,
    max_words=500,
    min_font_size=10,
    background_color="white",
).generate(' '.join(tagline))
plt.imshow(wc)

In [None]:
# Flatten the comma-separated genre strings into one list. Empty names are
# skipped: ''.split(',') yields [''], which would otherwise be counted as a
# genre of its own for every movie without any genre.
genres_list = []
for movie_genres in df['genres']:
    genres_list.extend(name for name in movie_genres.split(',') if name)

genre_counts = Counter(genres_list)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,6))

# Left panel: bar chart of the five most frequent genres.
df_plot = pd.DataFrame(genre_counts.most_common(5), columns=['genre', 'total'])
ax = sns.barplot(data=df_plot, x='genre', y='total', ax=axes[0])
ax.set_title('Top 5 Genres in Movies', fontsize=18, weight=600)
sns.despine()

# Right panel: the same five genres plus an "Others" slice holding every
# remaining genre. It is computed as "all occurrences minus the top 5" so that
# no genre can fall between the two groups (summing from rank 6 onwards left
# the 6th genre out of the pie entirely).
others_total = int(sum(genre_counts.values()) - df_plot['total'].sum())
df_plot.loc[len(df_plot)] = {'genre': 'Others', 'total': others_total}

# The title is set on the pie's own axes instead of relying on pyplot's
# implicit "current axes".
axes[1].set_title('Percentage Ratio of Movie Genres', fontsize=18, weight=600)

# Only the last slice ("Others") is pulled out; the list length follows the
# number of slices so it stays valid when fewer than five genres exist.
explode = [0] * (len(df_plot) - 1) + [0.1]
wedges = axes[1].pie(x=df_plot['total'], labels=df_plot['genre'], autopct='%.2f%%', explode=explode)
plt.show()



## Gestion des valeurs manquantes

In [None]:
# Text columns: a missing value becomes an empty string.
df['original_language'] = df['original_language'].fillna('')
df['tagline'] = df['tagline'].fillna('')

# Numeric column runtime: missing values are replaced by 0.
# Only one strategy can take effect: once the missing values are filled, any
# further fillna() call finds nothing left to fill. To use another strategy,
# replace the 0 below with df['runtime'].mean() or df['runtime'].median().
# A nearest-neighbour imputation (see KNeighborsRegressor) would be
# statistically preferable but is heavier to compute.
df['runtime'] = df['runtime'].fillna(0)

# Drop every row that still has a missing value in any column.
df.dropna(inplace=True)