# Analyse des meilleurs films

## Imports des modules

In [32]:
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go 
import matplotlib.pyplot as plt
import seaborn as sns

## Lecture des datasets

In [33]:
# Download the IMDb ratings table; the file is tab-separated.
ratings_df = pd.read_csv(
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    sep="\t",
)

In [34]:
# Download the IMDb title metadata (tab-separated).
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, so each column gets a single inferred dtype.
basics_df = pd.read_csv(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [35]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1832
1,tt0000002,6.0,236
2,tt0000003,6.5,1592
3,tt0000004,6.0,153
4,tt0000005,6.2,2408


In [36]:
# Preview the first five rows of the title metadata table.
basics_df.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Filtres

Filtre sur le type `movie`

In [37]:
# Keep only the rows whose titleType is exactly 'movie'
# (other types, such as 'short', are dropped).
basics_df_movies = basics_df.loc[basics_df['titleType'].eq('movie')]
basics_df_movies.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Retrait des films pour adultes avec la colonne `isAdult`

In [38]:
# Drop adult titles. The flag is compared against the string '0', so this
# filter relies on `isAdult` holding text values rather than integers.
basics_df_movies = basics_df_movies.loc[basics_df_movies['isAdult'].eq('0')]
basics_df_movies.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Sélection des colonnes pertinentes avant le merge

In [39]:
# Keep only the join key and the descriptive columns used later on.
basics_df_stripped = basics_df_movies.loc[
    :, ['tconst', 'primaryTitle', 'startYear', 'genres']
]
basics_df_stripped.head(n=5)

Unnamed: 0,tconst,primaryTitle,startYear,genres
498,tt0000502,Bohemios,1905,\N
570,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,1907,Drama
610,tt0000615,Robbery Under Arms,1907,Drama
625,tt0000630,Hamlet,1908,Drama


## Jointure des deux dataframes

Merge des dataframes

In [40]:
# Inner join on the shared IMDb identifier: only movies that have a ratings
# row are kept.
movies_ratings = basics_df_stripped.merge(ratings_df, how='inner', on='tconst')
movies_ratings.head(n=5)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.4,15
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.0,17
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,24
4,tt0000630,Hamlet,1908,Drama,3.7,24


Réinitialisation de l'index

In [41]:
# Renumber the rows 0..n-1 and discard the previous index.
movies_ratings = movies_ratings.reset_index(drop=True)
movies_ratings.head(n=5)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.4,15
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.0,17
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,24
4,tt0000630,Hamlet,1908,Drama,3.7,24


## Sélection des données à étudier

Limite des films à un score supérieur ou égal à 8.5

In [42]:
# Keep only the movies whose average rating is at least 8.5.
movies_ratings_top = movies_ratings.loc[movies_ratings['averageRating'].ge(8.5)]
movies_ratings_top.head(n=5)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
211,tt0004295,The Man from Mexico,1914,Comedy,8.6,39
335,tt0005554,The Island of Regeneration,1915,Drama,8.6,14
387,tt0005994,Ruslan and Ludmilla,1915,\N,8.5,13
427,tt0006371,Arms and the Woman,1916,Drama,9.2,18
443,tt0006463,The Brand of Cowardice,1916,Drama,8.8,16


Tri par `numVotes` afin d'avoir une idée de l'endroit où fixer la limite

In [43]:
# Show the top-rated movies ordered from most to fewest votes (display only;
# the result is not stored).
movies_ratings_top.sort_values(by='numVotes', ascending=False)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
60334,tt0111161,The Shawshank Redemption,1994,Drama,9.3,2480651
130555,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0,2435010
165487,tt1375666,Inception,2010,"Action,Adventure,Sci-Fi",8.8,2185134
69405,tt0137523,Fight Club,1999,Drama,8.8,1954675
60186,tt0110912,Pulp Fiction,1994,"Crime,Drama",8.9,1920023
...,...,...,...,...,...,...
254879,tt7846750,UnRepresented,2019,Documentary,9.2,5
150896,tt11354272,Raghupathi Venkaiah Naidu,2019,Drama,8.6,5
199761,tt2413146,Planeta blanc,2013,"Biography,Documentary,History",8.8,5
245079,tt6487526,Angel Wagenstein: Art Is a Weapon,2017,Documentary,8.8,5


In [44]:
# Average number of votes among the top-rated movies.
movies_ratings_top.loc[:, 'numVotes'].mean()

7663.883278688525

In [45]:
# Median number of votes among the top-rated movies.
movies_ratings_top.loc[:, 'numVotes'].median()

16.0

In [46]:
# Third quartile (75th percentile) of the number of votes.
movies_ratings_top.loc[:, 'numVotes'].quantile(q=0.75)

36.0

Sélection uniquement des films ayant au moins 10 000 votes afin d'éliminer les outliers dont le score n'est pas significatif

In [47]:
# Keep only the movies with at least 10,000 votes, so that ratings backed by
# a handful of votes are excluded.
movies_ratings_top = movies_ratings_top[movies_ratings_top['numVotes'].ge(10_000)]
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
4590,tt0021749,City Lights,1931,"Comedy,Drama,Romance",8.5,176889
8232,tt0027977,Modern Times,1936,"Comedy,Drama,Family",8.5,229946
12604,tt0034583,Casablanca,1942,"Drama,Romance,War",8.5,545356
15447,tt0038650,It's a Wonderful Life,1946,"Drama,Family,Fantasy",8.6,424167
21385,tt0047396,Rear Window,1954,"Mystery,Thriller",8.5,468007
...,...,...,...,...,...,...
258381,tt8413338,Kumbalangi Nights,2019,"Comedy,Drama,Romance",8.6,12488
261358,tt8948790,Jersey,2019,"Drama,Sport",8.6,12171
264122,tt9477520,Asuran,2019,"Action,Drama",8.5,20165
264607,tt9617456,For Sama,2019,"Documentary,War",8.5,10684


In [48]:
# Split the comma-separated `genres` string into the first listed genre and the
# remainder. The split runs on the filtered frame's own column (not on the full
# `movies_ratings` table), so only the rows that are kept get processed and the
# result does not depend on index alignment with another frame.
genre_parts = (
    movies_ratings_top['genres']
    .str.split(',', n=1, expand=True)
    # Guarantee that both columns exist even if no title lists a second genre.
    .reindex(columns=[0, 1])
)
# `assign` returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
movies_ratings_top = movies_ratings_top.assign(
    mainGenre=genre_parts[0],
    secondaryGenres=genre_parts[1],
)
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes,mainGenre,secondaryGenres
4590,tt0021749,City Lights,1931,"Comedy,Drama,Romance",8.5,176889,Comedy,"Drama,Romance"
8232,tt0027977,Modern Times,1936,"Comedy,Drama,Family",8.5,229946,Comedy,"Drama,Family"
12604,tt0034583,Casablanca,1942,"Drama,Romance,War",8.5,545356,Drama,"Romance,War"
15447,tt0038650,It's a Wonderful Life,1946,"Drama,Family,Fantasy",8.6,424167,Drama,"Family,Fantasy"
21385,tt0047396,Rear Window,1954,"Mystery,Thriller",8.5,468007,Mystery,Thriller
...,...,...,...,...,...,...,...,...
258381,tt8413338,Kumbalangi Nights,2019,"Comedy,Drama,Romance",8.6,12488,Comedy,"Drama,Romance"
261358,tt8948790,Jersey,2019,"Drama,Sport",8.6,12171,Drama,Sport
264122,tt9477520,Asuran,2019,"Action,Drama",8.5,20165,Action,Drama
264607,tt9617456,For Sama,2019,"Documentary,War",8.5,10684,Documentary,War


## Graphiques

In [50]:
# Human-readable axis / legend titles for the columns used in the figure.
axis_labels = {
    'startYear': 'Year',
    'averageRating': 'Rating',
    'mainGenre': 'Genre',
    'numVotes': 'Number of Votes'
}

# 3D scatter: year x rating x main genre. Marker colour encodes the rating and
# marker size encodes the number of votes; hovering shows the movie title.
fig = px.scatter_3d(
    data_frame=movies_ratings_top,
    x='startYear',
    y='averageRating',
    z='mainGenre',
    color='averageRating',
    size='numVotes',
    size_max=25,
    opacity=0.8,
    hover_name='primaryTitle',
    labels=axis_labels,
    template='plotly_dark',
)

# Fix the canvas size in pixels, then render the figure.
fig.update_layout(width=1300, height=1000)
fig.show()