# Analyse des meilleurs films

## Imports des modules

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go 
import matplotlib.pyplot as plt
import seaborn as sns

## Lecture des datasets

In [2]:
# Rating data (tab-separated, gzip-compressed) read straight from the IMDb dump.
ratings_df = pd.read_csv(
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    sep="\t",
)

In [3]:
# Title metadata (type, titles, year, runtime, genres) from the IMDb dump.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-dtype columns.
basics_df = pd.read_csv(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [4]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1836
1,tt0000002,6.0,236
2,tt0000003,6.5,1597
3,tt0000004,6.0,153
4,tt0000005,6.2,2412


In [5]:
# Preview the first five rows of the title metadata table.
basics_df.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Filtres

Filtre sur le type `movie`

In [6]:
# Keep only rows whose title type is exactly 'movie'; other types such as
# 'short' are dropped.
basics_df_movies = basics_df.loc[basics_df['titleType'].eq('movie')]
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Retrait des films pour adultes avec la colonne `isAdult`

In [7]:
# Drop adult titles. The flag is compared numerically instead of against the
# string '0', so the filter behaves the same whether pandas parsed `isAdult`
# as text or as integers (a string comparison on an integer column would
# silently return an empty frame). Values that cannot be parsed as numbers
# become NaN and are excluded, just as they never equalled '0' before.
is_adult_flag = pd.to_numeric(basics_df_movies['isAdult'], errors='coerce')
basics_df_movies = basics_df_movies[is_adult_flag == 0]
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Sélection des colonnes pertinentes avant le merge

In [8]:
# Only the identifier, title, start year and genres are carried into the merge.
basics_df_stripped = basics_df_movies.loc[
    :, ['tconst', 'primaryTitle', 'startYear', 'genres']
]
basics_df_stripped.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres
498,tt0000502,Bohemios,1905,\N
570,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,1907,Drama
610,tt0000615,Robbery Under Arms,1907,Drama
625,tt0000630,Hamlet,1908,Drama


## Jointure des deux dataframes

Merge des dataframes

In [9]:
# Inner join on the shared `tconst` key: only films that also have a row in
# the ratings table are kept.
movies_ratings = basics_df_stripped.merge(ratings_df, how='inner', on='tconst')
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.5,14
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.2,16
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,23
4,tt0000630,Hamlet,1908,Drama,3.8,23


Réinitialisation de l'index

In [10]:
# Rebind the result instead of using inplace=True: the frame is no longer
# mutated behind the reader's back and the call stays chainable.
# drop=True discards the old index rather than adding it as a column.
movies_ratings = movies_ratings.reset_index(drop=True)
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.5,14
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.2,16
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,23
4,tt0000630,Hamlet,1908,Drama,3.8,23


## Sélection des données à étudier

Limite des films à un score supérieur ou égal à 8.4

In [11]:
# Keep films whose average rating is at least 8.4.
movies_ratings_top = movies_ratings.loc[movies_ratings['averageRating'].ge(8.4)]
movies_ratings_top.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
205,tt0004295,The Man from Mexico,1914,Comedy,8.6,39
226,tt0004496,A Prince of India,1914,Drama,8.4,14
329,tt0005554,The Island of Regeneration,1915,Drama,8.6,14
381,tt0005994,Ruslan and Ludmilla,1915,\N,8.5,13
421,tt0006371,Arms and the Woman,1916,Drama,9.2,18


Tri par `numVotes` afin d'avoir une idée de l'endroit où fixer la limite

In [12]:
# Most-voted films first, to see how the vote counts are spread.
movies_ratings_top.sort_values(by='numVotes', ascending=False)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
60323,tt0111161,The Shawshank Redemption,1994,Drama,9.3,2487387
130469,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0,2440794
165536,tt1375666,Inception,2010,"Action,Adventure,Sci-Fi",8.8,2190285
69390,tt0137523,Fight Club,1999,Drama,8.8,1959479
60176,tt0110912,Pulp Fiction,1994,"Crime,Drama",8.9,1923691
...,...,...,...,...,...,...
154164,tt11863798,The coach,2019,Documentary,8.4,5
213516,tt3437786,Solving,2014,Documentary,8.8,5
227591,tt4695942,"Gleiche Liebe, falsche Liebe?!?: Homo et alors?!?",2015,Documentary,8.4,5
234217,tt5341740,Poor Mama's Boy,2016,Drama,9.0,5


In [13]:
# Mean vote count among the top-rated films.
movies_ratings_top["numVotes"].mean()

8062.583502294312

In [14]:
# Median vote count among the top-rated films.
movies_ratings_top["numVotes"].median()

16.0

In [15]:
# Third quartile (75th percentile) of the vote counts.
movies_ratings_top["numVotes"].quantile(q=0.75)

37.0

Sélection des films ayant au moins 20000 votes afin d'éliminer les outliers ayant un score non significatif

In [16]:
# Require at least 20000 votes so that ratings backed by few votes are excluded.
movies_ratings_top = movies_ratings_top.loc[movies_ratings_top['numVotes'].ge(20000)]
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
3627,tt0019760,Man with a Movie Camera,1929,"Documentary,Music",8.4,23992
4587,tt0021749,City Lights,1931,"Comedy,Drama,Romance",8.5,177165
8228,tt0027977,Modern Times,1936,"Comedy,Drama,Family",8.5,230289
11207,tt0032553,The Great Dictator,1940,"Comedy,Drama,War",8.4,214490
12599,tt0034583,Casablanca,1942,"Drama,Romance,War",8.5,546100
...,...,...,...,...,...,...
251988,tt7392212,Rangasthalam 1985,2018,"Action,Drama",8.4,21118
254085,tt7681902,Won't You Be My Neighbor?,2018,"Biography,Documentary",8.4,24678
257766,tt8267604,Capernaum,2018,Drama,8.4,77539
259198,tt8503618,Hamilton,2020,"Biography,Drama,History",8.4,76314


In [17]:
# Split the comma-separated `genres` string into the first genre and the rest.
# The split runs on the filtered frame's own column (not on the full
# `movies_ratings` frame), so no work is wasted on rows that were filtered out
# and the result does not depend on implicit index alignment.
# reindex guarantees both columns exist even if no film has more than one genre.
genre_parts = (
    movies_ratings_top['genres']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
)
# assign() returns a new frame, which avoids writing columns onto a filtered
# slice (SettingWithCopyWarning).
movies_ratings_top = movies_ratings_top.assign(
    mainGenre=genre_parts[0],
    secondaryGenres=genre_parts[1],
)
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes,mainGenre,secondaryGenres
3627,tt0019760,Man with a Movie Camera,1929,"Documentary,Music",8.4,23992,Documentary,Music
4587,tt0021749,City Lights,1931,"Comedy,Drama,Romance",8.5,177165,Comedy,"Drama,Romance"
8228,tt0027977,Modern Times,1936,"Comedy,Drama,Family",8.5,230289,Comedy,"Drama,Family"
11207,tt0032553,The Great Dictator,1940,"Comedy,Drama,War",8.4,214490,Comedy,"Drama,War"
12599,tt0034583,Casablanca,1942,"Drama,Romance,War",8.5,546100,Drama,"Romance,War"
...,...,...,...,...,...,...,...,...
251988,tt7392212,Rangasthalam 1985,2018,"Action,Drama",8.4,21118,Action,Drama
254085,tt7681902,Won't You Be My Neighbor?,2018,"Biography,Documentary",8.4,24678,Biography,Documentary
257766,tt8267604,Capernaum,2018,Drama,8.4,77539,Drama,
259198,tt8503618,Hamilton,2020,"Biography,Drama,History",8.4,76314,Biography,"Drama,History"


In [18]:
# Optional export of the filtered table to CSV; left disabled. Uncomment to write the file.
# movies_ratings_top.to_csv('movies_ratings.csv')

## Graphiques

In [19]:
# 3D scatter of the top-rated films: year (x), rating (y), main genre (z).
# Marker colour follows the rating and marker size follows the vote count.
fig = px.scatter_3d(
    data_frame=movies_ratings_top,
    x='startYear',
    y='averageRating',
    z='mainGenre',
    color='averageRating',
    size='numVotes',
    size_max=25,
    opacity=0.8,
    hover_name='primaryTitle',
    template='plotly_dark',
    labels={
        'startYear': 'Year',
        'averageRating': 'Rating',
        'mainGenre': 'Genre',
        'numVotes': 'Number of Votes',
    },
)

fig.update_layout(
    width=1300,
    height=1000,
    scene={'zaxis': {'nticks': 11}},
    title='IMDB Top Rated Movies (>= 8.4) per Genre, Number of Votes and Year',
)
fig.show()

In [20]:
# Count of top-rated films per main genre, one pastel colour per genre.
fig = px.histogram(
    data_frame=movies_ratings_top,
    x='mainGenre',
    color='mainGenre',
    labels={'mainGenre': 'Genre'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

fig.update_layout(
    width=1300,
    height=600,
    title='IMDB Top Rated Movies (>= 8.4) Genre Distribution',
    template='plotly_dark',
)
fig.show()