# Analyse des meilleurs films

## Imports des modules

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go 
import matplotlib.pyplot as plt
import seaborn as sns

## Lecture des datasets

In [3]:
ratings_df = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", sep="\t")

In [4]:
basics_df = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep="\t", low_memory=False)

In [5]:
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1832
1,tt0000002,6.0,236
2,tt0000003,6.5,1592
3,tt0000004,6.0,153
4,tt0000005,6.2,2408


In [6]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Filtres

Filtre sur le type `movie`

In [7]:
basics_df_movies =  basics_df[basics_df['titleType'] == 'movie']
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Retrait des films pour adultes avec la colonne `isAdult`

In [8]:
basics_df_movies = basics_df_movies[basics_df_movies['isAdult'] == '0']
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Sélection des colonnes pertinentes avant le merge

In [9]:
basics_df_stripped = basics_df_movies[['tconst', 'primaryTitle', 'startYear', 'genres']]
basics_df_stripped.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres
498,tt0000502,Bohemios,1905,\N
570,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,1907,Drama
610,tt0000615,Robbery Under Arms,1907,Drama
625,tt0000630,Hamlet,1908,Drama


## Jointure des deux dataframes

Merge des dataframes

In [10]:
movies_ratings = pd.merge(basics_df_stripped, ratings_df, how='inner', left_on='tconst', right_on='tconst')
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.4,15
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.0,17
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,24
4,tt0000630,Hamlet,1908,Drama,3.7,24


Réinitialisation de l'index

In [11]:
movies_ratings.reset_index(drop=True, inplace=True)
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.4,15
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,736
2,tt0000591,The Prodigal Son,1907,Drama,5.0,17
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,24
4,tt0000630,Hamlet,1908,Drama,3.7,24


## Sélection des données à étudier

Limite des films à un score supérieur ou égal à 8.0

In [12]:
movies_ratings_top = movies_ratings[movies_ratings['averageRating'] >= 8.0]
movies_ratings_top.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
163,tt0003748,Captain Alvarez,1914,Drama,8.3,16
195,tt0004167,Jane Eyre,1914,Drama,8.0,19
211,tt0004295,The Man from Mexico,1914,Comedy,8.6,39
219,tt0004363,Mrs. Black Is Back,1914,Comedy,8.0,18
232,tt0004496,A Prince of India,1914,Drama,8.4,14


Tri par `numvotes` afin d'avoir une idée de où fixer la limite

In [13]:
movies_ratings_top.sort_values(by=['numVotes'], ascending=False)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
60334,tt0111161,The Shawshank Redemption,1994,Drama,9.3,2480651
130555,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0,2435010
165487,tt1375666,Inception,2010,"Action,Adventure,Sci-Fi",8.8,2185134
69405,tt0137523,Fight Club,1999,Drama,8.8,1954675
60186,tt0110912,Pulp Fiction,1994,"Crime,Drama",8.9,1920023
...,...,...,...,...,...,...
159719,tt1284984,Heimatklänge,1994,\N,8.8,5
241710,tt6139792,Sarasota Half in Dream,2016,Documentary,8.2,5
159543,tt12821820,Agha Saeed,1988,Documentary,8.0,5
158950,tt12729378,Kings of Europe: The Chelsea Story,2012,Documentary,8.6,5


In [14]:
movies_ratings_top['numVotes'].mean()

9959.015175219023

In [15]:
movies_ratings_top['numVotes'].median()

19.0

In [16]:
movies_ratings_top['numVotes'].quantile(0.75)

50.0

Sélection des films ayant uniquement plus de 10000 votes afin d'éliminer les outliers ayant un score non significatif

In [17]:
movies_ratings_top = movies_ratings_top.loc[movies_ratings_top['numVotes'] >= 10000]
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
1033,tt0010323,The Cabinet of Dr. Caligari,1920,"Horror,Mystery,Thriller",8.1,60826
1469,tt0012349,The Kid,1921,"Comedy,Drama,Family",8.3,120811
1474,tt0012364,The Phantom Carriage,1921,"Drama,Fantasy,Horror",8.1,11772
1934,tt0014429,Safety Last!,1923,"Action,Comedy,Thriller",8.1,19762
2094,tt0015064,The Last Laugh,1924,Drama,8.1,13522
...,...,...,...,...,...,...
264122,tt9477520,Asuran,2019,"Action,Drama",8.5,20165
264183,tt9500372,Pocket Hercules: Naim Süleymanoglu,2019,"Biography,Drama,Sport",8.3,10781
264607,tt9617456,For Sama,2019,"Documentary,War",8.5,10684
265587,tt9817070,Just 6.5,2019,"Action,Crime,Drama",8.0,11446


In [18]:
movies_ratings_top[['mainGenre', 'secondaryGenres']] = movies_ratings['genres'].str.split(',', n=1, expand=True)
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes,mainGenre,secondaryGenres
1033,tt0010323,The Cabinet of Dr. Caligari,1920,"Horror,Mystery,Thriller",8.1,60826,Horror,"Mystery,Thriller"
1469,tt0012349,The Kid,1921,"Comedy,Drama,Family",8.3,120811,Comedy,"Drama,Family"
1474,tt0012364,The Phantom Carriage,1921,"Drama,Fantasy,Horror",8.1,11772,Drama,"Fantasy,Horror"
1934,tt0014429,Safety Last!,1923,"Action,Comedy,Thriller",8.1,19762,Action,"Comedy,Thriller"
2094,tt0015064,The Last Laugh,1924,Drama,8.1,13522,Drama,
...,...,...,...,...,...,...,...,...
264122,tt9477520,Asuran,2019,"Action,Drama",8.5,20165,Action,Drama
264183,tt9500372,Pocket Hercules: Naim Süleymanoglu,2019,"Biography,Drama,Sport",8.3,10781,Biography,"Drama,Sport"
264607,tt9617456,For Sama,2019,"Documentary,War",8.5,10684,Documentary,War
265587,tt9817070,Just 6.5,2019,"Action,Crime,Drama",8.0,11446,Action,"Crime,Drama"


In [27]:
movies_ratings_top.sort_values('averageRating', ascending=False, inplace=True)
movies_ratings_top = movies_ratings_top.iloc[:100,:]

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes,mainGenre,secondaryGenres
95643,tt0252487,The Chaos Class,1975,"Comedy,Drama",9.3,39207,Comedy,Drama
60334,tt0111161,The Shawshank Redemption,1994,Drama,9.3,2480651,Drama,
36124,tt0068646,The Godfather,1972,"Crime,Drama",9.2,1713340,Crime,Drama
143183,tt10280296,Sardar Udham,2021,"Biography,Crime,Drama",9.1,22618,Biography,"Crime,Drama"
234112,tt5354160,Mirror Game,2016,"Crime,Mystery,Thriller",9.1,25151,Crime,"Mystery,Thriller"
...,...,...,...,...,...,...,...,...
132265,tt0482571,The Prestige,2006,"Drama,Mystery,Sci-Fi",8.5,1252179,Drama,"Mystery,Sci-Fi"
134525,tt0770802,Samsara,2011,"Documentary,Music",8.5,34760,Documentary,Music
33093,tt0064116,Once Upon a Time in the West,1968,Western,8.5,315842,Western,
216392,tt3674140,The Salt of the Earth,2014,"Biography,Documentary,History",8.5,20935,Biography,"Documentary,History"




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Graphiques

In [31]:
fig = px.scatter(movies_ratings_top,
    x='startYear',
    y='averageRating',
    size='numVotes',
    color='mainGenre',
    color_discrete_sequence= px.colors.cyclical.Edge,
    title='Ratings',
    template='plotly',
    labels={
        'startYear': 'Year',
        'averageRating': 'Rating',
        'mainGenre': 'Genre'
    }
)
fig.update_layout(width=1000, height=600)
fig.show()

In [29]:
fig = px.scatter_3d(movies_ratings_top, 
    x='startYear', 
    y='averageRating', 
    z='mainGenre', 
    color='averageRating', 
    size='numVotes', 
    opacity = 0.8,
    labels={
        'startYear': 'Year',
        'averageRating': 'Rating',
        'mainGenre': 'Genre',
        'numVotes': 'Number of Votes'
    }, 
    size_max=25,
    template='plotly_dark',
    hover_name='primaryTitle'
)


fig.update_layout(width=1300, height=1000)
fig.show()