# Analyse des meilleurs films

## Imports des modules

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go 
import matplotlib.pyplot as plt
import seaborn as sns

## Lecture des datasets

In [2]:
# Load the IMDb ratings table (tab-separated, gzip-compressed) straight from the URL.
ratings_df = pd.read_table("https://datasets.imdbws.com/title.ratings.tsv.gz")

In [3]:
# Load the IMDb title metadata (tab-separated, gzip-compressed).
# low_memory=False makes pandas parse each column in a single pass, which
# avoids mixed-type inference within a column.
# NOTE(review): missing values appear as the literal string "\N" in this
# file and are kept as-is by this call.
basics_df = pd.read_table(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    low_memory=False,
)

In [4]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1831
1,tt0000002,6.0,236
2,tt0000003,6.5,1591
3,tt0000004,6.0,153
4,tt0000005,6.2,2407


In [5]:
# Preview the first five rows of the title metadata table.
basics_df.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Filtres

Filtre sur le type `movie`

In [6]:
# Keep only the titles whose titleType is 'movie'.
basics_df_movies = basics_df.loc[lambda titles: titles['titleType'].eq('movie')]
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Retrait des films pour adultes avec la colonne `isAdult`

In [7]:
# Drop adult titles.
# isAdult may be parsed as strings ('0'/'1') or as numbers depending on what
# the file contains, so coerce it to numeric before comparing: comparing an
# integer column with the string '0' would silently match nothing.
# Values that cannot be parsed become NaN and are therefore excluded.
basics_df_movies = basics_df_movies[
    pd.to_numeric(basics_df_movies['isAdult'], errors='coerce') == 0
]
basics_df_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Sélection des colonnes pertinentes avant le merge

In [8]:
# Keep just the identifier, title, year and genres ahead of the join.
basics_df_stripped = basics_df_movies.loc[
    :, ['tconst', 'primaryTitle', 'startYear', 'genres']
]
basics_df_stripped.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres
498,tt0000502,Bohemios,1905,\N
570,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,1907,Drama
610,tt0000615,Robbery Under Arms,1907,Drama
625,tt0000630,Hamlet,1908,Drama


## Jointure des deux dataframes

Merge des dataframes

In [9]:
# Inner join on the title id: only movies that also have a ratings row are kept.
movies_ratings = basics_df_stripped.merge(ratings_df, on='tconst', how='inner')
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.5,14
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,735
2,tt0000591,The Prodigal Son,1907,Drama,5.2,16
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,23
4,tt0000630,Hamlet,1908,Drama,3.8,23


Réinitialisation de l'index

In [10]:
# Renumber the rows 0..n-1 and discard the previous index.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that an
# earlier cell has already displayed.
movies_ratings = movies_ratings.reset_index(drop=True)
movies_ratings.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000502,Bohemios,1905,\N,4.5,14
1,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.1,735
2,tt0000591,The Prodigal Son,1907,Drama,5.2,16
3,tt0000615,Robbery Under Arms,1907,Drama,4.5,23
4,tt0000630,Hamlet,1908,Drama,3.8,23


## Sélection des données à étudier

Limite des films à un score supérieur ou égal à 8.0

In [11]:
# Restrict to movies with an average rating of 8.0 or above.
movies_ratings_top = movies_ratings.loc[movies_ratings['averageRating'].ge(8.0)]
movies_ratings_top.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
157,tt0003748,Captain Alvarez,1914,Drama,8.3,16
189,tt0004167,Jane Eyre,1914,Drama,8.0,19
205,tt0004295,The Man from Mexico,1914,Comedy,8.6,39
213,tt0004363,Mrs. Black Is Back,1914,Comedy,8.0,18
226,tt0004496,A Prince of India,1914,Drama,8.4,14


Tri par `numVotes` afin d'avoir une idée de l'endroit où fixer la limite

In [12]:
# Most-voted titles first, to see how vote counts are distributed.
movies_ratings_top.sort_values('numVotes', ascending=False)

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
60325,tt0111161,The Shawshank Redemption,1994,Drama,9.3,2479369
130547,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0,2433653
165447,tt1375666,Inception,2010,"Action,Adventure,Sci-Fi",8.8,2183856
69395,tt0137523,Fight Club,1999,Drama,8.8,1953392
60177,tt0110912,Pulp Fiction,1994,"Crime,Drama",8.9,1919104
59595,tt0109830,Forrest Gump,1994,"Drama,Romance",8.8,1917006
68321,tt0133093,The Matrix,1999,"Action,Sci-Fi",8.7,1771638
64999,tt0120737,The Lord of the Rings: The Fellowship of the Ring,2001,"Action,Adventure,Drama",8.8,1739384
75921,tt0167260,The Lord of the Rings: The Return of the King,2003,"Action,Adventure,Drama",8.9,1718127
36115,tt0068646,The Godfather,1972,"Crime,Drama",9.2,1712575


In [13]:
# Mean number of votes among the top-rated movies.
movies_ratings_top.loc[:, 'numVotes'].mean()

9976.46484027814

In [14]:
# Median number of votes among the top-rated movies.
movies_ratings_top.loc[:, 'numVotes'].median()

19.0

Sélection uniquement des films ayant au moins 10 000 votes, afin d'éliminer les films dont le score n'est pas significatif faute de votes suffisants

In [15]:
# Keep only the titles that received at least 10000 votes.
movies_ratings_top = movies_ratings_top[movies_ratings_top['numVotes'].ge(10000)]
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
1027,tt0010323,The Cabinet of Dr. Caligari,1920,"Fantasy,Horror,Mystery",8.1,60787
1463,tt0012349,The Kid,1921,"Comedy,Drama,Family",8.3,120758
1468,tt0012364,The Phantom Carriage,1921,"Drama,Fantasy,Horror",8.1,11761
1928,tt0014429,Safety Last!,1923,"Action,Comedy,Thriller",8.1,19756
2088,tt0015064,The Last Laugh,1924,Drama,8.1,13514
2157,tt0015324,Sherlock Jr.,1924,"Action,Comedy,Romance",8.2,46364
2241,tt0015648,Battleship Potemkin,1925,"Drama,History,Thriller",8.0,55802
2305,tt0015864,The Gold Rush,1925,"Adventure,Comedy,Drama",8.2,106946
2605,tt0016847,Faust,1926,"Drama,Fantasy,Horror",8.1,14726
2713,tt0017136,Metropolis,1927,"Drama,Sci-Fi",8.3,167508


In [16]:
# Split the comma-separated genres into the first genre and the remainder.
# The split is computed on the filtered frame itself rather than on the full
# movies_ratings table, so no work is done for rows that were filtered out
# and the result does not depend on index alignment with another frame.
# reindex guarantees both columns exist even when no title has a second
# genre (expand=True would otherwise yield a single column).
genre_parts = (
    movies_ratings_top['genres']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# assign() returns a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
movies_ratings_top = movies_ratings_top.assign(
    mainGenre=genre_parts[0],
    secondaryGenres=genre_parts[1],
)
movies_ratings_top

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes,mainGenre,secondaryGenres
1027,tt0010323,The Cabinet of Dr. Caligari,1920,"Fantasy,Horror,Mystery",8.1,60787,Fantasy,"Horror,Mystery"
1463,tt0012349,The Kid,1921,"Comedy,Drama,Family",8.3,120758,Comedy,"Drama,Family"
1468,tt0012364,The Phantom Carriage,1921,"Drama,Fantasy,Horror",8.1,11761,Drama,"Fantasy,Horror"
1928,tt0014429,Safety Last!,1923,"Action,Comedy,Thriller",8.1,19756,Action,"Comedy,Thriller"
2088,tt0015064,The Last Laugh,1924,Drama,8.1,13514,Drama,
2157,tt0015324,Sherlock Jr.,1924,"Action,Comedy,Romance",8.2,46364,Action,"Comedy,Romance"
2241,tt0015648,Battleship Potemkin,1925,"Drama,History,Thriller",8.0,55802,Drama,"History,Thriller"
2305,tt0015864,The Gold Rush,1925,"Adventure,Comedy,Drama",8.2,106946,Adventure,"Comedy,Drama"
2605,tt0016847,Faust,1926,"Drama,Fantasy,Horror",8.1,14726,Drama,"Fantasy,Horror"
2713,tt0017136,Metropolis,1927,"Drama,Sci-Fi",8.3,167508,Drama,Sci-Fi


## Graphiques

In [51]:
# 2D scatter: release year against average rating, marker size driven by the
# number of votes and one colour per main genre.
fig = px.scatter(
    data_frame=movies_ratings_top,
    x='startYear',
    y='averageRating',
    color='mainGenre',
    size='numVotes',
    labels=dict(startYear='Year', averageRating='Rating', mainGenre='Genre'),
    color_discrete_sequence=px.colors.cyclical.Edge,
    template='plotly',
    title='Ratings',
).update_layout(width=1000, height=600)
fig.show()

In [56]:
# 3D scatter: year (x), rating (y) and main genre (z); markers are coloured by
# rating and sized by the number of votes.
fig = px.scatter_3d(
    data_frame=movies_ratings_top,
    x='startYear',
    y='averageRating',
    z='mainGenre',
    color='averageRating',
    size='numVotes',
    size_max=25,
    opacity=0.8,
    labels=dict(
        startYear='Year',
        averageRating='Rating',
        mainGenre='Genre',
        numVotes='Number of Votes',
    ),
    template='plotly_dark',
).update_layout(width=1300, height=1000)
fig.show()