In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from scipy import stats
import chart_studio.plotly as py
import plotly.express as px
import cufflinks as cf
import plotly.graph_objects as go

In [None]:
# Load the IMDb "name.basics" dump (tab-separated, gzip-compressed) directly from its URL.
name_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/name.basics.tsv.gz",
    sep="\t",
)
# Show only the first row as a quick look at the loaded columns.
name_df.head(n=1)

In [None]:
# Load the IMDb "title.akas" dump (tab-separated, gzip-compressed) directly from its URL.
# Every listed column is forced to a text dtype. The dtype keys must equal the column
# names exactly: the previous keys carried a trailing space ("title ", "region ", ...),
# so they did not match the columns this notebook reads (e.g. akas_df["region"]) and the
# requested dtypes were never applied to them.
# NOTE(review): pandas' default NA markers include the literal "NA", so a region code
# spelled "NA" would be read as missing -- confirm whether keep_default_na=False is wanted.
akas_df = pd.read_csv(
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    sep="\t",
    dtype={
        "titleId": "string",
        "ordering": str,
        "title": "string",
        "region": "string",
        "language": "string",
        "types": str,
        "attributes": str,
        "isOriginalTitle": str,
    },
)
# Show only the first row as a quick look at the loaded columns.
akas_df.head(1)

In [None]:
# Load the IMDb "title.basics" dump (tab-separated, gzip-compressed) directly from its URL.
# low_memory=False makes pandas infer each column's dtype from the whole file at once.
basics_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)
# Show only the first row as a quick look at the loaded columns.
basics_df.head(n=1)

In [None]:
# Load the IMDb "title.crew" dump (tab-separated, gzip-compressed) directly from its URL.
# low_memory=False makes pandas infer each column's dtype from the whole file at once.
crew_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.crew.tsv.gz",
    sep="\t",
    low_memory=False,
)
# Show only the first row as a quick look at the loaded columns.
crew_df.head(n=1)

In [None]:
# Load the IMDb "title.episode" dump (tab-separated, gzip-compressed) directly from its URL.
# low_memory=False makes pandas infer each column's dtype from the whole file at once.
episode_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.episode.tsv.gz",
    sep="\t",
    low_memory=False,
)
# Show only the first row as a quick look at the loaded columns.
episode_df.head(n=1)

In [None]:
# Load the IMDb "title.principals" dump (tab-separated, gzip-compressed) directly from its URL.
principals_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.principals.tsv.gz",
    sep="\t",
)
# Show only the first row as a quick look at the loaded columns.
principals_df.head(n=1)

In [None]:
# Load the IMDb "title.ratings" dump (tab-separated, gzip-compressed) directly from its URL.
# low_memory=False makes pandas infer each column's dtype from the whole file at once.
ratings_df = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.ratings.tsv.gz",
    sep="\t",
    low_memory=False,
)
# Show only the first row as a quick look at the loaded columns.
ratings_df.head(n=1)

# <span style="color:orange">Quels sont les pays qui produisent le plus de films ?</span> 

In [33]:
# Which countries produce the most films?
# Step 1: count the rows of akas_df per region code.
# rename_axis / reset_index(name=...) pin the resulting column names to "index" (region
# code) and "region" (row count). A bare value_counts().reset_index() names these columns
# differently depending on the pandas version, which made the 'index' lookups below fail
# on pandas >= 2.0.
# NOTE(review): this counts akas rows (one per title/region entry), which may not be the
# same thing as films produced in that country -- confirm the intended metric.
movie_By_Region2 = (
    akas_df["region"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="region")
)
# Step 2: drop the "\N" placeholder region BEFORE keeping the top 15; filtering after
# head(15) could leave fewer than 15 regions in the chart.
movie_By_Region3 = movie_By_Region2[movie_By_Region2["index"] != "\\N"].head(15)

# One bar per region code, coloured by region; the labels map the technical column
# names to the axis titles shown on the chart.
fig = px.bar(movie_By_Region3, x="index", y="region", color="index",
    title='Quels sont les pays qui produisent le plus de films ?',
    labels={'region': 'Nombre de films', 'index': 'Pays'},
    width=800, height=600)

# The legend would repeat the x-axis categories, so hide it; centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

# <span style="color:orange">Quels sont les acteurs les plus présents ?</span> 

In [None]:
# Who are the most frequent actors?
# Step 1: clean the principals table.

# Keep only acting credits. na=False treats a missing category as "not an actor"
# instead of producing a NaN entry in the mask, which boolean indexing rejects.
principals_df2 = principals_df[principals_df['category'].str.contains('actor|actress', na=False)]

# Generic / voice-only role labels that should not count as an appearance.
excluded_roles = ['Narrator', 'Various', 'Various Characters', 'Additional Voices']
# The characters column stores values in a bracketed, quoted form such as ["Guy Holden"],
# so comparing against the bare label (e.g. 'Narrator') never matches a stored value.
# Exclude both the bare and the bracketed spelling, plus the "\N" missing-value marker.
excluded_characters = ['\\N'] + excluded_roles + [f'["{role}"]' for role in excluded_roles]

# Single-pass filter; the intermediate frames principals_df3..principals_df6 are no
# longer materialised (each one was a full copy of a very large table).
principals_df7 = principals_df2[~principals_df2['characters'].isin(excluded_characters)]

In [None]:
# Who are the most frequent actors?
# Step 2: index both tables on tconst, then join them.
# Re-index the titles table and the cleaned credits table on the title id (tconst)
# so the join can run index-to-index.
basics_df2 = basics_df.set_index(keys='tconst')
principals_df8 = principals_df7.set_index(keys='tconst')
# Inner join of titles and credits: only tconst values present in both tables survive.
# The title columns (e.g. titleType) are needed to restrict to movies and shorts later.
merge_basics_principals = basics_df2.merge(
    principals_df8,
    left_index=True,
    right_index=True,
    how='inner',
)

In [None]:
# Who are the most frequent actors?
# Step 3: keep only the rows whose titleType is 'movie' or 'short'.
merge_basics_principals2 = merge_basics_principals[
    merge_basics_principals['titleType'].isin(['movie', 'short'])
]

In [None]:
# Who are the most frequent actors?
# Step 4: index both tables on nconst, then join them.
# Re-index the names table and the filtered credits on the person id (nconst);
# this replaces the tconst index of the credits frame.
name_df2 = name_df.set_index(keys='nconst')
merge_basics_principals3 = merge_basics_principals2.set_index(keys='nconst')
# Inner join so that every credit row carries the person's columns (e.g. primaryName).
merge_basics_principals4 = name_df2.merge(
    merge_basics_principals3,
    left_index=True,
    right_index=True,
    how='inner',
)

In [None]:
# Who are the most frequent actors?
# Step 5: drop very short works (e.g. cartoons of a few minutes where actors only
# provide voices).
# Remove rows whose runtime is the "\N" missing-value marker ...
merge_basics_principals5 = merge_basics_principals4[
    merge_basics_principals4['runtimeMinutes'].ne('\\N')
]
# ... convert the remaining runtimes to integers so they can be compared numerically ...
merge_basics_principals6 = merge_basics_principals5.astype(dtype={'runtimeMinutes': int})
# ... and keep only titles running strictly longer than 20 minutes.
merge_basics_principals7 = merge_basics_principals6[
    merge_basics_principals6['runtimeMinutes'].gt(20)
]

In [None]:
# Who are the most frequent actors?
# Step 6: count rows per primaryName (value_counts sorts by descending count) ...
merge_basics_principals8 = merge_basics_principals7.loc[:, 'primaryName'].value_counts()
# ... and keep the first 20 entries, i.e. the 20 most frequent names.
merge_basics_principals9 = merge_basics_principals8.iloc[:20]

In [26]:
# Who are the most frequent actors?
# Step 7: visualisation.
# merge_basics_principals9 is a Series of counts indexed by actor name. The column names
# are set explicitly ("acteur" for the name, "nombre_de_films" for the count) because a
# bare reset_index() names them differently depending on the pandas version; the previous
# lookup of an 'index' column failed on pandas >= 2.0.
# Counts go on x and names on y, as before; labels map the columns to the axis titles.
fig = px.bar(
    merge_basics_principals9.rename_axis('acteur').reset_index(name='nombre_de_films'),
    x='nombre_de_films', y='acteur', color='acteur',
    title='Quels sont les acteurs les plus présents ?',
    labels={'nombre_de_films': 'Nombre de films', 'acteur': 'Acteurs'},
    width=800, height=600)

# The legend would repeat the axis categories, so hide it; centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

# <span style="color:orange">Quels sont les acteurs les plus présents ? À quelle période ?</span> 

In [33]:
# Who are the most frequent actors, and in which period?
# Step 1: drop the titles that have no startYear (stored as the "\N" marker).
# NOTE(review): this rebinds merge_basics_principals8, a name an earlier cell uses for
# the value_counts Series -- re-running that earlier cell's head(20) after this one
# would operate on a DataFrame instead.
merge_basics_principals8 = merge_basics_principals7[
    merge_basics_principals7['startYear'].ne("\\N")
]

In [34]:
# Who are the most frequent actors, and in which period?
# Display merge_basics_principals8 (expected here to be the frame with the rows lacking
# a startYear removed -- this depends on the cells having been run in order).
merge_basics_principals8

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,ordering,category,job,characters
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137",movie,The Gay Divorcee,The Gay Divorcee,0,1934,\N,107,"Comedy,Musical,Romance",1,actor,\N,"[""Guy Holden""]"
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137",movie,Roberta,Roberta,0,1935,\N,106,"Comedy,Musical,Romance",2,actor,\N,"[""Huck Haines""]"
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137",movie,Top Hat,Top Hat,0,1935,\N,101,"Comedy,Musical,Romance",1,actor,\N,"[""Jerry Travers""]"
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137",movie,Follow the Fleet,Follow the Fleet,0,1936,\N,110,"Comedy,Musical,Romance",1,actor,\N,"[""Bake Baker""]"
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137",movie,Swing Time,Swing Time,0,1936,\N,103,"Comedy,Musical,Romance",1,actor,\N,"[""Lucky Garnett""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9993616,Ryan Mac Lennan,\N,\N,actor,tt4844148,movie,Mia and the White Lion,Mia et le lion blanc,0,2018,\N,98,"Adventure,Drama,Family",4,actor,\N,"[""Mick Owen""]"
nm9993636,Adam French,\N,\N,actor,"tt10842376,tt9532986,tt8983162,tt10402496",movie,Homeless But Happy,Homeless But Happy,0,2020,\N,90,"Documentary,Drama",3,actor,\N,"[""Bobby""]"
nm9993650,Marcin Balcerak,\N,\N,actor,tt8739208,movie,Autsajder,Autsajder,0,2018,\N,93,Drama,4,actor,\N,"[""Guardian""]"
nm9993693,Apsara Rani,\N,\N,actress,"tt13847502,tt8302382,tt8737752,tt12856788",movie,4 Letters,4 Letters,0,2019,\N,121,"Comedy,Drama,Romance",2,actress,\N,"[""Anupama""]"


In [1]:
#quelques tests

#name_df6.loc[name_df6['primaryName'] == "Nikita"]

#name_df6.loc[name_df6['primaryName'] == "Brad Pitt"]

#principals_df.loc[(principals_df['nconst'] == "nm0000093") & (principals_df['category'] == "actor")]

#principals_df[principals_df['category'].str.contains('actor|actress')]
#principals_df['nconst'].value_counts()

#name_df.loc[name_df['nconst'] == "nm10120013"]

#principals_df2 = principals_df[principals_df['category'].str.contains('actor|actress')]

#principals_df2['nconst'].value_counts()

#principals_df.loc[principals_df['nconst'] == "nm0000093"]

# name_df6['numberOfKnownForTitles'] = name_df6['knownForTitles'].str.count("tt")
# name_df6.sort_values(by=['numberOfKnownForTitles'], ascending=False).head(15)

# name_df2 = name_df.dropna(axis=0)
# name_df3 = name_df2[name_df2['primaryProfession'] != '\\N']
# name_df4 = name_df3[name_df3['birthYear'] != '\\N']
# name_df5 = name_df4[name_df4['knownForTitles'] != '\\N']
# name_df6 = name_df5[name_df5['primaryProfession'].str.contains('actor|actress')]

#akas_df2 = akas_df[(akas_df['region'] == 'FR') & (akas_df['title'] == 'Star Wars : Épisode VII - Le Réveil de la Force')]
#akas_df2 = akas_df[akas_df['region'] == 'FR']
#akas_df2 = akas_df[(akas_df['title'] == 'Mr. & Mrs. Smith') & (akas_df['region'] == 'FR')]
#akas_df2 = akas_df[akas_df['titleId'] == 'tt0000001']
#akas_df2 = akas_df[akas_df['title'] == 'Mr. & Mrs. Smith']