In [187]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from scipy import stats
import chart_studio.plotly as py
import plotly.express as px
import cufflinks as cf
import plotly.graph_objects as go
import seaborn as sns

import ipywidgets as widgets

In [2]:
# Load the IMDb name.basics table from the public gzip-compressed TSV dump.
NAME_BASICS_URL = "https://datasets.imdbws.com/name.basics.tsv.gz"
name_df = pd.read_csv(NAME_BASICS_URL, delimiter="\t")
# Show the first row as a quick check that the columns parsed as expected.
name_df.head(1)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0072308,tt0031983,tt0050419,tt0053137"


In [3]:
# Load the IMDb title.akas table from the public gzip-compressed TSV dump.
# The dtype keys have to match the header names exactly. The previous keys carried a
# trailing space ("title ", "region ", "language ", "types ", "attributes "), so they
# matched no column and the requested dtypes were not applied to those columns.
akas_df = pd.read_csv(
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    sep="\t",
    dtype={
        "titleId": "string",
        "ordering": str,
        "title": "string",
        "region": "string",
        "language": "string",
        "types": str,
        "attributes": str,
        "isOriginalTitle": str,
    },
)
# Show the first row as a quick check that the columns parsed as expected.
akas_df.head(1)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0


In [3]:
# Load the IMDb title.basics table from the public gzip-compressed TSV dump.
# low_memory=False makes pandas infer each column's dtype from the whole file.
TITLE_BASICS_URL = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics_df = pd.read_csv(TITLE_BASICS_URL, delimiter="\t", low_memory=False)
# Show the first row as a quick check that the columns parsed as expected.
basics_df.head(1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"


In [None]:
# Load the IMDb title.crew table from the public gzip-compressed TSV dump.
TITLE_CREW_URL = "https://datasets.imdbws.com/title.crew.tsv.gz"
crew_df = pd.read_csv(TITLE_CREW_URL, delimiter="\t", low_memory=False)
# Show the first row as a quick check that the columns parsed as expected.
crew_df.head(1)

In [None]:
# Load the IMDb title.episode table from the public gzip-compressed TSV dump.
TITLE_EPISODE_URL = "https://datasets.imdbws.com/title.episode.tsv.gz"
episode_df = pd.read_csv(TITLE_EPISODE_URL, delimiter="\t", low_memory=False)
# Show the first row as a quick check that the columns parsed as expected.
episode_df.head(1)

In [4]:
# Load the IMDb title.principals table from the public gzip-compressed TSV dump.
TITLE_PRINCIPALS_URL = "https://datasets.imdbws.com/title.principals.tsv.gz"
principals_df = pd.read_csv(TITLE_PRINCIPALS_URL, delimiter="\t")
# Show the first row as a quick check that the columns parsed as expected.
principals_df.head(1)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"


In [None]:
# Load the IMDb title.ratings table from the public gzip-compressed TSV dump.
TITLE_RATINGS_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings_df = pd.read_csv(TITLE_RATINGS_URL, delimiter="\t", low_memory=False)
# Show the first row as a quick check that the columns parsed as expected.
ratings_df.head(1)

# <span style="color:orange">Quels sont les pays qui produisent le plus de films ?</span> 

In [None]:
# Which countries produce the most films?
# Step 1: count the rows of the akas table per region.
# rename_axis + reset_index(name=...) pin the column names to 'index' (region code)
# and 'region' (row count) whatever the pandas version: on pandas >= 2.0 a bare
# value_counts().reset_index() names them 'region' and 'count' instead, which breaks
# the 'index' lookup below and the column names used by the chart.
region_counts = akas_df["region"].value_counts().rename_axis("index").reset_index(name="region")
# Same content as before (top 15 including any '\N' row); kept so that cells still
# reading this name keep working.
movie_By_Region2 = region_counts.head(15)
# Remove the '\N' placeholder region *before* taking the top 15, so the placeholder
# cannot use up one of the 15 slots.
movie_By_Region3 = region_counts[region_counts['index'] != '\\N'].head(15)

# One bar per region, coloured by region, with the count written on each bar.
fig = px.bar(
    movie_By_Region3,
    x="index",
    y='region',
    color='index',
    text='region',
    title='Quels sont les pays qui produisent le plus de films ?',
    labels={'region': 'Nombre de films', 'index': 'Pays'},
    width=800,
    height=600,
)

# Format the bar labels with two significant digits and place them outside the bars.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

# Single layout call: minimum label size, no legend, centred title, hidden y axis.
fig.update_layout(
    uniformtext_minsize=8,
    showlegend=False,
    title_x=0.5,
    yaxis={'visible': False},
)

# <span style="color:orange">Quels sont les acteurs les plus présents ?</span> 

In [6]:
# Who are the most frequent actors?
# Step 1: clean the principals table.
principals_df2 = principals_df[["tconst", "nconst", "category", "characters"]]
# Keep actor/actress credits that name a character ('\N' marks a missing value).
is_acting_credit = principals_df2['category'].str.contains('actor|actress')
has_character = principals_df2['characters'] != '\\N'
principals_df3 = principals_df2[is_acting_credit & has_character]
# The 'characters' column stores a bracketed, quoted list such as '["Self"]', so the
# generic roles must be matched in that form. The previous comparison against the
# bare words 'Narrator', 'Various' and 'Additional Voices' never matched any row and
# therefore filtered nothing.
GENERIC_ROLES = ['["Narrator"]', '["Various"]', '["Additional Voices"]']
principals_df4 = principals_df3[~principals_df3['characters'].isin(GENERIC_ROLES)]

In [22]:
# Who are the most frequent actors?
# Step 2: index both tables on tconst, then join them.
title_columns = ["tconst", "titleType", "startYear", "runtimeMinutes", "genres"]
basics_df2 = basics_df[title_columns]
basics_df3 = basics_df2.set_index('tconst')
principals_df5 = principals_df4.set_index('tconst')
# Inner join on the shared tconst index: every remaining credit gets its title's
# type, year, runtime and genres, which the later movie/short filter relies on.
actors_occurence_df = basics_df3.merge(
    principals_df5,
    how='inner',
    left_index=True,
    right_index=True,
)

In [24]:
# Who are the most frequent actors?
# Step 3: keep only the rows whose titleType is 'movie' or 'short'.
wanted_title_types = ['movie', 'short']
is_wanted_type = actors_occurence_df['titleType'].isin(wanted_title_types)
actors_occurence_df2 = actors_occurence_df[is_wanted_type]

In [27]:
# Who are the most frequent actors?
# Step 4: index the name table and the credits on nconst, then join them so that
# each credit carries the person's primaryName.
name_df2 = name_df[["nconst", "primaryName"]]
name_df3 = name_df2.set_index('nconst')
actors_occurence_df3 = actors_occurence_df2.set_index('nconst')
actors_occurence_df4 = name_df3.merge(
    actors_occurence_df3,
    how='inner',
    left_index=True,
    right_index=True,
)

In [28]:
# Who are the most frequent actors?
# Step 5: drop very short works (e.g. cartoons of a few minutes where the actors
# only provide voices).
# Rows without a runtime ('\N') are removed first so the column can be cast to int.
has_runtime = actors_occurence_df4['runtimeMinutes'] != '\\N'
actors_occurence_df5 = actors_occurence_df4[has_runtime]
actors_occurence_df6 = actors_occurence_df5.astype({"runtimeMinutes": int})
# Keep only titles strictly longer than 20 minutes.
longer_than_20_min = actors_occurence_df6['runtimeMinutes'].gt(20)
actors_occurence_df7 = actors_occurence_df6[longer_than_20_min]

In [29]:
# Who are the most frequent actors?
# Step 6: count the credits per primaryName (value_counts sorts them in descending
# order), then keep the first 20 entries.
actors_occurence_df8 = actors_occurence_df7.loc[:, 'primaryName'].value_counts()
actors_occurence_df9 = actors_occurence_df8.iloc[:20]

In [30]:
# Who are the most frequent actors?
# Step 7: visualisation.
# Name the two columns explicitly: 'index' holds the actor name and 'primaryName' the
# number of credits. A bare reset_index() only yields these names on pandas < 2.0;
# from 2.0 on value_counts() names them 'primaryName' and 'count', and the chart
# below would fail on the missing 'index' column.
top_actors = actors_occurence_df9.rename_axis('index').reset_index(name='primaryName')
fig = px.bar(
    top_actors,
    x="primaryName",
    y='index',
    color='index',
    title='Quels sont les acteurs les plus présents ?',
    labels={'primaryName': 'Nombre de films', 'index': 'Acteurs'},
    width=800,
    height=600,
)

# Hide the legend and centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

# <span style="color:orange">Quels sont les acteurs les plus présents ? À quelle période ?</span>

In [35]:
# Step 1: drop the titles that have no startYear ('\N'), keep only the primaryName
# and startYear columns to lighten the dataset, and renumber the rows from 0.
# This is done without reassigning actors_occurence_df8 / actors_occurence_df9: those
# names already hold the value_counts result and its top 20 from the previous
# section, and overwriting them with unrelated frames made the earlier bar-chart cell
# plot different data when re-run.
has_start_year = actors_occurence_df7['startYear'] != "\\N"
actors_occurence_df10 = (
    actors_occurence_df7
    .loc[has_start_year, ["primaryName", "startYear"]]
    .reset_index(drop=True)
)

In [38]:
# Step 2: helper that turns a year into its decade label
def find_decade(year):
    """Return the decade containing ``year`` as a ``"start - end"`` label.

    ``start`` is ``year`` rounded down to a multiple of ten and ``end`` is
    ``start + 10``; for example 1934 gives ``"1930 - 1940"``.
    """
    start = year // 10 * 10
    end = start + 10
    return " - ".join((str(start), str(end)))

print(find_decade(1934))


1930 - 1940


In [40]:
# Step 3: apply the helper — cast startYear to int, then replace every year with its
# decade label. assign() returns a new frame and leaves actors_occurence_df10 as is.
decade_labels = actors_occurence_df10["startYear"].astype(int).apply(find_decade)
actors_occurence_df11 = actors_occurence_df10.assign(startYear=decade_labels)

In [49]:
# Step 4: give the frame a fresh 0..n-1 index.
# drop=True with reassignment keeps this cell idempotent. The previous in-place
# reset_index() copied the old index into a new column every time the cell was run
# ('index' first, then 'level_0') and raised once those names were taken; that
# column was not used by the grouping that follows.
actors_occurence_df11 = actors_occurence_df11.reset_index(drop=True)

In [84]:
# Step 5: number of rows per (startYear, primaryName) pair, stored in a one-column
# frame named 'count' that keeps the two grouping keys as its index.
rows_per_decade_and_actor = actors_occurence_df11.groupby(['startYear', 'primaryName']).size()
df_grouped_notreset = rows_per_decade_and_actor.to_frame(name='count')
df_grouped_notreset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
1890 - 1900,Blanche Bayliss,1
1890 - 1900,Chauncey Depew,1
1890 - 1900,William Courtenay,1
1900 - 1910,Adelaide Fitz-Allen,1
1900 - 1910,Alexandre Arquillière,1


In [85]:
# Step 6: order the rows by startYear and then by count, both descending.
df_grouped_notreset_sorted = df_grouped_notreset.sort_values(
    by=['startYear', 'count'],
    ascending=[False, False],
)
df_grouped_notreset_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
2020 - 2030,Eric Roberts,30
2020 - 2030,Ross K. Foad,24
2020 - 2030,Simon Hill,24
2020 - 2030,Cascade Nerida,15
2020 - 2030,Shawn C. Phillips,15


In [86]:
# Same (startYear, primaryName) row counts as above, kept as a plain Series so that
# a per-decade nlargest() can be applied to it in the next step.
grouping_keys = ['startYear', 'primaryName']
groupedDf = actors_occurence_df11.groupby(by=grouping_keys).size()
groupedDf.head()

startYear    primaryName          
1890 - 1900  Blanche Bayliss          1
             Chauncey Depew           1
             William Courtenay        1
1900 - 1910  Adelaide Fitz-Allen      1
             Alexandre Arquillière    1
dtype: int64

In [195]:
# Step 7: keep only the single largest count within each startYear group
# (nlargest(1), i.e. one actor per decade).
top_per_decade = groupedDf.groupby(level='startYear').nlargest(1)
# nlargest prepends the group key as an extra index level; drop that duplicate so
# the index is (startYear, primaryName) again, then wrap the counts in a frame.
df_final = top_per_decade.reset_index(level=0, drop=True).to_frame(name='count')
df_final.head(60)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
1890 - 1900,Blanche Bayliss,1
1900 - 1910,Adelaide Fitz-Allen,1
1910 - 1920,Theodore Roberts,46
1920 - 1930,Hoot Gibson,64
1930 - 1940,Bob Steele,66
1940 - 1950,Al St. John,79
1950 - 1960,Toshirô Mifune,49
1960 - 1970,Türkan Soray,73
1970 - 1980,Cüneyt Arkin,94
1980 - 1990,Mohanlal,101


In [197]:
# Display the last 12 rows of df_final.
df_final.iloc[-12:]

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
1910 - 1920,Theodore Roberts,46
1920 - 1930,Hoot Gibson,64
1930 - 1940,Bob Steele,66
1940 - 1950,Al St. John,79
1950 - 1960,Toshirô Mifune,49
1960 - 1970,Türkan Soray,73
1970 - 1980,Cüneyt Arkin,94
1980 - 1990,Mohanlal,101
1990 - 2000,Simon Yam,57
2000 - 2010,Seiji Nakamitsu,118


In [198]:
# Turn the (startYear, primaryName) index levels into ordinary columns for plotting.
# The guard and the reassignment make the cell safe to re-run: the previous in-place
# reset_index() pushed the current index into a new column on every execution, so a
# second run added a stray 'index' column.
if isinstance(df_final.index, pd.MultiIndex):
    df_final = df_final.reset_index()
# Copy the row position into a 'test' column.
df_final["test"] = df_final.index

In [111]:
# Print df_final's column names, dtypes and non-null counts.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   startYear    42 non-null     object
 1   primaryName  42 non-null     object
 2   count        42 non-null     int64 
 3   test         42 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 1.4+ KB


In [202]:
# Keep the last 12 rows of df_final for the charts below.
df_final2 = df_final.iloc[-12:]

In [214]:
# Step 8: visualisation — one bar per decade, labelled with the actor's name.
bar_options = dict(
    x="startYear",
    y='count',
    color='primaryName',
    text='primaryName',
    title='Quels sont les acteurs les plus présents par périodes ?',
    labels={'startYear': 'Période', 'primaryName': 'Acteurs'},
    width=1300,
    height=800,
)
fig = px.bar(df_final2, **bar_options)

# Write the actor names above the bars in a 12pt font.
fig.update_traces(textfont_size=12, textposition='outside')

# Hide the legend and centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

In [204]:
# Alternative view: one bar per actor, coloured and labelled with the decade.
fig = px.bar(
    df_final2,
    x="primaryName",
    y='count',
    color='startYear',
    text='startYear',
    title='Quels sont les acteurs les plus présents par périodes ?',
    labels={'startYear': 'Période', 'primaryName': 'Acteurs'},
    width=1200,
    height=900,
)

# Larger bar labels and x tick labels rotated by 45 degrees.
fig.update_traces(textfont_size=14)
fig.update_xaxes(tickangle=45)

# Hide the legend and centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

In [190]:
def change_df(decade):
    """Print a short sentence reporting the value currently selected in the widget."""
    message = f'the choosen df is {decade}'
    print(message)

# Dropdown with three decade labels; change_df is called with the selected one.
decade_choices = ["1920 - 1930", "1930 - 1940", "1940 - 1950"]
widgets.interact(change_df, decade=decade_choices)


interactive(children=(Dropdown(description='decade', options=('1920 - 1930', '1930 - 1940', '1940 - 1950'), va…

<function __main__.change_df(decade)>

In [206]:
# Per-decade chart again (1200x900), with the actor names drawn on the bars.
chart_title = 'Quels sont les acteurs les plus présents par périodes ?'
axis_labels = {'startYear': 'Période', 'primaryName': 'Acteurs'}
fig = px.bar(
    df_final2,
    x="startYear",
    y='count',
    color='primaryName',
    text='primaryName',
    title=chart_title,
    labels=axis_labels,
    width=1200,
    height=900,
)

# Larger bar labels and x tick labels rotated by 45 degrees.
fig.update_traces(textfont_size=14)
fig.update_xaxes(tickangle=45)

# Hide the legend and centre the title.
fig.update_layout(showlegend=False, title_x=0.5)

In [181]:

# Render df_final3 as a 1000x900 image/heatmap.
# NOTE(review): df_final3 is not assigned in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel unless it is created elsewhere —
# confirm where it is supposed to come from.
px.imshow(df_final3, width=1000, height=900)

In [171]:
import plotly.graph_objects as go

# Scratch example of a two-level (multi-category) x axis: the outer level is the
# rating label, the inner level the number paired with it.
outer_categories = ["BB+", "BB+", "BB+", "BB", "BB", "BB"]
inner_categories = [16, 17, 18, 16, 17, 18]
x = [outer_categories, inner_categories]

fig = go.Figure()
# Two bar traces sharing the same x positions.
for bar_heights in ([1, 2, 3, 4, 5, 6], [6, 5, 4, 3, 2, 1]):
    fig.add_bar(x=x, y=bar_heights)
# barmode="relative" stacks the two traces on top of each other.
fig.update_layout(barmode="relative")
fig.show()

In [97]:
#merge_basics_principals_test5 = merge_basics_principals_test4.pivot(columns="startYear")
#merge_basics_principals_test6 = merge_basics_principals_test5.apply(pd.Series.value_counts)
#merge_basics_principals_test7 = merge_basics_principals_test6.fillna(0)




In [39]:
#merge_basics_principals_test8 = merge_basics_principals_test7.transform(np.sort)
#merge_basics_principals_test8.tail(60)

                            primaryName                                      \
startYear                   1890 - 1900 1900 - 1910 1910 - 1920 1920 - 1930   
''Knife'' Sotelo                    0.0         0.0         0.0         0.0   
'Ace Primo' Niko Warren             0.0         0.0         0.0         0.0   
'Baby' Carmen De Rue                0.0         0.0         0.0         0.0   
'Big' Bill Wilson                   0.0         0.0         0.0         0.0   
'Big' LeRoy Mobley                  0.0         0.0         0.0         0.0   
...                                 ...         ...         ...         ...   
Þórhallur Þórhallsson               0.0         1.0        40.0        55.0   
Þórhildur Ýr Arnardóttir            0.0         1.0        40.0        58.0   
Þórir Waagfjörð                     1.0         1.0        40.0        58.0   
Þórunn Arna Kristjánsdóttir         1.0         1.0        41.0        63.0   
Þórður Aðalbjörnsson                1.0         1.0 

In [1]:
#quelques tests

#name_df6.loc[name_df6['primaryName'] == "Nikita"]

#name_df6.loc[name_df6['primaryName'] == "Brad Pitt"]

#principals_df.loc[(principals_df['nconst'] == "nm0000093") & (principals_df['category'] == "actor")]

#principals_df[principals_df['category'].str.contains('actor|actress')]
#principals_df['nconst'].value_counts()

#name_df.loc[name_df['nconst'] == "nm10120013"]

#principals_df2 = principals_df[principals_df['category'].str.contains('actor|actress')]

#principals_df2['nconst'].value_counts()

#principals_df.loc[principals_df['nconst'] == "nm0000093"]

# name_df6['numberOfKnownForTitles'] = name_df6['knownForTitles'].str.count("tt")
# name_df6.sort_values(by=['numberOfKnownForTitles'], ascending=False).head(15)

# name_df2 = name_df.dropna(axis=0)
# name_df3 = name_df2[name_df2['primaryProfession'] != '\\N']
# name_df4 = name_df3[name_df3['birthYear'] != '\\N']
# name_df5 = name_df4[name_df4['knownForTitles'] != '\\N']
# name_df6 = name_df5[name_df5['primaryProfession'].str.contains('actor|actress')]

#akas_df2 = akas_df[(akas_df['region'] == 'FR') & (akas_df['title'] == 'Star Wars : Épisode VII - Le Réveil de la Force')]
#akas_df2 = akas_df[akas_df['region'] == 'FR']
#akas_df2 = akas_df[(akas_df['title'] == 'Mr. & Mrs. Smith') & (akas_df['region'] == 'FR')]
#akas_df2 = akas_df[akas_df['titleId'] == 'tt0000001']
#akas_df2 = akas_df[akas_df['title'] == 'Mr. & Mrs. Smith']