In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from scipy import stats
import chart_studio.plotly as py
import plotly.express as px
import cufflinks as cf
import plotly.graph_objects as go
import seaborn as sns
import ipywidgets as widgets

In [71]:
name_df = pd.read_csv("https://datasets.imdbws.com/name.basics.tsv.gz", sep="\t")
name_df.head(1)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0050419,tt0072308"


In [6]:
akas_df = pd.read_csv("https://datasets.imdbws.com/title.akas.tsv.gz", sep="\t", dtype={"titleId": "string", "ordering": str, "title ": "string", "region ": "string", 
"language ": "string", "types ": str, "attributes ": str, "isOriginalTitle": str})
akas_df.head(1)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0


In [72]:
basics_df = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep="\t", low_memory=False)
basics_df.head(1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"


In [None]:
crew_df = pd.read_csv("https://datasets.imdbws.com/title.crew.tsv.gz", sep="\t", low_memory=False)
crew_df.head(1)

In [None]:
episode_df = pd.read_csv("https://datasets.imdbws.com/title.episode.tsv.gz", sep="\t", low_memory=False)
episode_df.head(1)

In [73]:
principals_df = pd.read_csv("https://datasets.imdbws.com/title.principals.tsv.gz", sep="\t")
principals_df.head(1)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"


In [9]:
ratings_df = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", sep="\t", low_memory=False)
ratings_df.head(1)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1831


# <span style="color:orange">Quels sont les pays qui produisent le plus de films ?</span> 

In [39]:
# Etape 1 : value_counts sur les régions dans la Database akas en prenant le top 15
movie_By_Region = akas_df["region"].value_counts().reset_index().head(15)
# Retrait des régions qui n'ont pas d'indication
movie_By_Region2 = movie_By_Region[movie_By_Region['index'] != '\\N']


NameError: name 'akas_df' is not defined

In [12]:
# Sauvegarde du dataframe
movie_By_Region2.to_csv(r'C:\Users\Berenger\Desktop\projet abc\pays_nombre_de_film.csv', index = False, header = True)

In [40]:
# Chargement du dataframe
movie_By_Region = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\pays_nombre_de_film.csv")
movie_By_Region.head()

Unnamed: 0,index,region
0,FR,3501512
1,JP,3496246
2,DE,3458776
3,ES,3426798
4,IN,3420547


In [5]:
# Visualisation
fig = px.bar(movie_By_Region, x="index", y ='region', color = 'index', text ='region',
    title = 'Quels sont les pays qui produisent le plus de films ?',
    labels = {'region': 'Nombre de films', 'index': 'Pays'},
    width=800, height=600)
    
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8)

fig.update_layout(showlegend=False, title_x=0.5, yaxis={'visible': False})

# <span style="color:orange">Quels sont les acteurs les plus présents ?</span> 

In [74]:
# Etape 1 : nettoyage de la Database
principals_df2 = principals_df[["tconst", "nconst", "category", "characters"]]
principals_df3 = principals_df2[principals_df2['category'].str.contains('actor|actress') & (principals_df2['characters'] != '\\N')]
principals_df4 = principals_df3[(principals_df3['characters'] != 'Narrator') & (principals_df3['characters'] != 'Various') & (principals_df3['characters'] != 'Additional Voices')]

In [75]:
# Nettoyage de la Database
basics_df2 = basics_df[["tconst", "titleType", "startYear", "runtimeMinutes", "genres"]]
# Etape 2 : set index puis merge
# Set index on tconst for basics and principal table
basics_df3 = basics_df2.set_index('tconst')
principals_df5 = principals_df4.set_index('tconst')
# Merge de la table principals et basics afin de filtrer uniquement sur les movies et les shorts
actors_occurence_df = pd.merge(basics_df3, principals_df5, how='inner', left_index=True, right_index=True)

In [79]:
actors_occurence_df["titleType"].value_counts()

tvEpisode       10494130
short            1562016
movie            1275537
tvSeries          659377
video             438800
tvMovie           302705
tvMiniSeries      143258
videoGame          43151
tvSpecial          22439
tvShort            16386
tvPilot                1
Name: titleType, dtype: int64

In [80]:
# Etape 3 : Nettoyage pour garder uniquement les titleType movie et short
actors_occurence_df2 = actors_occurence_df[(actors_occurence_df['titleType'] == 'movie') | (actors_occurence_df['titleType'] == 'short')]
#actors_occurence_df2 = actors_occurence_df[(actors_occurence_df['titleType'] == 'tvEpisode') | (actors_occurence_df['titleType'] == 'tvSeries') | (actors_occurence_df['titleType'] == 'tvMiniSeries')]

In [27]:
# Nettoyage de la Database
name_df2 = name_df[["nconst", "primaryName"]]
# Etape 4 : set index puis merge
# Set index on nconst for Database name and Database actors_occurence_df2 to prepare a merge
name_df3 = name_df2.set_index('nconst')
actors_occurence_df3 = actors_occurence_df2.set_index('nconst')
# Merge to link first name with nconst
actors_occurence_df4 = pd.merge(name_df3, actors_occurence_df3, how='inner', left_index=True, right_index=True)

In [28]:
# Etape 5 : Nettoyage pour retirer les oeuvres très courtes comme les cartoons de quelques minutes pour lesquelles les acteurs font seulement les voix
actors_occurence_df5 = actors_occurence_df4[actors_occurence_df4['runtimeMinutes'] != '\\N']
actors_occurence_df6 = actors_occurence_df5.astype({"runtimeMinutes": int})
actors_occurence_df7 = actors_occurence_df6[actors_occurence_df6['runtimeMinutes'] > 20]

In [119]:
# Etape 6 : Value_counts pour trouver les acteurs les plus présents
actors_occurence_df8 = actors_occurence_df7['primaryName'].value_counts()
actors_occurence_df9 = actors_occurence_df8.head(20)

In [123]:
# Etape 7 : transformation d'une série en dataframe puis reset de l'index
actors_occurence_df10 = pd.DataFrame(actors_occurence_df9)
actors_occurence_df10.reset_index(inplace=True)

In [125]:
# # Etape 8 : Sauvegarde du dataframe
actors_occurence_df10.to_csv(r'C:\Users\Berenger\Desktop\projet abc\presence_acteurs.csv', index = False, header = True)

In [3]:
# # Etape 9 : Chargement du dataframe
presence_acteur = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\presence_acteurs.csv")
presence_acteur.head()

Unnamed: 0,index,primaryName
0,Mohanlal,236
1,Eric Roberts,227
2,Mammootty,224
3,Cüneyt Arkin,219
4,Raymond Hatton,202


In [4]:
# étape 10 : visualisation
fig = px.bar(presence_acteur, x="primaryName", y ='index', color = 'index',
    title = 'Quels sont les acteurs les plus présents ?',
    labels = {'primaryName': 'Nombre de films', 'index': 'Acteurs'},
    width=800, height=600)

fig.update_layout(showlegend=False, title_x=0.5, yaxis={'visible': True})

# <span style="color:orange">Quels sont les acteurs les plus présents ? A quelle période ?</span> 

In [128]:
# étape 1 : Nettoyage de la base de données pour retirer les titres qui n'ont pas de startYear
actors_occurence_df8 = actors_occurence_df7[actors_occurence_df7['startYear'] != "\\N"]
# Garder les colonnes primaryName et startYear pour alléger le dataset
actors_occurence_df9 = actors_occurence_df8[["primaryName", "startYear"]]
# Reset de l'index
actors_occurence_df10 = actors_occurence_df9.reset_index(drop=True)

In [17]:
# étape 2 : création d'une fonction pour transformer une date en décennie
def find_decade(year):
    decade1 = (year // 10 * 10)
    decade2 = (year // 10 * 10) + 10
    return str(decade1) + " - " + str(decade2)

print(find_decade(1934))


1930 - 1940


In [130]:
# étape 3 : application de la fonction
actors_occurence_df11 = actors_occurence_df10.astype({"startYear": int})
actors_occurence_df11["startYear"] = actors_occurence_df11["startYear"].apply(find_decade)

In [131]:
# étape 4 : reset de l'index
actors_occurence_df11.reset_index(inplace=True)

In [132]:
# étape 5 : groupby sur la startYear et primaryName
df_grouped_notreset  = pd.DataFrame({'count' : actors_occurence_df11.groupby(['startYear', 'primaryName'] ).size()})
df_grouped_notreset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
1890 - 1900,Blanche Bayliss,1
1890 - 1900,Chauncey Depew,1
1890 - 1900,William Courtenay,1
1900 - 1910,Adelaide Fitz-Allen,1
1900 - 1910,Alexandre Arquillière,1


In [133]:
# étape 6 : placer les résultats du count du plus haut au plus bas
df_grouped_notreset_sorted=df_grouped_notreset.sort_values(['startYear', 'count'], ascending=False)
df_grouped_notreset_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
startYear,primaryName,Unnamed: 2_level_1
2020 - 2030,Eric Roberts,30
2020 - 2030,Ross K. Foad,24
2020 - 2030,Simon Hill,24
2020 - 2030,Cascade Nerida,15
2020 - 2030,Shawn C. Phillips,15


In [134]:
groupedDf = actors_occurence_df11.groupby(['startYear', 'primaryName'] ).size()
groupedDf.head()

startYear    primaryName          
1890 - 1900  Blanche Bayliss          1
             Chauncey Depew           1
             William Courtenay        1
1900 - 1910  Adelaide Fitz-Allen      1
             Alexandre Arquillière    1
dtype: int64

In [135]:
# étape 7 : Récupérer uniquement les 5 plus hautes valeur de la colonne count
df_final  = pd.DataFrame({'count' : groupedDf.groupby(level='startYear').nlargest(5).reset_index(level=0, drop=True)})

In [136]:
df_final.reset_index(inplace=True)
df_final["test"] = df_final.index

In [137]:
df_final2 = df_final.tail(60)

In [6]:
acteur_par_periode['rank'] = acteur_par_periode.groupby('startYear')['count'].rank(method = 'first')
acteur_par_periode.style.background_gradient(subset=pd.IndexSlice[:, ['rank']])
acteur_par_periode.head(5)

Unnamed: 0,startYear,primaryName,count,test,rank
0,1910 - 1920,Theodore Roberts,46,8,5.0
1,1910 - 1920,Lon Chaney,41,9,4.0
2,1910 - 1920,Charles Ray,40,10,1.0
3,1910 - 1920,Wallace Reid,40,11,2.0
4,1910 - 1920,William S. Hart,40,12,3.0


In [7]:
# # Etape 8 : Sauvegarde du dataframe
acteur_par_periode.to_csv(r'C:\Users\Berenger\Desktop\projet abc\acteur_par_periode.csv', index = False, header = True)

In [8]:
# # Etape 9 : Chargement du dataframe
acteur_par_periode = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\acteur_par_periode.csv")
acteur_par_periode.head()

Unnamed: 0,startYear,primaryName,count,test,rank
0,1910 - 1920,Theodore Roberts,46,8,5.0
1,1910 - 1920,Lon Chaney,41,9,4.0
2,1910 - 1920,Charles Ray,40,10,1.0
3,1910 - 1920,Wallace Reid,40,11,2.0
4,1910 - 1920,William S. Hart,40,12,3.0


In [6]:
fig = px.bar(acteur_par_periode, x = 'count', y="rank", text ='primaryName', color = 'primaryName',
    title = 'Quels sont les acteurs les plus présents par périodes ?',
    labels = {'startYear': 'Période', 'primaryName': 'Acteurs'},
    orientation='h',
    animation_frame="startYear",
    range_x=[0,150],
    range_y=[0,6],
    width=1300, height=800)
 
fig.update_traces(textfont_size=12, textposition='outside')
fig.update_layout()
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.update_layout(showlegend=False, title_x=0.5)

In [7]:
# étape 8 : Visualisation
fig = px.bar(acteur_par_periode, x="startYear", y ='count', color = 'primaryName', text ='primaryName',
    title = 'Quels sont les acteurs les plus présents par périodes ?',
    labels = {'startYear': 'Période', 'primaryName': 'Acteurs'},
    width=1300, height=800)
 
fig.update_traces(textfont_size=12, textposition='outside')
fig.update_layout()

fig.update_layout(showlegend=False, title_x=0.5)

In [8]:
fig = px.bar(acteur_par_periode, x="primaryName", y ='count', color = 'startYear', text ='startYear',
    title = 'Quels sont les acteurs les plus présents par périodes ?',
    labels = {'startYear': 'Période', 'primaryName': 'Acteurs'},
    width=1200, height=900)
 
fig.update_traces(textfont_size=14)
fig.update_xaxes(tickangle=45)

fig.update_layout(showlegend=False, title_x=0.5)

In [91]:




def change_df(decade):
    """
    Print the current widget value in short sentence
    """

    #print(f'the choosen df is {decade}')
    if decade == "1920 - 1930":

    else:
        print ("not lol")

widgets.interact(change_df, decade=["1920 - 1930", "1930 - 1940", "1940 - 1950"])

interactive(children=(Dropdown(description='decade', options=('1920 - 1930', '1930 - 1940', '1940 - 1950'), va…

<function __main__.change_df(decade)>

In [50]:
def change_df(decade):
    """
    Print the current widget value in short sentence
    """
    print(f'the choosen df is {decade}')

widgets.interact(change_df, decade=["1920 - 1930", "1930 - 1940", "1940 - 1950"])


interactive(children=(Dropdown(description='decade', options=('1920 - 1930', '1930 - 1940', '1940 - 1950'), va…

<function __main__.change_df(decade)>

In [9]:
fig = px.bar(acteur_par_periode, x="startYear", y ='count', color = 'primaryName', text ='primaryName',
    title = 'Quels sont les acteurs les plus présents par périodes ?',
    labels = {'startYear': 'Période', 'primaryName': 'Acteurs'},
    width=1200, height=900)
 
fig.update_traces(textfont_size=14)
fig.update_xaxes(tickangle=45)

fig.update_layout(showlegend=False, title_x=0.5)

In [58]:
def make_boxes():
    vbox1 = widgets.VBox([widgets.Label('Left'), b1, b2])
    vbox2 = widgets.VBox([widgets.Label('Right'), dropdown, radiobuttons])
    return vbox1, vbox2
 
vbox1, vbox2 = make_boxes()
 
widgets.HBox([vbox1, vbox2])



HBox(children=(VBox(children=(Label(value='Left'), Button(description='1910 - 1920', style=ButtonStyle()), But…

In [95]:


b1 = widgets.Button(description='1910 - 1920')
b2 = widgets.Button(description='1920 - 1930')
b3 = widgets.Button(description='1930 - 1940')
b4 = widgets.Button(description='1930 - 1940')
b5 = widgets.Button(description='1940 - 1950')
b6 = widgets.Button(description='1950 - 1960')
b7 = widgets.Button(description='1960 - 1970')
b8 = widgets.Button(description='1970 - 1980')
b9 = widgets.Button(description='1980 - 1990')
b10 = widgets.Button(description='1990 - 2000')
b11 = widgets.Button(description='2000 - 2010')
b12 = widgets.Button(description='2010 - 2020')
b13 = widgets.Button(description='2020 - 2030')

def make_boxes():
    vbox1 = widgets.VBox([b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13])
    return vbox1
 
vbox1 = make_boxes()
 
widgets.HBox([vbox1])







HBox(children=(VBox(children=(Button(description='1910 - 1920', style=ButtonStyle()), Button(description='1920…

In [51]:
def change_df(decade):
    """
    Print the current widget value in short sentence
    """
    print(f'the choosen df is {decade}')



interactive(children=(Dropdown(description='decade', options=('1920 - 1930', '1930 - 1940', '1940 - 1950'), va…

<function __main__.change_df(decade)>

# <span style="color:orange">La durée moyenne des films s'allonge ou se raccourcit avec les années ?</span> 

In [12]:
basics_df.head(1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"


In [13]:
basics_df_average1 = basics_df[["titleType", "startYear", "runtimeMinutes"]]
basics_df_average2 = basics_df_average1.loc[(basics_df_average1['titleType'] == "movie") & (basics_df_average1['runtimeMinutes'] != "\\N") & (basics_df_average1['startYear'] != "\\N")]
basics_df_average3 = basics_df_average2[["startYear", "runtimeMinutes"]]

In [14]:
basics_df_average3.head(10)

Unnamed: 0,startYear,runtimeMinutes
498,1905,100
570,1906,70
587,1907,90
672,1908,120
1172,1910,58
1246,1910,45
1273,1909,50
1485,1911,51
1578,1911,52
1616,1911,45


In [15]:
basics_df_average4 = basics_df_average3.astype({"startYear": int})
basics_df_average5 = basics_df_average4.astype({"runtimeMinutes": int})

In [19]:
# étape 2 : création d'une fonction pour transformer une date en décennie
def find_decade2(year):
    decade1 = (year // 10 * 10)
    decade2 = (year // 10 * 10) + 10
    return str(decade1) + " - " + str(decade2)

print(find_decade(1934))

1930 - 1940


In [20]:
basics_df_average5["startYear"] = basics_df_average5["startYear"].apply(find_decade2)

In [41]:
basics_df_average5.tail(20)

Unnamed: 0,startYear,runtimeMinutes
8359700,2010 - 2020,74
8359715,2020 - 2030,96
8359937,2010 - 2020,70
8360129,2010 - 2020,97
8360165,1990 - 2000,96
8360218,1990 - 2000,250
8360248,1990 - 2000,108
8360251,2010 - 2020,94
8360265,2010 - 2020,72
8360266,1990 - 2000,45


In [21]:
#basics_df_average6  = pd.DataFrame({'count' : basics_df_average5.groupby(['startYear', 'runtimeMinutes'] ).size()})
#basics_df_average6 = basics_df_average5.groupby(['startYear', 'runtimeMinutes']).mean()
basics_df_average6 = basics_df_average5.groupby(['startYear']).mean()

In [22]:
basics_df_average6.reset_index(inplace=True)

In [23]:
basics_df_average7 = basics_df_average6.astype({"runtimeMinutes": int})
basics_df_average7

Unnamed: 0,startYear,runtimeMinutes
0,1890 - 1900,98
1,1900 - 1910,69
2,1910 - 1920,64
3,1920 - 1930,72
4,1930 - 1940,81
5,1940 - 1950,86
6,1950 - 1960,91
7,1960 - 1970,93
8,1970 - 1980,90
9,1980 - 1990,91


In [26]:
# # Etape 8 : Sauvegarde du dataframe
basics_df_average7.to_csv(r'C:\Users\Berenger\Desktop\projet abc\duree_film.csv', index = False, header = True)

In [51]:
# # Etape 9 : Chargement du dataframe
duree_film = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\duree_film.csv")
duree_film.head()

Unnamed: 0,startYear,runtimeMinutes
0,1890 - 1900,98
1,1900 - 1910,69
2,1910 - 1920,64
3,1920 - 1930,72
4,1930 - 1940,81


In [52]:
fig = px.line(duree_film, x="startYear", y="runtimeMinutes", 
title="La durée moyenne des films s'allonge ou se raccourcit avec les années ?",
labels = {'startYear': 'Périodes', 'runtimeMinutes': 'Minutes'},
markers=True) 
fig.update_xaxes(tickangle=20)

fig.update_layout(showlegend=False, title_x=0.5)

# <span style="color:orange">Les acteurs de série sont-ils les mêmes qu’au cinéma ?</span> 

In [None]:
# # Etape 9 : Chargement du dataframe
acteurs_occurences = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\acteurs_occurences.csv")

In [84]:
# Nettoyage de la Database
name_df2 = name_df[["nconst", "primaryName"]]
# Etape 4 : set index puis merge
# Set index on nconst for Database name and Database actors_occurence_df2 to prepare a merge
name_df3 = name_df2.set_index('nconst')
actors_occurence_df3 = actors_occurence_df2.set_index('nconst')
# Merge to link first name with nconst
actors_occurence_df4 = pd.merge(name_df3, actors_occurence_df3, how='inner', left_index=True, right_index=True)

In [89]:
# Etape 5 : Nettoyage pour retirer les oeuvres très courtes comme les cartoons de quelques minutes pour lesquelles les acteurs font seulement les voix
actors_occurence_df5 = actors_occurence_df4[actors_occurence_df4['runtimeMinutes'] != '\\N']

In [90]:
actors_occurence_df5

Unnamed: 0_level_0,primaryName,titleType,startYear,runtimeMinutes,genres,category,characters
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
nm0000001,Fred Astaire,tvSeries,1968,51,"Action,Adventure,Crime",actor,"[""Alistair Mundy"",""The Panther""]"
nm0000001,Fred Astaire,tvEpisode,1962,60,Drama,actor,"[""Ted Miller""]"
nm0000001,Fred Astaire,tvEpisode,1962,60,Drama,actor,"[""Ivor St. George""]"
nm0000001,Fred Astaire,tvEpisode,1962,60,Drama,actor,"[""Andrew E. Whitbeck""]"
nm0000001,Fred Astaire,tvEpisode,1964,60,"Adventure,Comedy,Crime",actor,"[""Fred Addams""]"
...,...,...,...,...,...,...,...
nm9993408,Mark Case,tvEpisode,1976,25,"Drama,Family",actor,"[""Peter""]"
nm9993409,Tamsin Mitchell,tvEpisode,1976,25,"Drama,Family",actress,"[""Wendy Carveth""]"
nm9993447,Ruxandra Colhon,tvSeries,2013,30,Talk-Show,actor,"[""Host""]"
nm9993636,Adam French,tvSeries,\N,60,Drama,actor,"[""Danny Boy""]"


In [91]:
# Etape 6 : Value_counts pour trouver les acteurs les plus présents
actors_occurence_df8 = actors_occurence_df5['primaryName'].value_counts()
actors_occurence_df9 = actors_occurence_df8.head(20)

In [92]:
# Etape 7 : transformation d'une série en dataframe puis reset de l'index
actors_occurence_df10 = pd.DataFrame(actors_occurence_df9)
actors_occurence_df10.reset_index(inplace=True)

In [94]:
# Etape 8 : Sauvegarde du dataframe
actors_occurence_df10.to_csv(r'C:\Users\Berenger\Desktop\projet abc\presence_acteurs_series.csv', index = False, header = True)

In [96]:
# Etape 9 : Chargement du dataframe
presence_acteurs_series = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\presence_acteurs_series.csv")
presence_acteurs_series.head(20)

Unnamed: 0,index,primaryName
0,Ray Meagher,3373
1,Jacqueline Andere,2241
2,Héctor Bonilla,2055
3,Kaneta Kimotsuki,1946
4,Lynne McGranger,1921
5,Edith González,1901
6,César Évora,1895
7,Arturo Peniche,1870
8,Noriko Ohara,1845
9,Nobuyo Ôyama,1807


In [107]:
presence_acteurs_series.rename(columns={'index': 'acteurSeries', 'primaryName': 'nombreApparition'}, inplace=True)
presence_acteurs_series.head(20)

Unnamed: 0,acteurSeries,apparitionSeries
0,Ray Meagher,3373
1,Jacqueline Andere,2241
2,Héctor Bonilla,2055
3,Kaneta Kimotsuki,1946
4,Lynne McGranger,1921
5,Edith González,1901
6,César Évora,1895
7,Arturo Peniche,1870
8,Noriko Ohara,1845
9,Nobuyo Ôyama,1807


In [108]:
presence_acteur.rename(columns={'acteurSeries': 'acteurFilms', 'apparitionSeries': 'nombreApparition'}, inplace=True)
presence_acteur.head(20)

Unnamed: 0,acteurFilms,nombreApparition
0,Mohanlal,236
1,Eric Roberts,227
2,Mammootty,224
3,Cüneyt Arkin,219
4,Raymond Hatton,202
5,Mithun Chakraborty,178
6,Seiji Nakamitsu,159
7,Amitabh Bachchan,158
8,Charles Starrett,157
9,Dharmendra,150


In [111]:
presence_acteur["acteurSeries"] = presence_acteurs_series["acteurSeries"]
presence_acteur["apparitionSeries"] = presence_acteurs_series["apparitionSeries"]
presence_acteur_film_series = presence_acteur

In [None]:
presence_acteur_film_series

In [112]:
# Etape 8 : Sauvegarde du dataframe
presence_acteur_film_series.to_csv(r'C:\Users\Berenger\Desktop\projet abc\presence_acteur_film_series.csv', index = False, header = True)

In [113]:
presence_acteur_film_series = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\presence_acteur_film_series.csv")
presence_acteur_film_series.head(20)

Unnamed: 0,acteurFilms,nombreApparition,acteurSeries,apparitionSeries
0,Mohanlal,236,Ray Meagher,3373
1,Eric Roberts,227,Jacqueline Andere,2241
2,Mammootty,224,Héctor Bonilla,2055
3,Cüneyt Arkin,219,Kaneta Kimotsuki,1946
4,Raymond Hatton,202,Lynne McGranger,1921
5,Mithun Chakraborty,178,Edith González,1901
6,Seiji Nakamitsu,159,César Évora,1895
7,Amitabh Bachchan,158,Arturo Peniche,1870
8,Charles Starrett,157,Noriko Ohara,1845
9,Dharmendra,150,Nobuyo Ôyama,1807


# <span style="color:orange">Les acteurs ont en moyenne quel âge ?</span> 

In [None]:
# étape 1 : nettoyage de la Database
principals_df2 = principals_df[["tconst", "nconst", "category", "characters"]]
principals_df3 = principals_df2[principals_df2['category'].str.contains('actor|actress') & (principals_df2['characters'] != '\\N')]
principals_df4 = principals_df3[(principals_df3['characters'] != 'Narrator') & (principals_df3['characters'] != 'Various') & (principals_df3['characters'] != 'Additional Voices')]

basics_df2 = basics_df[["tconst", "titleType", "startYear", "runtimeMinutes", "genres"]]
# étape 2 : set index puis merge
# set index on tconst for basics and principal table
basics_df3 = basics_df2.set_index('tconst')
principals_df5 = principals_df4.set_index('tconst')
# Merge de la table principals et basics afin de filtrer uniquement sur les movies et les shorts
actors_occurence_df = pd.merge(basics_df3, principals_df5, how='inner', left_index=True, right_index=True)

actors_occurence_df2 = actors_occurence_df[(actors_occurence_df['titleType'] == 'movie') | (actors_occurence_df['titleType'] == 'short')]
name_df2 = name_df[["nconst", "primaryName", "birthYear"]]
name_df3 = name_df2.set_index('nconst')
actors_occurence_df3 = actors_occurence_df2.set_index('nconst')
# merge to link first name with nconst
actors_occurence_df4 = pd.merge(name_df3, actors_occurence_df3, how='inner', left_index=True, right_index=True)

actors_occurence_df5 = actors_occurence_df4[["birthYear", "startYear", "category"]]
actors_occurence_df6 = actors_occurence_df5.loc[(actors_occurence_df5['birthYear'] != "\\N") & (actors_occurence_df5['startYear'] != "\\N")]
actors_occurence_df6

In [None]:
#sauvegarde du dataframe
actors_occurence_df6.to_csv(r'C:\Users\Berenger\Desktop\projet abc\moyenne_age_acteurs.csv', index = False, header = True)

In [4]:
#chargement du dataframe
moyenne_age = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\moyenne_age_acteurs.csv")

In [6]:
moyenne_age["age"] = moyenne_age["startYear"] - moyenne_age["birthYear"]
moyenne_age["age"].mean()

37.957760724284036

# <span style="color:orange">Moyenne d'âge des acteurs/actrices, par sexe</span> 

In [7]:
moyenne_age_femme = moyenne_age.loc[moyenne_age['category'] == "actress"]
moyenne_age_femme["age"].mean()



33.51685233480365

In [8]:
moyenne_age_homme = moyenne_age.loc[moyenne_age['category'] == "actor"]
moyenne_age_homme["age"].mean()

40.4562468111571

# <span style="color:orange">Quels sont les films les mieux notés ?</span> 

In [114]:
best_ratings = ratings_df.loc[ratings_df['numVotes'] >= 10000]
best_ratings

Unnamed: 0,tconst,averageRating,numVotes
11,tt0000012,7.4,11305
300,tt0000417,8.2,46833
310,tt0000439,7.3,18367
1598,tt0004972,6.2,23952
1991,tt0006864,7.7,15014
...,...,...,...
1198418,tt9898836,9.9,35027
1198420,tt9898858,5.1,12867
1198480,tt9900092,7.3,11721
1198526,tt9900782,8.5,17804


In [115]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [116]:
best_ratings_basics = basics_df[["tconst", "titleType", "primaryTitle", "startYear", "genres"]]
best_ratings_basics

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres
0,tt0000001,short,Carmencita,1894,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,1892,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,1892,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short"
...,...,...,...,...,...
8364110,tt9916848,tvEpisode,Episode #3.17,2010,"Action,Drama,Family"
8364111,tt9916850,tvEpisode,Episode #3.19,2010,"Action,Drama,Family"
8364112,tt9916852,tvEpisode,Episode #3.20,2010,"Action,Drama,Family"
8364113,tt9916856,short,The Wind,2015,Short


In [117]:
best_ratings_basics2 = best_ratings_basics.set_index('tconst')
best_ratings2 = best_ratings.set_index('tconst')

In [118]:
# 
best_ratings3 = pd.merge(best_ratings_basics2, best_ratings2, how='inner', left_index=True, right_index=True)

In [119]:
best_ratings4 = best_ratings3.loc[best_ratings3['titleType'] == "movie"]
best_ratings4

Unnamed: 0_level_0,titleType,primaryTitle,startYear,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0004972,movie,The Birth of a Nation,1915,"Drama,History,War",6.2,23952
tt0006864,movie,Intolerance,1916,"Drama,History",7.7,15014
tt0009968,movie,Broken Blossoms,1919,"Drama,Romance",7.3,10010
tt0010323,movie,The Cabinet of Dr. Caligari,1920,"Fantasy,Horror,Mystery",8.1,60787
tt0012349,movie,The Kid,1921,"Comedy,Drama,Family",8.3,120758
...,...,...,...,...,...,...
tt9860728,movie,Falling Inn Love,2019,"Comedy,Romance",5.6,18062
tt9866072,movie,Holidate,2020,"Comedy,Romance",6.1,53507
tt9893250,movie,I Care a Lot,2020,"Comedy,Crime,Thriller",6.3,116546
tt9898858,movie,Coffee & Kareem,2020,"Action,Comedy,Crime",5.1,12867


In [120]:
best_ratings5 = best_ratings4.sort_values(by=['averageRating'], ascending=False).head(20)

In [121]:
best_ratings5

Unnamed: 0_level_0,titleType,primaryTitle,startYear,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0252487,movie,The Chaos Class,1975,"Comedy,Drama",9.3,39196
tt0111161,movie,The Shawshank Redemption,1994,Drama,9.3,2479369
tt0068646,movie,The Godfather,1972,"Crime,Drama",9.2,1712575
tt10280296,movie,Sardar Udham,2021,"Biography,Crime,Drama",9.1,21150
tt5354160,movie,Mirror Game,2016,"Crime,Mystery,Thriller",9.1,25135
tt10189514,movie,Soorarai Pottru,2020,Drama,9.1,90750
tt2592910,movie,CM101MMXI Fundamentals,2013,"Comedy,Documentary",9.1,44771
tt0468569,movie,The Dark Knight,2008,"Action,Crime,Drama",9.0,2433653
tt0253828,movie,Tosun Pasa,1976,"Comedy,History",9.0,22612
tt0050083,movie,12 Angry Men,1957,"Crime,Drama",9.0,733855


In [122]:
#sauvegarde du dataframe
best_ratings5.to_csv(r'C:\Users\Berenger\Desktop\projet abc\films_meilleures_notes.csv', index = True, header = True)

In [159]:
#chargement du dataframe
films_meilleures_notes = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\films_meilleures_notes.csv")

In [54]:
fig = px.bar(films_meilleures_notes.head(20), x="averageRating", y ='primaryTitle', color = "primaryTitle", text ='averageRating',
    title = 'Quels sont les films les mieux notés ?',
    labels = {'averageRating': '', 'primaryTitle': ''},
    width=1000, height=700)
 
fig.update_traces(textfont_size=11)

fig.update_layout(showlegend=False, title_x=0.5)
fig.show()

# <span style="color:orange">Partagent-ils des caractéristiques communes ?</span> 

In [10]:
films_meilleures_notes = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\films_meilleures_notes.csv")
genres_df = films_meilleures_notes[['genres']]
new = genres_df["genres"].str.split(",", n = 1, expand = True)
new2 = new[1].str.split(",", n = 1, expand = True)
genres_df["genre1"] = new[0]
genres_df["genre2"] = new2[0]
genres_df["genre3"] = new2[1]
genres_df.head()

Unnamed: 0,genres,genre1,genre2,genre3
0,"Comedy,Drama",Comedy,Drama,
1,Drama,Drama,,
2,"Crime,Drama",Crime,Drama,
3,"Biography,Crime,Drama",Biography,Crime,Drama
4,"Crime,Mystery,Thriller",Crime,Mystery,Thriller


In [11]:
new_df = pd.concat([genres_df["genre1"], genres_df["genre2"], genres_df["genre3"]])
new_df

0          Comedy
1           Drama
2           Crime
3       Biography
4           Crime
5           Drama
6          Comedy
7          Action
8          Comedy
9           Crime
10          Crime
11      Biography
12         Comedy
13         Action
14         Comedy
15          Crime
16         Comedy
17      Biography
18         Comedy
19      Biography
0           Drama
1            None
2           Drama
3           Crime
4         Mystery
5            None
6     Documentary
7           Crime
8         History
9           Drama
10          Drama
11    Documentary
12          Drama
13      Adventure
14         Family
15          Drama
16           None
17          Crime
18          Drama
19          Drama
0            None
1            None
2            None
3           Drama
4        Thriller
5            None
6            None
7           Drama
8            None
9            None
10           None
11           None
12           None
13          Drama
14        Mystery
15        

In [12]:
new_df2 = new_df.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
new_df2

Drama          29.5%
Crime          18.2%
Comedy         15.9%
Biography       9.1%
Documentary     6.8%
Action          4.5%
Mystery         4.5%
History         4.5%
Adventure       2.3%
Family          2.3%
Thriller        2.3%
dtype: object

In [15]:
new_df3 = pd.DataFrame(new_df2)
new_df3.reset_index(inplace=True)

Unnamed: 0,index,0
0,Drama,29.5%
1,Crime,18.2%
2,Comedy,15.9%
3,Biography,9.1%
4,Documentary,6.8%
5,Action,4.5%
6,Mystery,4.5%
7,History,4.5%
8,Adventure,2.3%
9,Family,2.3%


In [35]:
new_df3.rename(columns = {0 : "genre_percent", "index" : "genre"}, inplace=True)


In [37]:
#sauvegarde du dataframe
new_df3.to_csv(r'C:\Users\Berenger\Desktop\projet abc\share_carac.csv', index = True, header = True)

In [None]:
# # Etape 9 : Chargement du dataframe
new_df3 = pd.read_csv(r"C:\Users\Berenger\Desktop\projet abc\share_carac.csv")
new_df3.head()

In [36]:
fig = px.bar(new_df3, x="genre", y="genre_percent", color = "genre_percent", text ='genre_percent',
    title = 'Partagent-ils des caractéristiques communes ?',
    labels = {'genre': '', 'genre_percent': ''},
    width=1000, height=700)



fig.update_traces(textfont_size=14)


fig.update_layout(showlegend=False, title_x=0.5, yaxis={'visible': False})
fig.show()



    

In [69]:
films_meilleures_notes["startYear"].median()

1993.5

In [97]:
#merge_basics_principals_test5 = merge_basics_principals_test4.pivot(columns="startYear")
#merge_basics_principals_test6 = merge_basics_principals_test5.apply(pd.Series.value_counts)
#merge_basics_principals_test7 = merge_basics_principals_test6.fillna(0)

In [None]:
#merge_basics_principals_test8 = merge_basics_principals_test7.transform(np.sort)
#merge_basics_principals_test8.tail(60)

In [87]:
#quelques tests

#name_df6.loc[name_df6['primaryName'] == "Nikita"]

#name_df6.loc[name_df6['primaryName'] == "Brad Pitt"]

#principals_df.loc[(principals_df['nconst'] == "nm0000093") & (principals_df['category'] == "actor")]

#principals_df[principals_df['category'].str.contains('actor|actress')]
#principals_df['nconst'].value_counts()

#name_df.loc[name_df['nconst'] == "nm10120013"]

#principals_df2 = principals_df[principals_df['category'].str.contains('actor|actress')]

#principals_df2['nconst'].value_counts()

#principals_df.loc[principals_df['nconst'] == "nm0000093"]

# name_df6['numberOfKnownForTitles'] = name_df6['knownForTitles'].str.count("tt")
# name_df6.sort_values(by=['numberOfKnownForTitles'], ascending=False).head(15)

# name_df2 = name_df.dropna(axis=0)
# name_df3 = name_df2[name_df2['primaryProfession'] != '\\N']
# name_df4 = name_df3[name_df3['birthYear'] != '\\N']
# name_df5 = name_df4[name_df4['knownForTitles'] != '\\N']
# name_df6 = name_df5[name_df5['primaryProfession'].str.contains('actor|actress')]

#akas_df2 = akas_df[(akas_df['region'] == 'FR') & (akas_df['title'] == 'Star Wars : Épisode VII - Le Réveil de la Force')]
#akas_df2 = akas_df[akas_df['region'] == 'FR']
#akas_df2 = akas_df[(akas_df['title'] == 'Mr. & Mrs. Smith') & (akas_df['region'] == 'FR')]
#akas_df2 = akas_df[akas_df['titleId'] == 'tt0000001']
#akas_df2 = akas_df[akas_df['title'] == 'Mr. & Mrs. Smith']

#actors_occurence_df6.to_csv(r'C:\Users\Berenger\Desktop\projet abc\moyenne_age_acteurs.csv', index = False, header = True)