In [2]:
import pandas as pd
import gdown
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# VERIFICATION DU FICHIER TMDB

id_drive = '1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v'
output_tmdb = 'tmdb_final.csv'
url_drive = f'https://drive.google.com/uc?id={id_drive}'

if not os.path.exists(output_tmdb):
    print("téléchargement")
    gdown.download(url_drive, output_tmdb, quiet=False)
else:
    print("Fichier présent")

print("Chargement TMDB...")

df_tmdb = pd.read_csv(output_tmdb)

# Nettoyage JSON

def clean_json(x):
    try:
        if pd.isna(x): return np.nan
        data = json.loads(x.replace("'", '"'))
        return ", ".join([i['name'] for i in data])
    except:
        return np.nan

if 'production_companies' in df_tmdb.columns:
    df_tmdb['companies_clean'] = df_tmdb['production_companies'].apply(clean_json)

# Suppression des colonnes inutiles

cols_drop = ['homepage', 'video', 'backdrop_path', 'status', 'production_companies', 'production_countries']
df_tmdb = df_tmdb.drop(columns=[c for c in cols_drop if c in df_tmdb.columns])


# IMDb BASICS
print("IMDb Basics : Sélection (>= 1960)...")
url_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
chunks_basics = []

if 'imdb_id' in df_tmdb.columns:
    ids_tmdb = set(df_tmdb['imdb_id'].dropna())
else:
    ids_tmdb = set()

with pd.read_csv(url_basics, sep='\t', compression='gzip', 
                 usecols=['tconst', 'titleType', 'startYear', 'isAdult', 'primaryTitle'], 
                 chunksize=500000) as reader:
    for chunk in reader:
        chunk['startYear'] = pd.to_numeric(chunk['startYear'], errors='coerce')
        chunk['isAdult'] = pd.to_numeric(chunk['isAdult'], errors='coerce').fillna(0)
        mask = ((chunk['titleType'] == 'movie') & (chunk['isAdult'] == 0) & (chunk['startYear'] >= 1960) & (chunk['tconst'].isin(ids_tmdb)))
        
        res = chunk[mask]
        if not res.empty:
            chunks_basics.append(res[['tconst', 'primaryTitle', 'startYear']])

df_basics = pd.concat(chunks_basics)
print(f"Films retenus (1960-2025) : {len(df_basics)}")

# IMDb DIRECTORS

url_principals = "https://datasets.imdbws.com/title.principals.tsv.gz"
chunks_directors = []
ids_films_finaux = set(df_basics['tconst'])

with pd.read_csv(url_principals, sep='\t', compression='gzip', 
                 usecols=['tconst', 'nconst', 'category'], chunksize=500000) as reader:
    for chunk in reader:
        mask = (chunk['category'] == 'director') & (chunk['tconst'].isin(ids_films_finaux))
        if not chunk[mask].empty:
            chunks_directors.append(chunk[mask][['tconst', 'nconst']])

if chunks_directors:
    df_directors = pd.concat(chunks_directors).drop_duplicates(subset='tconst')
    df_basics = pd.merge(df_basics, df_directors, on='tconst', how='left')
print("IMDb Directors...")

# IMDb AKAS (Régions)

url_akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
chunks_akas = []
print("IMDb Akas (Régions)...")
with pd.read_csv(url_akas, sep='\t', compression='gzip', 
                 usecols=['titleId', 'region', 'language'], chunksize=500000) as reader:
    for chunk in reader:
        mask = chunk['titleId'].isin(ids_films_finaux)
        if not chunk[mask].empty:
            chunks_akas.append(chunk[mask].dropna(subset=['region']))

if chunks_akas:
    df_akas = pd.concat(chunks_akas).drop_duplicates(subset='titleId')
    df_basics = pd.merge(df_basics, df_akas, left_on='tconst', right_on='titleId', how='left')

# FUSION FINALE

print("FUSION FINALE...")
df_final = pd.merge(df_basics, df_tmdb, left_on='tconst', right_on='imdb_id', how='inner')
df_final = df_final.drop(columns=['titleId', 'imdb_id'])

print(f"résultat final : {len(df_final)} films (1960+).")
display(df_final.head())

# EXPORT
df_final.to_csv("Dataset_1960_Plus.csv", index=False)



Fichier présent
Chargement TMDB...


  df_tmdb = pd.read_csv(output_tmdb)


IMDb Basics : Sélection (>= 1960)...
Films retenus (1960-2025) : 175348
IMDb Directors...
IMDb Akas (Régions)...
FUSION FINALE...
résultat final : 175348 films (1960+).


Unnamed: 0,tconst,primaryTitle,startYear,nconst,region,language,adult,budget,genres,id,...,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,production_companies_name,production_companies_country
0,tt0011801,Tötet nicht mehr,2019.0,nm0681726,\N,\N,False,0,"['Crime', 'Drama']",611205,...,1919-01-01,0,127,['de'],,Misericordia,0.0,0,['Rex-Film GmbH'],['']
1,tt0015724,Dama de noche,1993.0,nm0529960,\N,\N,False,0,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,...,1993-05-18,0,96,[],,Dama de Noche,7.0,6,['Centro de Capacitación Cinematográfica (CCC)'],
2,tt0035423,Kate & Leopold,2001.0,nm0003506,\N,\N,False,48000000,"['Romance', 'Fantasy', 'Comedy']",11232,...,2001-12-25,76019048,118,"['en', 'fr', 'it']","If they lived in the same century, they'd be p...",Kate & Leopold,6.326,1187,"['Konrad Pictures', 'Miramax']","['', 'US']"
3,tt0036606,"Another Time, Another Place",1983.0,nm0705535,\N,\N,False,0,['Drama'],73069,...,1983-05-13,0,118,"['en', 'it']",,"Another Time, Another Place",4.7,6,"['Umbrella', 'Associated-Rediffusion Televisio...",
4,tt0038687,Let There Be Light,1980.0,,\N,\N,False,0,"['Documentary', 'War']",86990,...,1946-12-16,0,58,['en'],,Let There Be Light,7.4,35,['U.S. Army Pictorial Services'],


In [3]:
df_final.columns.tolist()

['tconst',
 'primaryTitle',
 'startYear',
 'nconst',
 'region',
 'language',
 'adult',
 'budget',
 'genres',
 'id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'production_companies_name',
 'production_companies_country']

In [4]:
df_final.head()

Unnamed: 0,tconst,primaryTitle,startYear,nconst,region,language,adult,budget,genres,id,...,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,production_companies_name,production_companies_country
0,tt0011801,Tötet nicht mehr,2019.0,nm0681726,\N,\N,False,0,"['Crime', 'Drama']",611205,...,1919-01-01,0,127,['de'],,Misericordia,0.0,0,['Rex-Film GmbH'],['']
1,tt0015724,Dama de noche,1993.0,nm0529960,\N,\N,False,0,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,...,1993-05-18,0,96,[],,Dama de Noche,7.0,6,['Centro de Capacitación Cinematográfica (CCC)'],
2,tt0035423,Kate & Leopold,2001.0,nm0003506,\N,\N,False,48000000,"['Romance', 'Fantasy', 'Comedy']",11232,...,2001-12-25,76019048,118,"['en', 'fr', 'it']","If they lived in the same century, they'd be p...",Kate & Leopold,6.326,1187,"['Konrad Pictures', 'Miramax']","['', 'US']"
3,tt0036606,"Another Time, Another Place",1983.0,nm0705535,\N,\N,False,0,['Drama'],73069,...,1983-05-13,0,118,"['en', 'it']",,"Another Time, Another Place",4.7,6,"['Umbrella', 'Associated-Rediffusion Televisio...",
4,tt0038687,Let There Be Light,1980.0,,\N,\N,False,0,"['Documentary', 'War']",86990,...,1946-12-16,0,58,['en'],,Let There Be Light,7.4,35,['U.S. Army Pictorial Services'],


In [5]:
df_final["language"].unique()

array(['\\N', nan], dtype=object)

#primary_title
#region
#language
#budget
#title
#production_companie_name et country


In [6]:
df_final_copy = df_final.copy()

In [7]:
df_coldrop = df_final_copy.drop(columns=['primaryTitle', 'region', 'language', 'budget', 'title', 'tagline', 'production_companies_name', 'production_companies_country'])

In [8]:
df_coldrop.head()

Unnamed: 0,tconst,startYear,nconst,adult,genres,id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,spoken_languages,vote_average,vote_count
0,tt0011801,2019.0,nm0681726,False,"['Crime', 'Drama']",611205,de,Tötet nicht mehr!,The director and co-writer Lupu Pick plays mus...,0.6,/39PDyEcYl2B3XeRcsCXJ4V72g8x.jpg,1919-01-01,0,127,['de'],0.0,0
1,tt0015724,1993.0,nm0529960,False,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,en,Dama de Noche,"Bruno, a novelist with no luck goes to call fo...",0.833,/zCmyAl7VG6aZJqWZ7PFfj9e6ToU.jpg,1993-05-18,0,96,[],7.0,6
2,tt0035423,2001.0,nm0003506,False,"['Romance', 'Fantasy', 'Comedy']",11232,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,2001-12-25,76019048,118,"['en', 'fr', 'it']",6.326,1187
3,tt0036606,1983.0,nm0705535,False,['Drama'],73069,it,"Another Time, Another Place",Set in 1943 in Scotland during World War II. J...,1.4,/anoPMnxdrL4B7EMZZA5tQCmod65.jpg,1983-05-13,0,118,"['en', 'it']",4.7,6
4,tt0038687,1980.0,,False,"['Documentary', 'War']",86990,en,Let There Be Light,The final entry in a trilogy of films produced...,3.575,/wgcAMb5BLKFANzTDfKwnxeH1kYA.jpg,1946-12-16,0,58,['en'],7.4,35


In [9]:
df_coldrop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tconst             175346 non-null  object 
 1   startYear          175346 non-null  float64
 2   nconst             173662 non-null  object 
 3   adult              175346 non-null  bool   
 4   genres             175346 non-null  object 
 5   id                 175346 non-null  int64  
 6   original_language  175346 non-null  object 
 7   original_title     175346 non-null  object 
 8   overview           163219 non-null  object 
 9   popularity         175346 non-null  float64
 10  poster_path        158745 non-null  object 
 11  release_date       172243 non-null  object 
 12  revenue            175346 non-null  int64  
 13  runtime            175346 non-null  int64  
 14  spoken_languages   175346 non-null  object 
 15  vote_average       175346 non-null  float64
 16  vo

In [10]:
df_coldrop.head()

Unnamed: 0,tconst,startYear,nconst,adult,genres,id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,spoken_languages,vote_average,vote_count
0,tt0011801,2019.0,nm0681726,False,"['Crime', 'Drama']",611205,de,Tötet nicht mehr!,The director and co-writer Lupu Pick plays mus...,0.6,/39PDyEcYl2B3XeRcsCXJ4V72g8x.jpg,1919-01-01,0,127,['de'],0.0,0
1,tt0015724,1993.0,nm0529960,False,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,en,Dama de Noche,"Bruno, a novelist with no luck goes to call fo...",0.833,/zCmyAl7VG6aZJqWZ7PFfj9e6ToU.jpg,1993-05-18,0,96,[],7.0,6
2,tt0035423,2001.0,nm0003506,False,"['Romance', 'Fantasy', 'Comedy']",11232,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,2001-12-25,76019048,118,"['en', 'fr', 'it']",6.326,1187
3,tt0036606,1983.0,nm0705535,False,['Drama'],73069,it,"Another Time, Another Place",Set in 1943 in Scotland during World War II. J...,1.4,/anoPMnxdrL4B7EMZZA5tQCmod65.jpg,1983-05-13,0,118,"['en', 'it']",4.7,6
4,tt0038687,1980.0,,False,"['Documentary', 'War']",86990,en,Let There Be Light,The final entry in a trilogy of films produced...,3.575,/wgcAMb5BLKFANzTDfKwnxeH1kYA.jpg,1946-12-16,0,58,['en'],7.4,35


In [11]:
copy_coldrop = df_coldrop.copy()

In [12]:
df_null = ["nconst", "overview", "poster_path", "release_date"]

In [13]:
copy_coldrop[df_null] = copy_coldrop[df_null].fillna("unknown")

In [14]:
copy_coldrop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tconst             175346 non-null  object 
 1   startYear          175346 non-null  float64
 2   nconst             175346 non-null  object 
 3   adult              175346 non-null  bool   
 4   genres             175346 non-null  object 
 5   id                 175346 non-null  int64  
 6   original_language  175346 non-null  object 
 7   original_title     175346 non-null  object 
 8   overview           175346 non-null  object 
 9   popularity         175346 non-null  float64
 10  poster_path        175346 non-null  object 
 11  release_date       175346 non-null  object 
 12  revenue            175346 non-null  int64  
 13  runtime            175346 non-null  int64  
 14  spoken_languages   175346 non-null  object 
 15  vote_average       175346 non-null  float64
 16  vo

In [15]:
df_clean = copy_coldrop

In [16]:
url_names_basics = "https://datasets.imdbws.com/name.basics.tsv.gz"
df_names_basics = pd.read_csv(url_names_basics, sep='\t', compression='gzip', nrows=500000)
df_names_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,miscellaneous,soundtrack","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,2025,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [17]:
df_clean.head()

Unnamed: 0,tconst,startYear,nconst,adult,genres,id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,spoken_languages,vote_average,vote_count
0,tt0011801,2019.0,nm0681726,False,"['Crime', 'Drama']",611205,de,Tötet nicht mehr!,The director and co-writer Lupu Pick plays mus...,0.6,/39PDyEcYl2B3XeRcsCXJ4V72g8x.jpg,1919-01-01,0,127,['de'],0.0,0
1,tt0015724,1993.0,nm0529960,False,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,en,Dama de Noche,"Bruno, a novelist with no luck goes to call fo...",0.833,/zCmyAl7VG6aZJqWZ7PFfj9e6ToU.jpg,1993-05-18,0,96,[],7.0,6
2,tt0035423,2001.0,nm0003506,False,"['Romance', 'Fantasy', 'Comedy']",11232,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,2001-12-25,76019048,118,"['en', 'fr', 'it']",6.326,1187
3,tt0036606,1983.0,nm0705535,False,['Drama'],73069,it,"Another Time, Another Place",Set in 1943 in Scotland during World War II. J...,1.4,/anoPMnxdrL4B7EMZZA5tQCmod65.jpg,1983-05-13,0,118,"['en', 'it']",4.7,6
4,tt0038687,1980.0,unknown,False,"['Documentary', 'War']",86990,en,Let There Be Light,The final entry in a trilogy of films produced...,3.575,/wgcAMb5BLKFANzTDfKwnxeH1kYA.jpg,1946-12-16,0,58,['en'],7.4,35


In [18]:
df_clean["release_date"] = pd.to_datetime(df_clean["release_date"], errors='coerce')
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   tconst             175346 non-null  object        
 1   startYear          175346 non-null  float64       
 2   nconst             175346 non-null  object        
 3   adult              175346 non-null  bool          
 4   genres             175346 non-null  object        
 5   id                 175346 non-null  int64         
 6   original_language  175346 non-null  object        
 7   original_title     175346 non-null  object        
 8   overview           175346 non-null  object        
 9   popularity         175346 non-null  float64       
 10  poster_path        175346 non-null  object        
 11  release_date       172243 non-null  datetime64[ns]
 12  revenue            175346 non-null  int64         
 13  runtime            175346 non-null  int64   

In [19]:
df_film = df_clean[["original_title", "nconst", "release_date"]]
df_film.head()

Unnamed: 0,original_title,nconst,release_date
0,Tötet nicht mehr!,nm0681726,1919-01-01
1,Dama de Noche,nm0529960,1993-05-18
2,Kate & Leopold,nm0003506,2001-12-25
3,"Another Time, Another Place",nm0705535,1983-05-13
4,Let There Be Light,unknown,1946-12-16


In [20]:
df_merge_filmactors = pd.merge(df_film, df_names_basics, on ='nconst', how='left')
df_merge_filmactors.head()

Unnamed: 0,original_title,nconst,release_date,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,Tötet nicht mehr!,nm0681726,1919-01-01,,,,,
1,Dama de Noche,nm0529960,1993-05-18,Eva López Sánchez,1954.0,\N,"director,writer,editor","tt0151790,tt0314085,tt0015724,tt1139666"
2,Kate & Leopold,nm0003506,2001-12-25,James Mangold,1963.0,\N,"producer,director,writer","tt3315342,tt11563598,tt1950186,tt0358273"
3,"Another Time, Another Place",nm0705535,1983-05-13,,,,,
4,Let There Be Light,unknown,1946-12-16,,,,,


In [21]:
df_film_duplicates = df_merge_filmactors.drop_duplicates()
df_film_duplicates.head()

Unnamed: 0,original_title,nconst,release_date,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,Tötet nicht mehr!,nm0681726,1919-01-01,,,,,
1,Dama de Noche,nm0529960,1993-05-18,Eva López Sánchez,1954.0,\N,"director,writer,editor","tt0151790,tt0314085,tt0015724,tt1139666"
2,Kate & Leopold,nm0003506,2001-12-25,James Mangold,1963.0,\N,"producer,director,writer","tt3315342,tt11563598,tt1950186,tt0358273"
3,"Another Time, Another Place",nm0705535,1983-05-13,,,,,
4,Let There Be Light,unknown,1946-12-16,,,,,


In [22]:
df_chunck = pd.read_csv(url_basics, sep='\t', compression='gzip', chunksize=500000)
if not res.empty:
    chunks_basics.append(res)


In [23]:
df_merge_filmactors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   original_title     175346 non-null  object        
 1   nconst             175346 non-null  object        
 2   release_date       172243 non-null  datetime64[ns]
 3   primaryName        57246 non-null   object        
 4   birthYear          57246 non-null   object        
 5   deathYear          57246 non-null   object        
 6   primaryProfession  57246 non-null   object        
 7   knownForTitles     57246 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 10.7+ MB


In [24]:
df_merge_filmactors.columns.tolist()

['original_title',
 'nconst',
 'release_date',
 'primaryName',
 'birthYear',
 'deathYear',
 'primaryProfession',
 'knownForTitles']

In [25]:
cols_keep = ["original_title", "release_date", "nconst","primaryName", "primaryProfession"]
# df_clean = nettoyage de merge df_names_basics et df_clean(nettoyage_pro nettoyé)

df_clean2 = df_merge_filmactors[cols_keep]

In [26]:
df_clean2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   original_title     175346 non-null  object        
 1   release_date       172243 non-null  datetime64[ns]
 2   nconst             175346 non-null  object        
 3   primaryName        57246 non-null   object        
 4   primaryProfession  57246 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 6.7+ MB


In [27]:
df_clean2['release_date'] = pd.to_datetime(df_clean2['release_date'], errors='coerce')
df_clean2['year'] = df_clean2['release_date'].dt.year
actors_df = df_clean2[df_clean2['primaryProfession'].str.contains('actor|actress', case=False, na=False)].copy()
analyse_acteurs = actors_df.groupby(['nconst', 'primaryName']).agg(nombre_de_films=('original_title', 'count'), debut_carriere=('year', 'min'), fin_carriere=('year', 'max')
).reset_index()
analyse_acteurs['longevite'] = analyse_acteurs['fin_carriere'] - analyse_acteurs['debut_carriere']
top_acteurs = analyse_acteurs.sort_values(by='nombre_de_films', ascending=False)
top_acteurs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean2['release_date'] = pd.to_datetime(df_clean2['release_date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean2['year'] = df_clean2['release_date'].dt.year


Unnamed: 0,nconst,primaryName,nombre_de_films,debut_carriere,fin_carriere,longevite
256,nm0001238,Jesús Franco,137,1960.0,2013.0,53.0
4970,nm0482774,Joel Lamangan,75,1991.0,2023.0,32.0
2351,nm0187671,Alfredo B. Crevenna,72,1960.0,1995.0,35.0
3322,nm0297935,Kinji Fukasaku,59,1961.0,2000.0,39.0
2104,nm0159201,Yuen Chor,58,1960.0,1990.0,30.0
...,...,...,...,...,...,...
3447,nm0311355,Bob Gebert,1,2007.0,2007.0,0.0
1494,nm0095613,Julian Boote,1,2001.0,2001.0,0.0
1492,nm0095573,Katrine Boorman,1,2012.0,2012.0,0.0
1491,nm0095561,Mika Boorem,1,2021.0,2021.0,0.0


In [None]:
import plotly.express as px
fig = px.bar(top_acteurs, x='nombre_de_films', y='primaryName', orientation='h', title="Top 10 des acteurs les plus présents",
             labels={'nombre_de_films': 'Nombre de films', 'primaryName': 'Acteur'},
             color='longevite')
st.plotly_chart(fig)-+8*7+9*

2026-01-20 14:31:41.323 
  command:

    streamlit run c:\Users\kinga\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [29]:
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce')
df_clean['year'] = df_clean['release_date'].dt.year
df_clean = df_clean.dropna(subset=['year', 'runtime'])
evolution_duree = df_clean.groupby('year')['runtime'].mean().reset_index()

In [30]:
# 1. Nettoyage de l'année
copy_coldrop['year'] = copy_coldrop['startYear'].astype(int)

# 2. Filtrer les durées réalistes (sécurité)
copy_coldrop = copy_coldrop[(copy_coldrop['runtime'] >= 30) & (copy_coldrop['runtime'] <= 300)]

# 3. Calcul de la durée moyenne par année
evolution_duree = (copy_coldrop.groupby('year')['runtime'].mean().reset_index())
evolution_duree

Unnamed: 0,year,runtime
0,1960,95.422747
1,1961,94.311311
2,1962,95.607558
3,1963,94.986330
4,1964,96.085577
...,...,...
63,2023,97.907672
64,2024,93.617391
65,2025,99.863636
66,2026,96.000000


In [32]:
copy_coldrop.head()

Unnamed: 0,tconst,startYear,nconst,adult,genres,id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,spoken_languages,vote_average,vote_count,year
0,tt0011801,2019.0,nm0681726,False,"['Crime', 'Drama']",611205,de,Tötet nicht mehr!,The director and co-writer Lupu Pick plays mus...,0.6,/39PDyEcYl2B3XeRcsCXJ4V72g8x.jpg,1919-01-01,0,127,['de'],0.0,0,2019
1,tt0015724,1993.0,nm0529960,False,"['Drama', 'Mystery', 'Romance', 'Thriller']",286375,en,Dama de Noche,"Bruno, a novelist with no luck goes to call fo...",0.833,/zCmyAl7VG6aZJqWZ7PFfj9e6ToU.jpg,1993-05-18,0,96,[],7.0,6,1993
2,tt0035423,2001.0,nm0003506,False,"['Romance', 'Fantasy', 'Comedy']",11232,en,Kate & Leopold,When her scientist ex-boyfriend discovers a po...,15.77,/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg,2001-12-25,76019048,118,"['en', 'fr', 'it']",6.326,1187,2001
3,tt0036606,1983.0,nm0705535,False,['Drama'],73069,it,"Another Time, Another Place",Set in 1943 in Scotland during World War II. J...,1.4,/anoPMnxdrL4B7EMZZA5tQCmod65.jpg,1983-05-13,0,118,"['en', 'it']",4.7,6,1983
4,tt0038687,1980.0,unknown,False,"['Documentary', 'War']",86990,en,Let There Be Light,The final entry in a trilogy of films produced...,3.575,/wgcAMb5BLKFANzTDfKwnxeH1kYA.jpg,1946-12-16,0,58,['en'],7.4,35,1980


In [33]:
df_merge_filmactors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   original_title     175346 non-null  object        
 1   nconst             175346 non-null  object        
 2   release_date       172243 non-null  datetime64[ns]
 3   primaryName        57246 non-null   object        
 4   birthYear          57246 non-null   object        
 5   deathYear          57246 non-null   object        
 6   primaryProfession  57246 non-null   object        
 7   knownForTitles     57246 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 10.7+ MB


In [38]:
# --- A. PRÉPARATION DES DONNÉES ---

# 1. Extraction de l'année de sortie
# On convertit en datetime et on récupère l'année
df_merge_filmactors['release_date'] = pd.to_datetime(df_merge_filmactors['release_date'], errors='coerce')
df_merge_filmactors['year_film'] = df_merge_filmactors['release_date'].dt.year

# 2. Nettoyage de l'année de naissance
# 'coerce' va transformer les erreurs (textes, vides) en NaN (Not a Number)
df_merge_filmactors['birthYear'] = pd.to_numeric(df_merge_filmactors['birthYear'], errors='coerce')

# 3. Calcul de l'âge au moment du film
df_merge_filmactors['age_au_tournage'] = df_merge_filmactors['year_film'] - df_merge_filmactors['birthYear']

# 4. Filtrage des données aberrantes
# On ne garde que les âges réalistes (entre 5 ans et 100 ans)
# Cela élimine les erreurs de dates ou les données manquantes
df_age_clean = df_merge_filmactors[(df_merge_filmactors['age_au_tournage'] > 5) & (df_merge_filmactors['age_au_tournage'] < 100)].copy()
# --- B. AGRÉGATION ---

# On calcule la moyenne d'âge par année de film
age_moyenne_actor = df_age_clean.groupby('year_film')['age_au_tournage'].mean().round(2).reset_index()
age_moyenne_actor

Unnamed: 0,year_film,age_au_tournage
0,1938.0,41.00
1,1952.0,27.00
2,1953.0,46.33
3,1957.0,21.50
4,1958.0,37.50
...,...,...
69,2023.0,59.00
70,2024.0,61.62
71,2025.0,51.00
72,2026.0,72.00


In [None]:
class_columns = ["tconst", "original_title", "genres", "vote_count"]
class_film = copy_coldrop[class_columns].sort_values(by='vote_count', ascending=False)
class_film


Unnamed: 0,tconst,original_title,genres,vote_count
96404,tt1375666,Inception,"['Action', 'Science Fiction', 'Adventure']",33630
73216,tt0816692,Interstellar,"['Adventure', 'Drama', 'Science Fiction']",31296
69519,tt0468569,The Dark Knight,"['Drama', 'Action', 'Crime', 'Thriller']",29639
71773,tt0499549,Avatar,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",29045
74023,tt0848228,The Avengers,"['Science Fiction', 'Action', 'Adventure']",28541
...,...,...,...,...
109791,tt1691334,Out of the Ashes,['Documentary'],0
109792,tt1691341,Son Gülen Tam Güler,"['Thriller', 'Action']",0
109794,tt1691448,Adventures in Plymptoons!,['Documentary'],0
109798,tt16914952,బ్లాక్,"['Action', 'Thriller']",0


In [40]:
df_coldrop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175346 entries, 0 to 175345
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tconst             175346 non-null  object 
 1   startYear          175346 non-null  float64
 2   nconst             173662 non-null  object 
 3   adult              175346 non-null  bool   
 4   genres             175346 non-null  object 
 5   id                 175346 non-null  int64  
 6   original_language  175346 non-null  object 
 7   original_title     175346 non-null  object 
 8   overview           163219 non-null  object 
 9   popularity         175346 non-null  float64
 10  poster_path        158745 non-null  object 
 11  release_date       172243 non-null  object 
 12  revenue            175346 non-null  int64  
 13  runtime            175346 non-null  int64  
 14  spoken_languages   175346 non-null  object 
 15  vote_average       175346 non-null  float64
 16  vo

In [None]:
import pandas as pd

url_names_basics = "https://datasets.imdbws.com/name.basics.tsv.gz"

# 1. Définir la taille du morceau (ex: 100 000 lignes à la fois)
chunk_size = 100000

# 2. Créer l'itérateur
# On garde sep='\t' et compression='gzip'
reader = pd.read_csv(url_names_basics, sep='\t', compression='gzip', chunksize=chunk_size)

# 3. Boucler sur les morceaux pour traiter les données
# On va créer une liste pour stocker les résultats filtrés (pour économiser la RAM)
df_final_list = []

for i, chunk in enumerate(reader):
    # EXEMPLE : On ne garde que les acteurs pour réduire la taille immédiatement
    # (On utilise .str.contains comme vu précédemment)
    chunk_filtered = chunk[chunk['primaryProfession'].str.contains('actor|actress', na=False, case=False)]
    
    # On ajoute le morceau filtré à notre liste
    df_final_list.append(chunk_filtered)
    
    print(f"Traitement du morceau n°{i+1} terminé")
    
    # OPTIONNEL : Arrêter après un certain nombre de chunks pour tester
    if i == 5: 
        break

# 4. Rassembler tous les morceaux filtrés en un seul DataFrame
df_names_basics = pd.concat(df_final_list, ignore_index=True)

st.write("Chargement terminé !")
st.dataframe(df_names_basics.head())

In [None]:
import pandas as pd

# Charger les noms IMDb
url_names = "https://datasets.imdbws.com/name.basics.tsv.gz"

df_names = pd.read_csv(url_names,sep="\t", compression="gzip", usecols=["nconst", "primaryName"])

# Fusion avec le dataframe films (qui contient déjà nconst)
df_basics = df_basics.merge(df_names,on="nconst",how="left")

# Renommer proprement
df_basics = df_basics.rename(columns={"primaryName": "director_name"})


NameError: name 'df_basics' is not defined

In [3]:
url_principals = "https://datasets.imdbws.com/title.principals.tsv.gz"
chunks_directors = []
ids_films_finaux = set(df_basics['tconst'])

with pd.read_csv(url_principals, sep='\t', compression='gzip', 
                 usecols=['tconst', 'nconst', 'category'], chunksize=500000) as reader:
    for chunk in reader:
        mask = (chunk['category'] == 'director') & (chunk['tconst'].isin(ids_films_finaux))
        if not chunk[mask].empty:
            chunks_directors.append(chunk[mask][['tconst', 'nconst']])

if chunks_directors:
    df_directors = pd.concat(chunks_directors).drop_duplicates(subset='tconst')
    df_basics = pd.merge(df_basics, df_directors, on='tconst', how='left')
print("IMDb Directors...")

KeyboardInterrupt: 