In [None]:
import ast
import csv
import json
import os

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# TMDB FILE CHECK
# Reuse the local copy of the TMDB extract when there is one; otherwise
# fetch it from Google Drive before loading it into a DataFrame.

id_drive = '1VB5_gl1fnyBDzcIOXZ5vUSbCY68VZN1v'
output_tmdb = 'tmdb_final.csv'
url_drive = 'https://drive.google.com/uc?id=' + id_drive

if os.path.exists(output_tmdb):
    print("Fichier présent")
else:
    print("téléchargement")
    gdown.download(url_drive, output_tmdb, quiet=False)

print("Chargement TMDB...")

df_tmdb = pd.read_csv(output_tmdb)

# Nettoyage JSON

def clean_json(x):
    """Flatten a serialized list of ``{"name": ...}`` records into one string.

    The cell is parsed as strict JSON first, then as a Python literal (for
    single-quoted dumps), and finally with the legacy ``'`` -> ``"`` swap.
    Parsing the raw text first matters: blindly swapping quotes corrupts any
    payload containing an apostrophe (e.g. "Lion's Gate").

    Args:
        x: Raw cell value. Anything that is not a ``str`` (NaN, None, numbers)
            is treated as missing.

    Returns:
        The ``name`` values joined by ``", "`` (an empty string for an empty
        list), or ``np.nan`` when the value is missing or cannot be
        interpreted as a list of records carrying a ``name`` key.
    """
    if not isinstance(x, str):
        return np.nan

    parsers = (
        json.loads,
        ast.literal_eval,
        # Legacy behavior, kept last for inputs only the quote swap can read.
        lambda text: json.loads(text.replace("'", '"')),
    )
    for parse in parsers:
        try:
            data = parse(x)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            continue
        try:
            return ", ".join([i['name'] for i in data])
        except (TypeError, KeyError, IndexError):
            # Parsed, but not a list of {"name": str} records; try the next
            # interpretation before giving up.
            continue
    return np.nan

# Derive a readable, comma-separated company list when the raw column exists.
has_companies = 'production_companies' in df_tmdb.columns
if has_companies:
    company_names = df_tmdb['production_companies'].apply(clean_json)
    df_tmdb['companies_clean'] = company_names

# Drop columns that are not needed downstream; absent ones are skipped.

cols_drop = ['homepage', 'video', 'backdrop_path', 'status', 'production_companies', 'production_countries']
df_tmdb = df_tmdb.drop(columns=cols_drop, errors='ignore')


# IMDb BASICS
# Stream title.basics in chunks and keep only non-adult movies with
# startYear >= 1960 whose tconst also appears in the TMDB extract.
print("IMDb Basics : Sélection (>= 1960)...")
url_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
chunks_basics = []

if 'imdb_id' in df_tmdb.columns:
    ids_tmdb = set(df_tmdb['imdb_id'].dropna())
else:
    ids_tmdb = set()

# The IMDb dumps are raw tab-separated text with "\N" as the only missing
# marker. QUOTE_NONE stops a title that starts with '"' from swallowing the
# following rows, and disabling the default NA tokens keeps titles such as
# "NA" or "null" as real strings.
with pd.read_csv(url_basics, sep='\t', compression='gzip',
                 usecols=['tconst', 'titleType', 'startYear', 'isAdult', 'primaryTitle'],
                 quoting=csv.QUOTE_NONE,
                 na_values=['\\N'], keep_default_na=False,
                 chunksize=500000) as reader:
    for chunk in reader:
        # Unparsable years become NaN (and fail the >= 1960 test); an
        # unparsable adult flag is treated as "not adult".
        chunk['startYear'] = pd.to_numeric(chunk['startYear'], errors='coerce')
        chunk['isAdult'] = pd.to_numeric(chunk['isAdult'], errors='coerce').fillna(0)
        mask = ((chunk['titleType'] == 'movie') & (chunk['isAdult'] == 0) & (chunk['startYear'] >= 1960) & (chunk['tconst'].isin(ids_tmdb)))

        res = chunk[mask]
        if not res.empty:
            chunks_basics.append(res[['tconst', 'primaryTitle', 'startYear']])

# pd.concat raises on an empty list; fall back to an empty frame with the
# expected columns so the later steps still run when nothing matched.
if chunks_basics:
    df_basics = pd.concat(chunks_basics)
else:
    df_basics = pd.DataFrame(columns=['tconst', 'primaryTitle', 'startYear'])
print(f"Films retenus (1960-2025) : {len(df_basics)}")

# IMDb DIRECTORS
# Stream title.principals and attach one director id (nconst) to each
# retained film.

# Announce the step before the long-running read, like the other sections.
print("IMDb Directors...")
url_principals = "https://datasets.imdbws.com/title.principals.tsv.gz"
chunks_directors = []
ids_films_finaux = set(df_basics['tconst'])

# QUOTE_NONE: the IMDb dumps are raw tab-separated text, so a field that
# starts with '"' must not be read as the opening of a quoted field.
with pd.read_csv(url_principals, sep='\t', compression='gzip',
                 usecols=['tconst', 'nconst', 'category'],
                 quoting=csv.QUOTE_NONE, chunksize=500000) as reader:
    for chunk in reader:
        mask = (chunk['category'] == 'director') & (chunk['tconst'].isin(ids_films_finaux))
        directors = chunk.loc[mask, ['tconst', 'nconst']]
        if not directors.empty:
            chunks_directors.append(directors)

if chunks_directors:
    # Only the first director row encountered per film is kept.
    df_directors = pd.concat(chunks_directors).drop_duplicates(subset='tconst')
    df_basics = pd.merge(df_basics, df_directors, on='tconst', how='left')

# IMDb AKAS (regions)
# Stream title.akas and attach one (region, language) pair to each retained
# film.

url_akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
chunks_akas = []
print("IMDb Akas (Régions)...")
# IMDb writes missing values as "\N", which pandas does not treat as NA by
# default, so the dropna below used to keep every region-less row. The
# default NA tokens are also disabled: otherwise region code "NA" (Namibia)
# is read as missing. QUOTE_NONE prevents a title starting with '"' from
# swallowing the rows that follow it.
with pd.read_csv(url_akas, sep='\t', compression='gzip',
                 usecols=['titleId', 'region', 'language'],
                 quoting=csv.QUOTE_NONE,
                 na_values=['\\N'], keep_default_na=False,
                 chunksize=500000) as reader:
    for chunk in reader:
        mask = chunk['titleId'].isin(ids_films_finaux)
        matched = chunk[mask]
        if not matched.empty:
            chunks_akas.append(matched.dropna(subset=['region']))

if chunks_akas:
    # Only the first row with a known region is kept per film.
    df_akas = pd.concat(chunks_akas).drop_duplicates(subset='titleId')
    df_basics = pd.merge(df_basics, df_akas, left_on='tconst', right_on='titleId', how='left')


# FINAL MERGE
# Inner-join the IMDb selection with the TMDB extract on the IMDb id, then
# remove the join keys that duplicate 'tconst'.

print("FUSION FINALE...")
df_final = pd.merge(df_basics, df_tmdb, left_on='tconst', right_on='imdb_id', how='inner')
# 'titleId' only exists when the akas merge actually ran; ignore it when it
# is absent instead of raising KeyError.
df_final = df_final.drop(columns=['titleId', 'imdb_id'], errors='ignore')

print(f"résultat final : {len(df_final)} films (1960+).")
try:
    display(df_final.head())
except NameError:
    # display() is injected by IPython/Jupyter; fall back to a plain print
    # when the file runs as a regular script.
    print(df_final.head())

# EXPORT
df_final.to_csv("Dataset_1960_Plus.csv", index=False)

ModuleNotFoundError: No module named 'pandas'