In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)

In [None]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT jp.raw_title, 
        YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, 
        im.director, im.date,im.casting, im.distributor, im.genre, jp.country, jp.duration, jp.first_day, jp.first_week, jp.first_weekend, jp.hebdo_rank, 
jp.total_spectator, jp.copies, im.rating_press, im.budget, im.lang, im.award 
FROM films_jp as jp
LEFT JOIN films_imdb im ON im.id_jp = jp.id 
where im.id_jp is not null and im.date = jp.date
order by jp.first_week desc''', engine)

# CREATION DES SCORES

In [None]:
from modelisation.functions import *

clone = df.copy()
calculate_director_scores(clone)
calculate_distributor_scores(clone)
calculate_actor_scores(clone)
calculate_year_scores(clone)
calculate_country_scores(clone)
print()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from modelisation.functions import *

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_transformer = FunctionTransformer(is_holiday)
drop_transformer = FunctionTransformer(drop_temp)

pipe_scores = Pipeline([
  ('season_dict' , make_pipeline(season_transformer)),
  ('entree_dict' , make_pipeline(entree_transformer)),
  ('month_dict' , make_pipeline(month_transformer)),
  ('holiday_dict' , make_pipeline(holiday_transformer)),
  #('drop_dict' , make_pipeline(drop_transformer))
])

scores = df.copy()
scores = pipe_scores.fit_transform(scores)
scores.head(1)

# MODELE

In [None]:
# film = pd.read_sql_query('''SELECT 
#                               YEAR(date) AS year, 
#                               MONTH(date) AS month, 
#                               DAY(date) AS day, 
#                               director, distributor, casting, copies, duration, country
#                             FROM functionalities_filmscrap
#                             LIMIT 1
#                          ''', engine)
# film

In [77]:
from modelisation.converter import *

def convert_entrees_year2(df, column):
    print("convert_entrees_year", df.iloc[0][column])
    scores = load_file("year_scores")

    val = []
    for index, row in df.iterrows():
        # if index == 2448:
        #     print('****')
        #     print(df.iloc[index])
        try:
            found = scores.loc[scores[column] == df.iloc[index][column]]
            val.append(found.iloc[0]["year_combined_score"])
        except Exception:
            val.append(0)

    df["year_combined_score"] = pd.Series(val)

    return df

entree_converter = FunctionTransformer(convert_entrees_year2, kw_args={'column' : 'year'})
country_converter = FunctionTransformer(convert_country, kw_args={'column' : 'country'})
season_converter = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_converter = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_converter = FunctionTransformer(is_holiday)
director_converter = FunctionTransformer(convert_director, kw_args={'column' : 'director'})
actor_converter = FunctionTransformer(convert_actor, kw_args={'column' : 'casting'})
distributor_converter = FunctionTransformer(convert_distributor, kw_args={'column' : 'distributor'})

def drop_after_converter(df):
  return df.drop(columns=["director",	"casting", "copies",	"distributor",	"country", "month", "day", "year"])

drop_after_converter_pipeline = Pipeline([
  ('drop_after_converter', make_pipeline(FunctionTransformer(drop_after_converter)))
])

def set_ohe(df):
  return pd.get_dummies(df, columns=['season', 'month_name'], dtype='int')

set_ohe_pipeline = Pipeline([
  ('drop_after_converter', make_pipeline(FunctionTransformer(set_ohe)))
])

pipe = Pipeline([
  ('entree_converter' , make_pipeline(entree_converter)),
  ('actor_converter' , make_pipeline(actor_converter)),
  ('season_converter' , make_pipeline(season_converter)),
  ('country_converter' , make_pipeline(country_converter)),
  ('month_converter' , make_pipeline(month_converter)),
  ('holiday_converter' , make_pipeline(holiday_converter)),
  ('director_converter' , make_pipeline(director_converter)),
  ('distributor_converter' , make_pipeline(distributor_converter)),
  ('drop' , make_pipeline(drop_after_converter_pipeline)),
  ('set_ohe' , make_pipeline(set_ohe_pipeline)),
  #('poly' , make_pipeline(PolynomialFeatures())),
  #('scale', make_pipeline(RobustScaler(with_centering=False)))
])

# t = df.copy()
# t.drop(columns=['raw_title', 'date', 'genre', 'budget', 'total_spectator', 'first_day', 'first_weekend', 'hebdo_rank', 'total_spectator', 'rating_press', 'award', 'lang'], inplace=True)
# p = pipe.fit_transform(t)
# p[p.isna().any(axis=1)]


In [91]:
df_clean = df.copy() 
df_clean.drop(columns=['raw_title', 'date', 'genre', 'budget', 'total_spectator', 'first_day', 'first_weekend', 'hebdo_rank', 'total_spectator', 'rating_press', 'award', 'lang'], inplace=True)

X = df_clean.drop(['first_week'], axis=1)
y = df_clean.first_week

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42)

model = make_pipeline(
            pipe,
            #make_pipeline(RobustScaler(with_centering=False))
            #drop_after_converter_pipeline,
            #set_ohe_pipeline,
            #PolynomialFeatures(),
            #preprocessing,
            #xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
        )

# model.fit(X_train, y_train)
# xgboost.plot_importance(model[-1], max_num_features=20)

p = model.transform(X_train)


convert_entrees_year 1995
convert_actor
convert_country etatsunis
convert_director
convert_distributor


In [94]:
#display(p.iloc[2448])
p[p.isna().any(axis=1)]

Unnamed: 0,duration,year_combined_score,actor_combined_score,country_combined_score,is_holiday,director_combined_score,distributor_combined_score,season_autumn,season_spring,season_summer,season_winter,month_name_april,month_name_august,month_name_december,month_name_february,month_name_january,month_name_july,month_name_june,month_name_march,month_name_may,month_name_novembre,month_name_october,month_name_september
2448,5400,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2429,5100,,,,0,,,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2540,6120,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2662,5640,,,,0,,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2622,4800,,,,0,,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2558,6960,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2300,5760,,,,1,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2324,6240,,,,0,,,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2433,6000,,,,0,,,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [50]:
#display(p.head(1))
p[p.isna().any(axis=1)]

Unnamed: 0,year,month,day,director,casting,distributor,country,duration,copies,actor_combined_score,country_combined_score,year_combined_score,is_holiday,director_combined_score,distributor_combined_score,season_autumn,season_spring,season_summer,season_winter,month_name_april,month_name_august,month_name_december,month_name_february,month_name_january,month_name_july,month_name_june,month_name_march,month_name_may,month_name_novembre,month_name_october,month_name_september
2448,2003,6,18,"""miguel arteta""","[""jennifer aniston"", ""jake gyllenhaal"", ""debor...","[""flan de coco films"", ""fox searchlight pictur...",etatsunis,5400,68,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2429,2023,12,13,"""jeremie degruson""","[""monica young"", ""danny fehsenfeld"", ""olivier ...","[""a contracorriente films"", ""beside production...",belgique,5100,360,,,,0,,,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2540,2005,6,22,"""antony cordier""","[""johan libereau"", ""salome stevenin"", ""florenc...","[""why not productions"", ""canal"", ""tps star""]",france,6120,78,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2662,2024,2,21,"""jason yu""","[""kim gook hee"", ""yoon kyungho"", ""lee sunkyun""]","[""lewis pictures"", ""solaire partners""]",coree du sud,5640,138,,,,0,,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2622,2014,2,5,"""benoit mariage""","[""benoit poelvoorde"", ""marc zinga"", ""tatiana r...","[""mg productions"", ""formosa productions"", ""cab...",france,4800,141,,,,0,,,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2558,2000,6,14,"""roger christian""","[""john travolta"", ""forest whitaker"", ""barry pe...","[""warner bros"", ""morgan creek entertainment"", ...",etatsunis,6960,168,,,,0,,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2300,2001,7,11,"""ringo lam""","[""jeanclaude van damme"", ""michael rooker"", ""ca...","[""millennium films"", ""artisan entertainment"", ...",etatsunis,5760,152,,,,1,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2324,2017,11,29,"""jon lucas""","[""mila kunis"", ""kristen bell"", ""kathryn hahn""]","[""huayi brothers media"", ""stx entertainment"", ...",etatsunis,6240,197,,,,0,,,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2433,1993,1,6,"""gerard jourdhui""","[""michel serrault"", ""anna galiena"", ""pierre ri...","[""centre europeen cinematographique rhonealpes...",france,6000,70,,,,0,,,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
y_pred = model.predict(X_test)

In [None]:
r2_cleaned = r2_score(y_test, y_pred)
mse_cleaned = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse_cleaned = np.sqrt(mse_cleaned)

print("Performance du modèle :")
# print(model.best_params_)
print(f"R2 Score: {r2_cleaned:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse_cleaned:.2f}")