In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)
import pickle

In [2]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, jp.date, jp.director, jp.distributor, jp.casting, jp.copies, jp.duration, jp.genre
FROM functionalities_filmscrap as jp''', engine)

df.head(2)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,"[""drame""]"
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,"[""policier"", ""drame"", ""comedie musicale""]"


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


from modelisation.functions import classify_entrees_year

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
entree_pip = make_pipeline(entree_transformer)
entree_ct = Pipeline([('entree_dict' , entree_pip)])

entree_pipeline = make_pipeline(entree_ct)

entree_ct.fit_transform(df)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,"[""drame""]",180.8
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,"[""policier"", ""drame"", ""comedie musicale""]",180.8
2,2024,5,8,2024-05-08,"""wes ball""","[""disney studios australia"", ""jason t reed pro...","[""freya allan"", ""kevin durand"", ""dichen lachman""]",223.0,8700,"[""action"", ""aventure"", ""sciencefiction""]",180.8
3,2024,6,5,2024-06-05,"""pierphilippe chevigny""","[""le foyer films"", ""ts productions"", ""jpl films""]","[""ariane castellanos"", ""marcandre grondin"", ""n...",1.0,5400,"[""drame""]",180.8
4,2024,5,22,2024-05-22,"""george miller""","[""warner bros"", ""kennedy miller mitchell""]","[""anya taylorjoy"", ""chris hemsworth"", ""charlee...",,-1,"[""action"", ""aventure"", ""sciencefiction""]",180.8
...,...,...,...,...,...,...,...,...,...,...,...
125,2024,7,31,2024-07-31,"""ross venokur""","[""riverstone pictures"", ""kintop pictures"", ""va...","[""jimmy o yang"", ""jk simmons"", ""chloe bennet""]",2.0,5580,"[""animation"", ""comedie"", ""famille""]",180.8
126,2024,10,23,2024-10-23,"""josh cooley""","[""bay films"", ""di bonaventura pictures"", ""hasb...","[""scarlett johansson"", ""chris hemsworth"", ""jon...",,-1,"[""animation"", ""action"", ""aventure""]",180.8
127,2024,4,17,2024-04-17,"""dev patel""","[""bron studios"", ""thunder road pictures"", ""mon...","[""dev patel"", ""sharlto copley"", ""pitobash""]",224.0,7260,"[""action"", ""thriller""]",180.8
128,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[""denzel washington"", ""pedro pascal"", ""paul me...",,-1,"[""action"", ""aventure"", ""drame""]",180.8


In [4]:
from modelisation.functions import classify_season

season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
season_pip = make_pipeline(season_transformer)
season_ct = Pipeline([('season_dict' , season_pip)])


season_ct.fit_transform(df)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee,season
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,"[""drame""]",180.8,spring
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,"[""policier"", ""drame"", ""comedie musicale""]",180.8,autumn
2,2024,5,8,2024-05-08,"""wes ball""","[""disney studios australia"", ""jason t reed pro...","[""freya allan"", ""kevin durand"", ""dichen lachman""]",223.0,8700,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring
3,2024,6,5,2024-06-05,"""pierphilippe chevigny""","[""le foyer films"", ""ts productions"", ""jpl films""]","[""ariane castellanos"", ""marcandre grondin"", ""n...",1.0,5400,"[""drame""]",180.8,summer
4,2024,5,22,2024-05-22,"""george miller""","[""warner bros"", ""kennedy miller mitchell""]","[""anya taylorjoy"", ""chris hemsworth"", ""charlee...",,-1,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring
...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024,7,31,2024-07-31,"""ross venokur""","[""riverstone pictures"", ""kintop pictures"", ""va...","[""jimmy o yang"", ""jk simmons"", ""chloe bennet""]",2.0,5580,"[""animation"", ""comedie"", ""famille""]",180.8,summer
126,2024,10,23,2024-10-23,"""josh cooley""","[""bay films"", ""di bonaventura pictures"", ""hasb...","[""scarlett johansson"", ""chris hemsworth"", ""jon...",,-1,"[""animation"", ""action"", ""aventure""]",180.8,autumn
127,2024,4,17,2024-04-17,"""dev patel""","[""bron studios"", ""thunder road pictures"", ""mon...","[""dev patel"", ""sharlto copley"", ""pitobash""]",224.0,7260,"[""action"", ""thriller""]",180.8,spring
128,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[""denzel washington"", ""pedro pascal"", ""paul me...",,-1,"[""action"", ""aventure"", ""drame""]",180.8,autumn


In [5]:
from modelisation.functions import classify_month_name

month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
month_pip = make_pipeline(month_transformer)
month_ct = Pipeline([('month_dict' , month_pip)])


month_ct.fit_transform(df)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee,season,month_name
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,"[""drame""]",180.8,spring,april
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,"[""policier"", ""drame"", ""comedie musicale""]",180.8,autumn,october
2,2024,5,8,2024-05-08,"""wes ball""","[""disney studios australia"", ""jason t reed pro...","[""freya allan"", ""kevin durand"", ""dichen lachman""]",223.0,8700,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring,may
3,2024,6,5,2024-06-05,"""pierphilippe chevigny""","[""le foyer films"", ""ts productions"", ""jpl films""]","[""ariane castellanos"", ""marcandre grondin"", ""n...",1.0,5400,"[""drame""]",180.8,summer,june
4,2024,5,22,2024-05-22,"""george miller""","[""warner bros"", ""kennedy miller mitchell""]","[""anya taylorjoy"", ""chris hemsworth"", ""charlee...",,-1,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring,may
...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024,7,31,2024-07-31,"""ross venokur""","[""riverstone pictures"", ""kintop pictures"", ""va...","[""jimmy o yang"", ""jk simmons"", ""chloe bennet""]",2.0,5580,"[""animation"", ""comedie"", ""famille""]",180.8,summer,july
126,2024,10,23,2024-10-23,"""josh cooley""","[""bay films"", ""di bonaventura pictures"", ""hasb...","[""scarlett johansson"", ""chris hemsworth"", ""jon...",,-1,"[""animation"", ""action"", ""aventure""]",180.8,autumn,october
127,2024,4,17,2024-04-17,"""dev patel""","[""bron studios"", ""thunder road pictures"", ""mon...","[""dev patel"", ""sharlto copley"", ""pitobash""]",224.0,7260,"[""action"", ""thriller""]",180.8,spring,april
128,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[""denzel washington"", ""pedro pascal"", ""paul me...",,-1,"[""action"", ""aventure"", ""drame""]",180.8,autumn,novembre


In [6]:
from modelisation.functions import is_holiday

holiday_transformer = FunctionTransformer(is_holiday)
holiday_pip = make_pipeline(holiday_transformer)
holiday_ct = Pipeline([('holiday_dict' , holiday_pip)])


holiday_ct.fit_transform(df)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee,season,month_name,is_holiday
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,"[""drame""]",180.8,spring,april,1
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,"[""policier"", ""drame"", ""comedie musicale""]",180.8,autumn,october,0
2,2024,5,8,2024-05-08,"""wes ball""","[""disney studios australia"", ""jason t reed pro...","[""freya allan"", ""kevin durand"", ""dichen lachman""]",223.0,8700,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring,may,0
3,2024,6,5,2024-06-05,"""pierphilippe chevigny""","[""le foyer films"", ""ts productions"", ""jpl films""]","[""ariane castellanos"", ""marcandre grondin"", ""n...",1.0,5400,"[""drame""]",180.8,summer,june,0
4,2024,5,22,2024-05-22,"""george miller""","[""warner bros"", ""kennedy miller mitchell""]","[""anya taylorjoy"", ""chris hemsworth"", ""charlee...",,-1,"[""action"", ""aventure"", ""sciencefiction""]",180.8,spring,may,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024,7,31,2024-07-31,"""ross venokur""","[""riverstone pictures"", ""kintop pictures"", ""va...","[""jimmy o yang"", ""jk simmons"", ""chloe bennet""]",2.0,5580,"[""animation"", ""comedie"", ""famille""]",180.8,summer,july,1
126,2024,10,23,2024-10-23,"""josh cooley""","[""bay films"", ""di bonaventura pictures"", ""hasb...","[""scarlett johansson"", ""chris hemsworth"", ""jon...",,-1,"[""animation"", ""action"", ""aventure""]",180.8,autumn,october,1
127,2024,4,17,2024-04-17,"""dev patel""","[""bron studios"", ""thunder road pictures"", ""mon...","[""dev patel"", ""sharlto copley"", ""pitobash""]",224.0,7260,"[""action"", ""thriller""]",180.8,spring,april,0
128,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[""denzel washington"", ""pedro pascal"", ""paul me...",,-1,"[""action"", ""aventure"", ""drame""]",180.8,autumn,novembre,0


In [7]:
from modelisation.functions import nettoyer_genre

genre_transformer = FunctionTransformer(nettoyer_genre)
genre_pip = make_pipeline(genre_transformer)
genre_ct = Pipeline([('genre_dict' , genre_pip)])


genre_ct.fit_transform(df)

Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee,season,month_name,is_holiday
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14.0,4380,drame,180.8,spring,april,1
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[""zazie beetz"", ""joaquin phoenix"", ""lady gaga""]",,-1,policier,180.8,autumn,october,0
2,2024,5,8,2024-05-08,"""wes ball""","[""disney studios australia"", ""jason t reed pro...","[""freya allan"", ""kevin durand"", ""dichen lachman""]",223.0,8700,action,180.8,spring,may,0
3,2024,6,5,2024-06-05,"""pierphilippe chevigny""","[""le foyer films"", ""ts productions"", ""jpl films""]","[""ariane castellanos"", ""marcandre grondin"", ""n...",1.0,5400,drame,180.8,summer,june,0
4,2024,5,22,2024-05-22,"""george miller""","[""warner bros"", ""kennedy miller mitchell""]","[""anya taylorjoy"", ""chris hemsworth"", ""charlee...",,-1,action,180.8,spring,may,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024,7,31,2024-07-31,"""ross venokur""","[""riverstone pictures"", ""kintop pictures"", ""va...","[""jimmy o yang"", ""jk simmons"", ""chloe bennet""]",2.0,5580,animation,180.8,summer,july,1
126,2024,10,23,2024-10-23,"""josh cooley""","[""bay films"", ""di bonaventura pictures"", ""hasb...","[""scarlett johansson"", ""chris hemsworth"", ""jon...",,-1,animation,180.8,autumn,october,1
127,2024,4,17,2024-04-17,"""dev patel""","[""bron studios"", ""thunder road pictures"", ""mon...","[""dev patel"", ""sharlto copley"", ""pitobash""]",224.0,7260,action,180.8,spring,april,0
128,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[""denzel washington"", ""pedro pascal"", ""paul me...",,-1,action,180.8,autumn,novembre,0


In [8]:
from modelisation.functions import load_file

actor_scores_path = 'actor_scores'
country_scores_path = 'country_scores'
director_scores_path = 'director_scores'
distributor_scores_path = 'distributor_scores'
year_scores_path = 'year_scores'

In [9]:
director_scores = load_file(director_scores_path)
director_scores

Unnamed: 0,director,director_combined_score
0,"""dany boon""",0.461777
1,"""patrice leconte""",0.248820
2,"""gerard krawczyk""",0.384086
3,"""george lucas""",0.505365
4,"""jon favreau""",0.310504
...,...,...
2668,"""fabrice gobert""",0.114794
2669,"""bernardo bertolucci""",0.145165
2670,"""paolo sorrentino""",0.186062
2671,"""karine silla""",0.115538


In [10]:
df = df.merge(director_scores, on='director', how='left')



In [11]:
actor_scores = load_file(actor_scores_path)
actor_scores

Unnamed: 0,actor,actor_combined_score
0,kad merad,0.269875
1,dany boon,0.351076
2,zoe felix,0.237639
3,josiane balasko,0.334050
4,michel blanc,0.224706
...,...,...
4146,carlo verdone,0.292652
4147,sabrina ferilli,0.292652
4148,valeria golino,0.210215
4149,micaela ramazzotti,0.274731


Unnamed: 0,year,month,day,date,director,distributor,casting,copies,duration,genre,entree_annee,season,month_name,is_holiday,director_combined_score,actor_combined_score
0,2024,4,24,2024-04-24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[paul eyam nzie okpokam, mike slye, elaine fea...",14.0,4380,drame,180.8,spring,april,1,,0
1,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[zazie beetz, joaquin phoenix, lady gaga]",,-1,policier,180.8,autumn,october,0,0.334096,0
2,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[zazie beetz, joaquin phoenix, lady gaga]",,-1,policier,180.8,autumn,october,0,0.334096,0
3,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[zazie beetz, joaquin phoenix, lady gaga]",,-1,policier,180.8,autumn,october,0,0.334096,0
4,2024,10,2,2024-10-02,"""todd phillips""","[""bron studios"", ""bron creative"", ""dc entertai...","[zazie beetz, joaquin phoenix, lady gaga]",,-1,policier,180.8,autumn,october,0,0.334096,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[denzel washington, pedro pascal, paul mescal]",,-1,action,180.8,autumn,novembre,0,0.301450,0
183,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[denzel washington, pedro pascal, paul mescal]",,-1,action,180.8,autumn,novembre,0,0.301450,0
184,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[denzel washington, pedro pascal, paul mescal]",,-1,action,180.8,autumn,novembre,0,0.301450,0
185,2024,11,20,2024-11-20,"""ridley scott""","[""paramount pictures"", ""universal pictures"", ""...","[denzel washington, pedro pascal, paul mescal]",,-1,action,180.8,autumn,novembre,0,0.301450,0


In [13]:
from modelisation.functions import drop_temp_new

drop_transformer = FunctionTransformer(drop_temp_new)
drop_pip = make_pipeline(drop_transformer)
drop_ct = Pipeline([('drop_dict' , drop_pip)])


df = drop_ct.fit_transform(df)

In [14]:
df = df.dropna()
df

Unnamed: 0,year,date,copies,duration,genre,entree_annee,season,is_holiday,director_combined_score,actor_combined_score
6,2024,2024-05-08,223.0,8700,action,180.8,spring,0,0.347523,0
30,2024,2024-05-01,125.0,7560,action,180.8,spring,1,0.361554,0
31,2024,2024-05-01,125.0,7560,action,180.8,spring,1,0.361554,0
32,2024,2024-05-01,125.0,7560,action,180.8,spring,1,0.361554,0
39,2024,2024-04-17,1.0,7080,policier,180.8,spring,0,0.147767,0
54,2024,2024-04-24,95.0,7320,biographique,180.8,spring,1,0.370443,0
94,2024,2024-04-17,517.0,5400,comedie,180.8,spring,0,0.142154,0
111,2024,2024-05-01,44.0,5700,thriller,180.8,spring,1,0.062867,0
116,2024,2024-04-24,8.0,6480,horreur,180.8,spring,1,0.078898,0
120,2024,2024-05-01,47.0,5460,drame,180.8,spring,1,0.317136,0


In [15]:
# Charger le modèle à partir du fichier .pkl
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)


In [16]:
data_to_predict = {
    'year': [2024, 2024, 2024, 2024, 2024],
    'copies': [14.0, 223.0, 1.0, 1.0, 2.0],
    'duration': [4380, 8700, 5400, 6540, 5940],
    'genre': ['drame', 'action', 'drame', 'comedie', 'horreur'],
    'entree_annee': [180.8, 180.8, 180.8, 180.8, 180.8],
    'season': ['spring', 'spring', 'summer', 'summer', 'spring'],
    'is_holiday': [1, 0, 0, 0, 0]
}

df_to_predict = pd.DataFrame(data_to_predict)

In [17]:
predictions = model.predict(df_to_predict)
print(predictions)


AttributeError: 'numpy.ndarray' object has no attribute 'predict'