In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)
import pickle

In [2]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, jp.date, jp.director, jp.distributor, jp.casting, jp.copies, jp.duration, jp.genre
FROM functionalities_filmscrap as jp''', engine)

df = pd.DataFrame({
    'year': 2024,
    'month': 4,
    'duration': 6960,
    'day': 10,
    'date': '10-04-2024',
    'director': 'gil kenan',
    'distributor': ['Apollo Films'],
    'casting': [['paul rudd', 'dan aykroyd','bill murray']],
    'country': 'etatsunis',
    'copies': 670,
    'genre': ['aventure']
})

df.head(2)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


from modelisation.functions import classify_entrees_year

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
entree_pip = make_pipeline(entree_transformer)
entree_ct = Pipeline([('entree_dict' , entree_pip)])

entree_pipeline = make_pipeline(entree_ct)

entree_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8


In [4]:
from modelisation.functions import classify_season

season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
season_pip = make_pipeline(season_transformer)
season_ct = Pipeline([('season_dict' , season_pip)])


season_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring


In [5]:
from modelisation.functions import classify_month_name

month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
month_pip = make_pipeline(month_transformer)
month_ct = Pipeline([('month_dict' , month_pip)])


month_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april


In [6]:
from modelisation.functions import is_holiday

holiday_transformer = FunctionTransformer(is_holiday)
holiday_pip = make_pipeline(holiday_transformer)
holiday_ct = Pipeline([('holiday_dict' , holiday_pip)])


holiday_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0


In [7]:
from modelisation.functions import nettoyer_genre

genre_transformer = FunctionTransformer(nettoyer_genre)
genre_pip = make_pipeline(genre_transformer)
genre_ct = Pipeline([('genre_dict' , genre_pip)])


genre_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0


In [8]:
from modelisation.functions import load_file

actor_scores_path = 'actor_scores'
country_scores_path = 'country_scores'
director_scores_path = 'director_scores'
distributor_scores_path = 'distributor_scores'
year_scores_path = 'year_scores'

In [9]:
director_scores = load_file(director_scores_path)
director_scores
df = df.merge(director_scores, on='director', how='left')
df['director_combined_score'].fillna(0.3, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director_combined_score'].fillna(0.3, inplace=True)


Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0,0.3


In [10]:
distributor_scores = load_file(distributor_scores_path)
distributor_scores
df = df.merge(distributor_scores, on='distributor', how='left')
df['distributor_combined_score'].fillna(0.3, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['distributor_combined_score'].fillna(0.3, inplace=True)


Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3


In [11]:
year_scores = load_file(year_scores_path)
year_scores
df = df.merge(year_scores, on='year', how='left')

df.head(1)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,year_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.440816


In [12]:
country_scores = load_file(country_scores_path)
country_scores
df = df.merge(country_scores, on='country', how='left')

df.head(1)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,year_combined_score,country_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.440816,0.918321


In [13]:
actor_scores = load_file(actor_scores_path)
actor_scores

Unnamed: 0,actor,actor_combined_score
0,kad merad,0.269875
1,dany boon,0.351076
2,zoe felix,0.237639
3,josiane balasko,0.334050
4,michel blanc,0.224706
...,...,...
4146,carlo verdone,0.292652
4147,sabrina ferilli,0.292652
4148,valeria golino,0.210215
4149,micaela ramazzotti,0.274731


In [14]:
# actor_scores = load_file(actor_scores_path)
# actor_scores
# df = df.merge(actor_scores, on='actor', how='left')

# df


In [15]:
# df['actor_combined_score'] = 0.4
# df


In [16]:
from modelisation.functions import drop_temp_new

drop_transformer = FunctionTransformer(drop_temp_new)
drop_pip = make_pipeline(drop_transformer)
drop_ct = Pipeline([('drop_dict' , drop_pip)])


df = drop_ct.fit_transform(df)

In [17]:
# df = df.dropna()
df = df.head(1)

df

Unnamed: 0,year,duration,date,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,year_combined_score,country_combined_score
0,2024,6960,10-04-2024,etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.440816,0.918321


In [18]:
# Charger le modèle à partir du fichier .pkl
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)


In [19]:
data = {
    'year': [2024],
    'duration': [6960],
    'date': ['10-04-2024'],
    'country': ['etatsunis'],
    'copies': [670],
    'genre': ['aventure'],
    'entree_annee': [180.8],
    'season': ['spring'],
    'month_name': ['april'],
    'is_holiday': [0],
    'director_combined_score': [0.3],
    'distributor_combined_score': [0.3],
    'year_combined_score': [0.440816],
    'country_combined_score': [0.918321]
}

df_leroy = pd.DataFrame(data)

In [20]:
predictions = model.predict(df_leroy)
print(predictions)


AttributeError: 'numpy.ndarray' object has no attribute 'predict'