In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)
import pickle

In [2]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, jp.date, jp.director, jp.distributor, jp.casting, jp.copies, jp.duration, jp.genre
FROM functionalities_filmscrap as jp''', engine)

df = pd.DataFrame({
    'year': 2024,
    'month': 4,
    'duration': 6960,
    'day': 10,
    'date': '10-04-2024',
    'director': 'gil kenan',
    'distributor': ['Apollo Films'],
    'casting': [['paul rudd', 'dan aykroyd','bill murray']],
    'country': 'etatsunis',
    'copies': 670,
    'genre': ['aventure']
})

df.head(2)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"[paul rudd, dan aykroyd, bill murray]",etatsunis,670,aventure


In [3]:
from modelisation.functions import nettoyer_casting
df = nettoyer_casting(df)


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


from modelisation.functions import classify_entrees_year

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
entree_pip = make_pipeline(entree_transformer)
entree_ct = Pipeline([('entree_dict' , entree_pip)])

entree_pipeline = make_pipeline(entree_ct)

entree_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8


In [5]:
from modelisation.functions import classify_season

season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
season_pip = make_pipeline(season_transformer)
season_ct = Pipeline([('season_dict' , season_pip)])


season_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring


In [6]:
from modelisation.functions import classify_month_name

month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
month_pip = make_pipeline(month_transformer)
month_ct = Pipeline([('month_dict' , month_pip)])


month_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring,april


In [7]:
from modelisation.functions import is_holiday

holiday_transformer = FunctionTransformer(is_holiday)
holiday_pip = make_pipeline(holiday_transformer)
holiday_ct = Pipeline([('holiday_dict' , holiday_pip)])


holiday_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring,april,0


In [8]:
from modelisation.functions import nettoyer_genre

genre_transformer = FunctionTransformer(nettoyer_genre)
genre_pip = make_pipeline(genre_transformer)
genre_ct = Pipeline([('genre_dict' , genre_pip)])


genre_ct.fit_transform(df)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring,april,0


In [9]:
from modelisation.functions import load_file

actor_scores_path = 'actor_scores'
country_scores_path = 'country_scores'
director_scores_path = 'director_scores'
distributor_scores_path = 'distributor_scores'
year_scores_path = 'year_scores'

In [10]:
director_scores = load_file(director_scores_path)
director_scores
df = df.merge(director_scores, on='director', how='left')
df['director_combined_score'].fillna(0.3, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director_combined_score'].fillna(0.3, inplace=True)


Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring,april,0,0.3


In [11]:
distributor_scores = load_file(distributor_scores_path)
distributor_scores
df = df.merge(distributor_scores, on='distributor', how='left')
df['distributor_combined_score'].fillna(0.3, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['distributor_combined_score'].fillna(0.3, inplace=True)


Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,"paul rudd, dan aykroyd, bill murray",etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3


In [12]:
from modelisation.functions import calculate_total_actors_score, load_file

# Chargez le fichier des scores d'acteurs
actor_scores_path = 'actor_scores'  # Assurez-vous que le chemin est correct
actor_scores = load_file(actor_scores_path)
# actor_score_dict = pd.Series(actor_scores['actor_combined_score'].values, index=actor_scores['actor'].str.lower().strip()).to_dict()

# Application de la fonction à chaque ligne du DataFrame
df['actor_combined_score'] = df.apply(lambda row: calculate_total_actors_score(row, actor_scores), axis=1)

TypeError: calculate_total_actors_score() takes 1 positional argument but 2 were given

In [None]:
# actor_scores = load_file(actor_scores_path)
# actor_scores
# df = df.merge(actor_scores, on='casting', how='left')
# df['actor_combined_score'].fillna(0.3, inplace=True)
# df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['actor_combined_score'].fillna(0.3, inplace=True)


Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,actor_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,paul rudd dan aykroyd bill murray,etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.3


In [None]:
year_scores = load_file(year_scores_path)
year_scores
df = df.merge(year_scores, on='year', how='left')

df.head(1)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,actor_combined_score,year_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,paul rudd dan aykroyd bill murray,etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.3,0.440816


In [None]:
country_scores = load_file(country_scores_path)
country_scores
df = df.merge(country_scores, on='country', how='left')

df.head(1)

Unnamed: 0,year,month,duration,day,date,director,distributor,casting,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,actor_combined_score,year_combined_score,country_combined_score
0,2024,4,6960,10,10-04-2024,gil kenan,Apollo Films,paul rudd dan aykroyd bill murray,etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.3,0.440816,0.918321


In [None]:
# import json


# def convert_actor(df, column):
#     print("convert_actor")
#     scores = load_file("actor_scores")

#     df["actor_combined_score"] = 0

#     val = []
#     for index, row in df.iterrows():
#         sum = 0
#         try:
#             df_actors = json.loads(df.iloc[index][column])
#             for actor in df_actors:
#                 found = scores.loc[scores["actor"] == actor]
#                 if found.shape[0] != 0:
#                     sum += found.iloc[0]["actor_combined_score"]
#         except Exception:
#             pass

#         val.append(sum)

#     df["actor_combined_score"] = pd.Series(val)
#     return df



# actor_transformer = FunctionTransformer(convert_actor, kw_args={'column' : 'casting'})
# actor_pip = make_pipeline(actor_transformer)
# actor_ct = Pipeline([('actor_dict' , actor_pip)])


# actor_ct.fit_transform(df.iloc[0])

# df

In [None]:
# actor_scores = load_file(actor_scores_path)
# actor_scores.loc[actor_scores['actor']== 'dany boon']

In [None]:
# actor_scores_df = pd.DataFrame(list(actor_scores.items()), columns=['Actor', 'Score'])

# actor_scores_df

In [None]:
# df['actor_combined_score'] = 0.4
# df


In [None]:
from modelisation.functions import drop_temp_new

drop_transformer = FunctionTransformer(drop_temp_new)
drop_pip = make_pipeline(drop_transformer)
drop_ct = Pipeline([('drop_dict' , drop_pip)])


df = drop_ct.fit_transform(df)

In [None]:
# df = df.dropna()
df = df.head(1)

df

Unnamed: 0,year,duration,date,country,copies,genre,entree_annee,season,month_name,is_holiday,director_combined_score,distributor_combined_score,actor_combined_score,year_combined_score,country_combined_score
0,2024,6960,10-04-2024,etatsunis,670,aventure,180.8,spring,april,0,0.3,0.3,0.3,0.440816,0.918321


In [None]:
# Charger le modèle à partir du fichier .pkl
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)


In [None]:
data = {
    'year': [2024],
    'duration': [6960],
    'date': ['10-04-2024'],
    'country': ['etatsunis'],
    'copies': [670],
    'genre': ['aventure'],
    'entree_annee': [180.8],
    'season': ['spring'],
    'month_name': ['april'],
    'is_holiday': [0],
    'director_combined_score': [0.3],
    'distributor_combined_score': [0.3],
    'year_combined_score': [0.440816],
    'country_combined_score': [0.918321]
}

df_leroy = pd.DataFrame(data)

In [None]:
predictions = model.predict(df_leroy)
print(predictions)


AttributeError: 'numpy.ndarray' object has no attribute 'predict'