In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from joblib import dump, load
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder


In [2]:
# Read the raw film dataset from the CSV export.
data_path = "Dataset_analyse.csv"
raw_films = pd.read_csv(data_path)

# Columns that are removed from the frame before modelling.
columns_to_discard = [
    "title", "total_spectator", "country", "hebdo_rank", "first_day", "budget",
    "first_weekend", "rating_public", "casting", "visa", "award", "lang",
]

# Rows containing any missing value are filtered first, on the full set of
# columns, and only then are the discarded columns removed.
# NOTE(review): a NaN in a column that is dropped anyway still removes the
# row here — confirm this is intended before changing the order.
df = raw_films.dropna().drop(columns=columns_to_discard)

df

Unnamed: 0,year,director,duration,genre,first_week,copies,distributor,rating_press,classification_acteurs,classification_country,day,month
0,1997,1,6300,thriller,172230,234,2,3.28,0.0,2,15,10
1,1999,1,5400,drame,154881,198,3,3.40,0.0,2,31,3
2,1994,1,5220,comedie,417021,189,3,2.70,2.0,2,18,5
3,1995,1,9900,aventure action,355642,302,3,2.60,0.0,3,4,10
4,1994,1,9120,comedie dramatique,101953,129,3,3.14,0.0,0,31,8
...,...,...,...,...,...,...,...,...,...,...,...,...
3627,1997,1,5520,film familial,129374,248,3,3.30,0.0,3,15,10
3628,1994,1,6060,aventure action,186581,264,3,3.30,0.0,3,6,4
3629,1994,1,6480,film familial,227393,123,3,3.00,0.0,3,13,4
3630,2008,1,7080,comedie,110442,128,3,3.90,0.0,0,27,8


In [3]:
# A single hand-written film record used to build a one-row frame.
sample_film = {
    'year': 2023,
    'director': 1,
    'country': "etatsunis",
    'duration': 7200,
    'genre': "comedie",
    'copies': 665,
    'rating_press': 4.0,
    'first_day': 359889,
    'budget': 100000000,
    'classification_acteurs': 1,
    'distributor': 1,
}

# Wrap every scalar in a one-element list so each key becomes a column of length 1.
data = {column: [value] for column, value in sample_film.items()}

df_test = pd.DataFrame(data)
df_test

Unnamed: 0,year,director,country,duration,genre,copies,rating_press,first_day,budget,classification_acteurs,distributor
0,2023,1,etatsunis,7200,comedie,665,4.0,359889,100000000,1,1


In [4]:
# Separate the regression target (`first_week`) from the predictor columns.
target_name = 'first_week'
y = df[target_name]
X = df.drop(columns=[target_name])

# Shuffled 85/15 train/test split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, shuffle=True, random_state=42
)

# Group predictor columns by dtype: float/int columns vs. object columns.
num_col = X.select_dtypes(include=[float, int]).columns.tolist()
cat_col = X.select_dtypes(include=[object]).columns.tolist()

# Object columns: one-hot encode (categories unseen during fit are ignored at
# transform time), then robust-scale without centering.
onehotscale_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    RobustScaler(with_centering=False),
)
# Numeric columns: robust scaling without centering only.
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

preprocessing = ColumnTransformer(
    transformers=[
        ('categorical', onehotscale_pipeline, cat_col),
        ('numerical', scale_pipeline, num_col),
    ]
)

# Degree-2 polynomial expansion; defined here but not plugged into the final
# pipeline below (see the commented-out argument).
polyscale_pipeline = make_pipeline(PolynomialFeatures(2))

# Only the preprocessing step is active; it is fitted on the training rows.
my_final_pipeline = make_pipeline(preprocessing)  # , polyscale_pipeline)
my_final_pipeline.fit(X_train)

# Names of the columns produced by the fitted preprocessing.
feature_names = my_final_pipeline.get_feature_names_out(X.columns)
feature_names

array(['categorical__genre_animation',
       'categorical__genre_aventure action',
       'categorical__genre_catastrophe', 'categorical__genre_comedie',
       'categorical__genre_comedie dramatique',
       'categorical__genre_comicbook', 'categorical__genre_documentaire',
       'categorical__genre_drame', 'categorical__genre_fantasy',
       'categorical__genre_film familial', 'categorical__genre_guerre',
       'categorical__genre_horreur', 'categorical__genre_musical',
       'categorical__genre_peplum', 'categorical__genre_romance',
       'categorical__genre_science fiction',
       'categorical__genre_thriller', 'categorical__genre_western',
       'numerical__year', 'numerical__director', 'numerical__duration',
       'numerical__copies', 'numerical__distributor',
       'numerical__rating_press', 'numerical__classification_acteurs',
       'numerical__classification_country', 'numerical__day',
       'numerical__month'], dtype=object)

In [5]:
# from sklearn import set_config


# set_config(transform_output="pandas")
# model = make_pipeline(
#     my_final_pipeline,
# )
# xdata = model.fit_transform(X_train, y_train)
# xdata.head()

In [6]:
# Full model: the preprocessing pipeline followed by a Lasso regressor.
lasso_regressor = Lasso(random_state=42, max_iter=100000)
model = make_pipeline(my_final_pipeline, lasso_regressor)
# Alternative final estimator that was tried and disabled:
#     CatBoostClassifier(random_state=42, depth=6,verbose=False)

# Hyper-parameter grid. It is only read by the GridSearchCV calls that are
# commented out below, so it has no effect on the fit performed in this cell.
# NOTE(review): the Lasso above is built without an explicit alpha, so the
# alpha listed here is not applied — confirm whether that is intended.
param_grid = {'lasso__alpha': [15]}
# param_grid = {
#     'catboostclassifier__n_estimators': [100, 200, 300],
#     'catboostclassifier__depth': [4, 6, 8]
# }

# Disabled grid-search variants:
# grid_search = GridSearchCV(model, param_grid, cv=2, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# model = GridSearchCV(model, param_grid,cv =5)
# grid_score = model.score(X_train, y_train)
# best_alpha = model.best_params_['lasso__alpha']

# Fit preprocessing + regressor on the training split.
model.fit(X_train, y_train)

In [7]:
# Disabled experiment: refit the model with per-sample weights that decay
# exponentially with the absolute training residual (scaled by the residual
# standard deviation).
# y_pred_train = model.predict(X_train)


# residus =  y_pred_train - y_train


# weights = np.exp(-abs(residus) / residus.std())
# model.fit(X_train, y_train, lasso__sample_weight=weights)




# Predict on the held-out test features; y_pred is scored by the metrics cell.
y_pred = model.predict(X_test)
# Disabled: single-row prediction on the hand-built df_test frame.
# NOTE(review): df_test as built in this notebook has no `day`, `month` or
# `classification_country` column, all of which appear in the fitted feature
# names, so this call would presumably fail — confirm before re-enabling.
# y_pred = model.predict(df_test)
# print(y_pred)

In [8]:

# Score the test-set predictions against the true first-week values.
r2_cleaned = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse_cleaned = mean_squared_error(y_test, y_pred)
rmse_cleaned = np.sqrt(mse_cleaned)  # RMSE is the square root of the MSE

# Assemble the report and emit it in a single print call.
report_lines = [
    "Performance du modèle :",
    # print(model.best_params_)
    f"R2 Score: {r2_cleaned:.4f}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse_cleaned:.2f}",
]
print("\n".join(report_lines))

Performance du modèle :
R2 Score: 0.5988
MAE: 175350.44
RMSE: 302773.61


In [9]:
# Persist the fitted pipeline to disk, then read it back so the following
# cells run against the serialized artifact.
# `dump` / `load` come from joblib, imported in the notebook's import cell.
model_path = 'model.pkl'
dump(model, model_path)

# Reload through the same path variable that was written above, so the saved
# and loaded file cannot drift apart if the path is changed.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = load(model_path)

In [10]:
def predict_prod(model,data):
    """Run ``model.predict`` on raw rows supplied as a list of lists.

    Parameters
    ----------
    model : object exposing a ``predict`` method; it is called with the
        DataFrame built by this function.
    data : list of rows, each row holding its values in the column order
        listed in ``column_order`` below.

    Returns
    -------
    Whatever ``model.predict`` returns for the constructed frame.
    """
    column_order = [
        'year', 'month', 'day', 'director', 'country',
        'classification_country', 'classification_acteurs', 'duration',
        'genre', 'copies', 'rating_press', 'budget', 'distributor',
    ]
    # Turn the list of lists into a DataFrame with named columns.
    frame = pd.DataFrame(data, columns=column_order)
    return model.predict(frame)

# Example request: one film, with values in the column order predict_prod expects.
example_row = [2023, 7, 19, 1, "etatsunis", 3, 3, 6840, "film familial", 665, 3.4, 100000000, 3]
predict_prod(model, [example_row])

array([652285.68361021])