In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder

In [2]:
# Load the dataset, keep only fully populated rows, then remove the listed columns.
# Note: dropna() runs BEFORE the column drop, so a missing value in any column
# (including the ones removed just below) discards the whole row.
data_path = "Dataset_analyse.csv"
dropped_columns = [
    "title",
    "total_spectator",
    "hebdo_rank",
    "first_weekend",
    "rating_public",
    "casting",
    "visa",
    "award",
    "lang",
]
df = pd.read_csv(data_path).dropna().drop(columns=dropped_columns)
df

Unnamed: 0,year,director,country,duration,genre,first_day,first_week,copies,distributor,rating_press,budget,classification_acteurs
0,1997,3,france,6300,thriller,180000.6,172230,234,2,3.26,89800000.0,1.0
1,1999,3,france,5400,drame,34301.6,154881,198,3,3.40,990000000.0,1.0
2,1994,3,france,5220,comedie,190299.6,417021,189,3,2.74,113600000.0,1.0
3,1995,3,etatsunis,9900,aventure action,39170.4,355642,302,3,2.60,72000000.0,1.0
4,1994,3,france,6600,thriller,71377.6,39739,46,3,3.48,687840000.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2202,2008,3,grandebretagne,7080,comedie,18126.0,110442,128,3,3.90,44600000.0,1.0
2203,1993,1,etatsunis,6060,drame,174755.0,32351,44,3,3.20,46000000.0,1.0
2204,1999,3,france,5520,drame,156540.2,51749,-1,3,3.90,182000000.0,1.0
2205,1999,3,etatsunis,7080,comedie,125198.8,63249,-1,3,3.80,86262750.8,1.0


In [3]:
# One hand-written observation, stored column-wise (each value wrapped in a
# one-element list) so that it maps directly onto a single-row DataFrame.
data = {
    'year': [2023],
    'director': [1],
    'country': ["etatsunis"],
    'duration': [7200],
    'genre': ["comedie"],
    'copies': [665],
    'rating_press': [4.0],
    'first_day': [359889],
    'budget': [100000000],
    'classification_acteurs': [1],
    'distributor': [1],
}

df_test = pd.DataFrame(data)
df_test

Unnamed: 0,year,director,country,duration,genre,copies,rating_press,first_day,budget,classification_acteurs,distributor
0,2023,1,etatsunis,7200,comedie,665,4.0,359889,100000000,1,1


In [4]:
# Separate the regression target (first_week) from the explanatory columns.
y = df["first_week"]
X = df.drop(columns="first_week")

# 85 % of the rows go to training; the remainder is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, shuffle=True, random_state=42
)

# Column groups are derived from dtypes: float/int columns vs. object (string) columns.
num_col = X.select_dtypes(include=[float, int]).columns.tolist()
cat_col = X.select_dtypes(include=[object]).columns.tolist()

# Numeric columns: robust scaling without centering.
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

# Categorical columns: dense one-hot encoding (categories unseen at fit time are
# ignored at transform time), followed by the same un-centered robust scaling.
onehotscale_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    RobustScaler(with_centering=False),
)

preprocessing = ColumnTransformer(
    transformers=[
        ('categorical', onehotscale_pipeline, cat_col),
        ('numerical', scale_pipeline, num_col),
    ]
)

# Degree-2 polynomial expansion applied to the preprocessed matrix.
polyscale_pipeline = make_pipeline(PolynomialFeatures(2))

# Complete feature-engineering chain, fitted on the training rows only.
my_final_pipeline = make_pipeline(preprocessing, polyscale_pipeline)
my_final_pipeline.fit(X_train)

feature_names = my_final_pipeline.get_feature_names_out(X.columns)


In [5]:
# Make scikit-learn transformers return pandas DataFrames so the engineered
# features keep their generated column names.
# NOTE: set_config changes a global scikit-learn setting, so every fit/transform
# executed after this cell is affected as well.
# (`set_config` is imported in the notebook's import cell.)
set_config(transform_output="pandas")

# Single-step pipeline around the feature-engineering chain; it is only used here
# to inspect the transformed training matrix. `model` is rebuilt with an estimator
# in a later cell.
model = make_pipeline(
    my_final_pipeline,
)
xdata = model.fit_transform(X_train, y_train)
xdata.head()

Unnamed: 0,1,categorical__country_allemagne,categorical__country_australie,categorical__country_belgique,categorical__country_bresil,categorical__country_canada,categorical__country_chine,categorical__country_danemark,categorical__country_espagne,categorical__country_etatsunis,...,numerical__distributor^2,numerical__distributor numerical__rating_press,numerical__distributor numerical__budget,numerical__distributor numerical__classification_acteurs,numerical__rating_press^2,numerical__rating_press numerical__budget,numerical__rating_press numerical__classification_acteurs,numerical__budget^2,numerical__budget numerical__classification_acteurs,numerical__classification_acteurs^2
1896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,9.0,12.333333,1.981506,3.0,16.901235,2.715397,4.111111,0.436263,0.660502,1.0
1077,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,10.666667,6.083223,3.0,12.641975,7.209746,3.555556,4.111734,2.027741,1.0
454,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,9.0,10.666667,0.792602,3.0,12.641975,0.939381,3.555556,0.069802,0.264201,1.0
652,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,9.0,9.066667,1.668428,3.0,9.133827,1.680787,3.022222,0.309295,0.556143,1.0
465,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,2.777778,0.257596,1.0,7.716049,0.715544,2.777778,0.066356,0.257596,1.0


In [6]:

# Full estimator: the feature-engineering chain followed by a Lasso regressor
# (alpha left at its default). make_pipeline names the final step "lasso", which
# is the prefix needed to route fit parameters to it.
# Hyper-parameter search is not run here; a GridSearchCV(cv=5) over `lasso__alpha`
# (candidate value 15) can be wrapped around `model` if tuning is needed.
lasso_regressor = Lasso(max_iter=100000, random_state=42)
model = make_pipeline(my_final_pipeline, lasso_regressor)

model.fit(X_train, y_train)

# Score of the fitted pipeline on the training data itself (not a hold-out score).
grid_score = model.score(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [7]:
# Two-pass fit: the residuals of the current fit are turned into sample weights,
# then the pipeline is refitted with those weights passed to the Lasso step.
y_pred_train = model.predict(X_train)
residus = y_pred_train - y_train

# weight = exp(-|residual| / std(residuals)): equals 1 for a perfectly fitted row
# and decays towards 0 as the absolute error grows.
weights = np.exp(-residus.abs() / residus.std())
model.fit(X_train, y_train, lasso__sample_weight=weights)

# Predictions on the held-out rows, used for the evaluation that follows.
# (model.predict(df_test) would instead score the hand-built example row.)
y_pred = model.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


In [8]:

# Hold-out metrics: compare the test-set predictions with the true targets.
mae = mean_absolute_error(y_test, y_pred)
mse_cleaned = mean_squared_error(y_test, y_pred)
rmse_cleaned = np.sqrt(mse_cleaned)
r2_cleaned = r2_score(y_test, y_pred)

# One print call, one report line per argument.
print(
    "Performance du modèle :",
    f"R2 Score: {r2_cleaned:.4f}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse_cleaned:.2f}",
    sep="\n",
)

Performance du modèle :
R2 Score: 0.8537
MAE: 125864.11
RMSE: 227149.65


In [16]:
# Persist the fitted pipeline (feature engineering + Lasso) so it can be reloaded
# later for inference.
# The earlier version of this cell read the raw bytes of the .ipynb file and
# pickled those, which left "model.pickle" without any usable estimator.
# (`pickle` is imported in the notebook's import cell.)
#
# NOTE: only unpickle files you trust (pickle.load can execute arbitrary code),
# and reload with a scikit-learn version compatible with the one used for training.
pickle_path = 'model.pickle'
with open(pickle_path, 'wb') as f:
    pickle.dump(model, f)

print("Modèle sauvegardé en tant qu'objet pickle avec succès.")


Notebook sauvegardé en tant qu'objet pickle avec succès.
