In [6]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from cachetools import cached, TTLCache
from google.cloud import bigquery
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# BigQuery client bound to the 'cdiscountwagon' project.
client = bigquery.Client(project='cdiscountwagon')

# TTL cache for data-loading results: at most 100 entries, each kept for 600 seconds (10 minutes).
cache = TTLCache(maxsize=100, ttl=600)

@cached(cache)
def _load_data_cached():
    """Run the BigQuery query and memoize the resulting DataFrame in ``cache``."""
    query = """
    SELECT *
    FROM `cdiscountwagon.Datasets.Merges_Sales_avec_categ`
    """
    return client.query(query).to_dataframe()


def load_data():
    """Return the full `Merges_Sales_avec_categ` table as a DataFrame.

    The query result is memoized in ``cache`` (TTL-based), so repeated calls
    within the TTL do not hit BigQuery again.

    A copy of the cached frame is returned on every call: later cells add
    columns to and impute values in the returned frame, and handing out the
    cached object itself would let those mutations leak into subsequent
    "fresh" loads. The copy costs one extra in-memory instance of the table.

    Returns:
        pandas.DataFrame: an independent copy of the query result.
    """
    return _load_data_cached().copy()

# Load the data into the working frame used by the following cells.
df = load_data()

In [3]:
# Parse the order date and derive calendar features from it.
df['order_date'] = pd.to_datetime(df['order_date'])
df['day_of_week_number'] = df['order_date'].dt.dayofweek  # Monday=0 ... Sunday=6
df['week_in_month'] = (df['order_date'].dt.day - 1) // 7 + 1  # days 1-7 -> 1, 8-14 -> 2, ...
df['avg_price'] = df['avg_price'].round(2)

# Impute missing values by assigning the filled column back to the frame.
# `df[col].fillna(..., inplace=True)` is a chained in-place call: it raises a
# FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
avg_prix_mean = df['indice_avg_price'].mean()
df['indice_avg_price'] = df['indice_avg_price'].fillna(avg_prix_mean)
df['impression_gs'] = df['impression_gs'].fillna(0)
df['acquisition_cost_gs'] = df['acquisition_cost_gs'].fillna(0)


In [4]:
# Rows with no operation attached (operation_name is missing).
df_no_operation = df.loc[df['operation_name'].isna()]

# Rows attached to an operation (operation_name is present).
df_on_operation_1 = df.loc[df['operation_name'].notna()]

In [5]:
# Disabled filter, kept for reference: restricts the operation rows to the 'BAGAGES' category.
#df_on_operation_1 =  df_on_operation_1.loc[df_on_operation_1['Category_1'] == 'BAGAGES']
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,product_id_upper,product_id,order_date,on_operation,on_google_shopping,type,avg_price,indice_avg_price,nb_new_customers,total_customers,...,impression_gs,acquisition_cost_gs,tracking_day_front,on_front,click_on_front,category,product_id_1,Category_1,day_of_week_number,week_in_month
0,603344402,603344402,2024-03-13,0,0,1P,181.91,97.072236,0,0,...,0.0,0.0,NaT,0,,bricolage,603344402,BRICOLAGE,2,2
1,603344402,603344402,2024-03-14,0,0,1P,181.91,97.072236,0,0,...,0.0,0.0,NaT,0,,bricolage,603344402,BRICOLAGE,3,2
2,603344402,603344402,2024-03-15,0,0,1P,181.91,97.072236,0,0,...,0.0,0.0,NaT,0,,bricolage,603344402,BRICOLAGE,4,3
3,603344402,603344402,2024-03-16,0,0,1P,99.9,97.072236,0,0,...,0.0,0.0,NaT,0,,bricolage,603344402,BRICOLAGE,5,3
4,603344402,603344402,2024-03-17,0,0,1P,99.9,97.072236,0,0,...,0.0,0.0,NaT,0,,bricolage,603344402,BRICOLAGE,6,3


In [7]:
# Target (y): number of new customers, taken from the rows without an operation.
y = df_no_operation["nb_new_customers"]

# Features (X): the same rows, minus identifiers, dates, the target itself and
# the other columns listed below.
X = df_no_operation.drop(
    columns=[
        "on_operation",
        "product_id",
        "type",
        "order_date",
        "nb_new_customers",
        "total_customers",
        "operation_name",
        "startdate_op",
        "enddate_op",
        "display_date_gs",
        "tracking_day_front",
        "click_on_front",
        "category",
        "on_google_shopping",
        "acquisition_cost_gs",
        "product_id_1",
        "product_id_upper",
    ]
)

# Column groups by dtype, captured before encoding.
cat_columns = X.select_dtypes(exclude=np.number).columns
numerical_columns = X.select_dtypes(include=np.number).columns

# One-hot encode every non-numeric column.
X = pd.get_dummies(X, columns=cat_columns)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [8]:
# Scaler instance; nothing in this cell fits or applies it.
scaler = StandardScaler()

# Scaling of the numeric columns is currently disabled (lines kept for reference).

#X_train[numerical_columns]= scaler.fit_transform(X_train[numerical_columns])
#X_test[numerical_columns]= scaler.transform(X_test[numerical_columns])


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Create a random forest regressor (100 trees, fixed seed) and fit it on the training split.
rf_model_hors_op = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_hors_op.fit(X_train, y_train)

In [12]:
# Predict on the held-out split and report RMSE and R^2.
y_pred_rf = rf_model_hors_op.predict(X_test)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'RMSE (Random Forest): {np.sqrt(mean_squared_error(y_test, y_pred_rf))}')
print(f'R^2 (Random Forest): {r2_score(y_test, y_pred_rf)}')

RMSE (Random Forest): 1.6453969686558243
R^2 (Random Forest): 0.3191461662481808


In [10]:

from sklearn.linear_model import LinearRegression 
lin_reg_hors_op = LinearRegression()

# Fit the model on the training split.
lin_reg_hors_op.fit(X_train, y_train)

# Evaluate on the held-out split. The predictions are computed once and reused:
# for a regressor, `score(X, y)` is r2_score(y, predict(X)), so the printed
# value is the same as before without predicting a second and third time.
y_pred = lin_reg_hors_op.predict(X_test)
print(r2_score(y_test, y_pred))

0.07670331753483162


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# Seeded so that refitting yields the same model (and the same pickled artifact),
# consistent with the seeded random forest above.
model_GBC_Hors_OP = GradientBoostingClassifier(random_state=42)

# Train the model: each distinct value of the target is treated as a class.
model_GBC_Hors_OP.fit(X_train, y_train)

# Predict on the held-out split.
predictions = model_GBC_Hors_OP.predict(X_test)

# Evaluate the model.
# NOTE(review): the target is a count (nb_new_customers); accuracy may be
# dominated by the most frequent value — confirm the class balance before
# comparing this figure with the regressors' R^2.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9637766584503269


In [14]:
# Single hand-written observation used to try the fitted models.
# NOTE(review): the keys presumably mirror the one-hot encoded training columns
# (names and order) — confirm against X_train.columns whenever the category list changes.
data = {
    'avg_price': [199], 
    'indice_avg_price': [183], 
    'impression_gs': [87778], 
    'on_front': [0], 
    'day_of_week_number': [4], 
    'week_in_month': [4], 
    'Category_1_ANIMALERIE': [0], 
    'Category_1_AU-QUOTIDIEN': [0], 
    'Category_1_AUTO': [0], 
    'Category_1_BAGAGES': [0], 
    'Category_1_BRICOLAGE': [0], 
    'Category_1_CORNER': [0], 
    'Category_1_ELECTROMENAGER': [0], 
    'Category_1_HIGH-TECH': [0], 
    'Category_1_INFORMATIQUE': [0], 
    'Category_1_JARDIN': [0], 
    'Category_1_JEUX-PC-VIDEO-CONSOLE': [0], 
    'Category_1_JUNIORS': [0], 
    'Category_1_MAISON': [0], 
    'Category_1_MEUBLE': [0], 
    'Category_1_PRÊT-À-PORTER': [1], 
    'Category_1_PUÉRICULTURE': [0], 
    'Category_1_SPORT': [0], 
    'Category_1_TELEPHONIE': [0], 
    'Category_1_TV SON': [0], 
    'Category_1_VIN-CHAMPAGNE': [0]
}

# Build a one-row DataFrame from the dictionary.
# NOTE(review): this rebinds `df`, the name the earlier cells use for the full
# dataset; re-running those cells after this one requires reloading the data first.
df = pd.DataFrame(data)

display(df)

Unnamed: 0,avg_price,indice_avg_price,impression_gs,on_front,day_of_week_number,week_in_month,Category_1_ANIMALERIE,Category_1_AU-QUOTIDIEN,Category_1_AUTO,Category_1_BAGAGES,...,Category_1_JEUX-PC-VIDEO-CONSOLE,Category_1_JUNIORS,Category_1_MAISON,Category_1_MEUBLE,Category_1_PRÊT-À-PORTER,Category_1_PUÉRICULTURE,Category_1_SPORT,Category_1_TELEPHONIE,Category_1_TV SON,Category_1_VIN-CHAMPAGNE
0,199,183,87778,0,4,4,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [15]:
# Print each fitted model's prediction for the sample row, in the order
# random forest, linear regression, gradient boosting classifier.
for fitted_model in (rf_model_hors_op, lin_reg_hors_op, model_GBC_Hors_OP):
    print(fitted_model.predict(df))

[4.16666667]
[2.536749]
[0.]


In [16]:
# Directory that receives the pickled models. It defaults to the location that
# was previously hard-coded; set the CDISCOUNT_MODEL_DIR environment variable to
# write elsewhere without editing the notebook.
MODEL_DIR = Path(os.environ.get(
    'CDISCOUNT_MODEL_DIR',
    '/Users/enayargrh/Library/Mobile Documents/com~apple~CloudDocs/labo_coding/01_projects/07_cdiscount_streamlit_ml/data',
))

# Output file name -> fitted estimator to serialize.
fitted_models = {
    'fitted_model_rf_hors_op.pickle': rf_model_hors_op,
    'fitted_model_lr_hors_op.pickle': lin_reg_hors_op,
    'fitted_model_gbc_hors_op.pickle': model_GBC_Hors_OP,
}

# Write each model to its own pickle file (binary mode, overwriting any existing file).
for file_name, fitted_model in fitted_models.items():
    with open(MODEL_DIR / file_name, 'wb') as modelFile:
        pickle.dump(fitted_model, modelFile)