# Etude Frequence corporelle 


# Table des matières :

* [1 - Initialisation de l'étude](#initialisation)
* [2 - Retraitements des données](#retraitements) 
* [3 - Modèle de Machine Learning](#machineLearning)
* [4 - Pricing](#pricing)


# 1 - Initialisation de l'étude : <a class="anchor" id="initialisation"></a>

In [81]:
# Librairies :
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR


In [63]:
# Import des données :
df =  pd.read_csv("training_clean.csv")
df.head()

Unnamed: 0,PolNum,CalYear,Gender,Type,Category,Occupation,Age,Group1,Bonus,Poldur,Value,Adind,SubGroup2,Group2,Density,Exppdays,Numtppd,Numtpbi,Indtppd,Indtpbi
0,200114978,2009,Male,C,Large,Employed,25,18,85,3,15080.0,0,L46,L,72.012883,365.0,1,0,0.0,0.0
1,200114994,2009,Male,E,Large,Employed,20,11,30,2,22370.0,1,O38,O,39.550411,365.0,1,0,0.0,0.0
2,200115001,2009,Female,E,Large,Unemployed,42,11,85,0,39650.0,0,Q28,Q,169.529148,365.0,2,0,0.0,0.0
3,200115011,2009,Female,C,Medium,Housewife,21,5,0,0,12600.0,1,L6,L,58.894688,365.0,1,0,0.0,0.0
4,200115015,2009,Female,D,Large,Employed,33,12,30,10,9065.0,0,N4,N,109.631885,365.0,2,0,0.0,0.0


# 2 - Retraitement des données : <a class="anchor" id="retraitements"></a>

In [67]:
# Suppression des données erronées :
df2 = df.iloc[21:]

In [69]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_columns.remove('Numtpbi')  

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)])




In [71]:
# Define the regression model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Split the data into features and target variable
X = df.drop('Numtpbi', axis=1)
y = df['Numtpbi']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

(0.03127150249950512, 0.34092641401123736)

In [72]:

# Define the regression model pipeline with a random forest regressor
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])


model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mse_rf, r2_rf


(0.0019639277855571114, 0.9586085469263723)

In [87]:
param_grid_rf = {
    'regressor__n_estimators': [10, 50, 100],  # Nombre d'arbres
    'regressor__max_depth': [None, 10, 20],  # Profondeur maximale des arbres
    'regressor__min_samples_split': [2, 4],  # Nombre minimal d'échantillons requis pour diviser un nœud interne
    'regressor__min_samples_leaf': [1, 2]  # Nombre minimal d'échantillons requis pour être au niveau d'un nœud feuille
}

# Configuration et exécution de la recherche sur grille
grid_search_rf = GridSearchCV(model_rf, param_grid_rf, cv=3, scoring='r2', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

best_model_rf = grid_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)
# Affichage des meilleurs paramètres et du meilleur score R²
print("Meilleurs paramètres trouvés pour la forêt aléatoire:", grid_search_rf.best_params_)
print("Meilleur score R² pour la forêt aléatoire:", grid_search_rf.best_score_)

Meilleurs paramètres trouvés pour la forêt aléatoire: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Meilleur score R² pour la forêt aléatoire: 0.9625312561295352


In [None]:
model_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))])
param_grid_xgb = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.3],
    'regressor__max_depth': [3, 6, 9],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.7, 0.8, 0.9],
    'regressor__colsample_bytree': [0.7, 0.8, 0.9],
}

grid_search_xgb = GridSearchCV(model_xgb, param_grid_xgb, cv=3, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_model_x= grid_search_rf.best_estimator_
y_pred_rf = best_model_x.predict(X_test)
print("Meilleurs paramètres trouvés :", grid_search_x.best_params_)
print("Meilleur score R² :", grid_search_x.best_score_)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


In [None]:
model_svr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
]) 

# Définition de la grille d'hyperparamètres pour SVR
param_grid = {
    'regressor__C': [0.1, 1, 10],
    'regressor__gamma': ['scale', 'auto', 0.1, 1, 10],
    'regressor__kernel': ['linear', 'rbf', 'poly']
}

# Configuration et exécution de la recherche sur grille
grid_search = GridSearchCV(model_svr, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_rf = best_model.predict(X_test)
# Affichage des meilleurs paramètres et du meilleur score R²
print("Meilleurs paramètres trouvés :", grid_search.best_params_)
print("Meilleur score R² :", grid_search.best_score_)

In [None]:
dataProv = pd.DataFrame(X_test.index)
dataProv.columns = ['index']

# Ajout des valeurs observées de la variable cible
dataProv['Obs'] = y_test.reset_index(drop=True)

# Ajout des prédictions de la régression linéaire
dataProv['Pred_svm'] = y_pred  # Assurez-vous que y_pred contient les prédictions de la régression linéaire

# Ajout des prédictions de la forêt aléatoire
dataProv['Pred_rf'] = y_pred_rf  # Assurez-vous que y_pred_rf contient les prédictions de la forêt aléatoire

# Ajout des prédictions du modèle XGBoost
dataProv['Pred_xgb'] = y_pred_xgb  

In [None]:
dataFinale = df2.copy().reset_index().merge(dataProv)
dataFinale.head()

In [None]:
for var in dataFinale.columns[3:16]:
#     dataFinale.groupby([var])[['Obs', 'Pred_model', 'Pred_knn', 'Pred_tree', 'Pred_xgb']].mean().plot()
    dataPlot = dataFinale.groupby([var])[['Obs', 'Pred_svm','Pred_rf', 'Pred_xgb']].mean()
        
    if dataFinale[var].dtypes == 'O':
        dataPlot.plot(kind='bar')
    else:
        dataPlot.plot()

In [None]:
for var in dataFinale.columns[3:16]:
#     dataFinale.groupby([var])[['Obs', 'Pred_model', 'Pred_knn', 'Pred_tree', 'Pred_xgb']].mean().plot()
    dataPlot = dataFinale.groupby([var])[['Obs', 'Pred_svm','Pred_rf', 'Pred_xgb']].mean()
    dataPlot['Ecart_Pred_model'] = dataPlot['Pred_model'] / dataPlot['Obs'] - 1
    dataPlot['Ecart_Pred_xgb'] = dataPlot['Pred_xgb'] / dataPlot['Obs'] - 1
#     dataPlot['Ecart_Pred_xgb_grid'] = dataPlot['Pred_xgb_grid'] / dataPlot['Obs'] - 1
        
    if dataFinale[var].dtypes == 'O':
        dataPlot[['Ecart_Pred_model', 'Ecart_Pred_xgb']].plot(kind='bar')
    else:
        dataPlot[['Ecart_Pred_model', 'Ecart_Pred_xgb']].plot()

In [51]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

Z_test = Y_test.copy()
Z_test[Z_test > 1] = 1

fpr0, tpr0, thresholds0 = roc_curve(Z_test, p_model)
fpr3, tpr3, thresholds3 = roc_curve(Z_test, p_xgb)
fpr4, tpr4, thresholds4 = roc_curve(Z_test, p_xgb_grid)

auc0 = auc(fpr0, tpr0)
auc1 = auc(fpr1, tpr1)
auc2 = auc(fpr2, tpr2)
auc3 = auc(fpr3, tpr3)
auc4 = auc(fpr4, tpr4)


NameError: name 'p_knn' is not defined

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(4,4))
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr0, tpr0, label='p_model' + ' auc=%1.5f' % auc0)
ax.plot(fpr1, tpr1, label='p_knn' + ' auc=%1.5f' % auc1)
ax.plot(fpr2, tpr2, label='p_tree' + ' auc=%1.5f' % auc2)
ax.plot(fpr3, tpr3, label='p_xgb' + ' auc=%1.5f' % auc3)
ax.plot(fpr4, tpr4, label='p_xgb_grid' + ' auc=%1.5f' % auc4)
ax.set_title('Courbe ROC - classifieur couleur des vins')
ax.legend()