In [None]:
import pandas as pd
import numpy as np

<H1> Chargement des données 

In [None]:
# Read the semicolon-delimited source file into df1 and display it.
df1 = pd.read_csv("immo/TGV.csv", sep=";")
df1

<H1> création des variables catégorielles 

In [None]:
# Summary statistics of the Prixm2Moyen column.
df1.loc[:, "Prixm2Moyen"].describe()

In [None]:
# Build the modelling frame in one chained expression so that no column is
# ever assigned onto a slice of another frame (which triggers pandas'
# SettingWithCopyWarning) and nothing is mutated in place.
station = df1['TGV']
df1 = (
    df1.assign(
        **{
            # One 0/1 indicator per value of the 'TGV' column.
            'PasGare': (station == 'Pas de gare').astype(int),
            'GareNonTGV': (station == 'Gare voyageurs non TGV').astype(int),
            'Gare TGV': (station == 'Gare TGV').astype(int),
        }
    )[['NbMaisons', 'NbApparts', 'Prixm2Moyen', 'SurfaceMoy', 'PasGare', 'GareNonTGV', 'Gare TGV']]
    .dropna()
    .copy()
)

# Target classes from the price thresholds (upper bounds are inclusive):
#   0: x <= 1108, 1: 1108 < x <= 1452, 2: 1452 < x <= 1922, 3: x > 1922
PRICE_CLASS_EDGES = [1108, 1452, 1922]
df1['ClassePrix'] = np.digitize(df1['Prixm2Moyen'], PRICE_CLASS_EDGES, right=True)
df1

<H1> Traitement des données aberrantes </H1>

In [None]:

# Number of missing values per column, then the column dtypes, then a preview.
nan_count = df1.isna().sum()
print(nan_count)
print(df1.dtypes)
df1.head()

<H1> Distribution du prix au m² </H1>

In [None]:
# Summary statistics of Prixm2Moyen once rows with missing values have been dropped.
df1.loc[:, "Prixm2Moyen"].describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of Prixm2Moyen over [330, 6000] in 15 bins, with an x tick every 500.
plt.hist(
    df1['Prixm2Moyen'],
    bins=15,
    range=(330, 6000),
    color='lightgreen',
    edgecolor='black',
)
plt.xticks(range(500, 6000, 500))
plt.show()


<h1> Division en train et test 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



# Features: every column except the target and the raw price the target is derived from.
X = df1.drop(["ClassePrix", "Prixm2Moyen"], axis="columns")
# Target as a plain NumPy array.
y = df1["ClassePrix"].to_numpy()
# Hold out 33 % of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# Display the feature matrix X (df1 without the ClassePrix and Prixm2Moyen columns).
X

<H1> ML 

<H3> Poids et Kfolds

In [None]:
from sklearn.model_selection import KFold
from sklearn.utils import class_weight

# 5-fold splitter, without shuffling.
kf = KFold(n_splits=5, shuffle=False, random_state=None)

# 'balanced' class weights computed on the training labels only.
classes_train = np.unique(y_train)
poids = class_weight.compute_class_weight(class_weight='balanced', classes=classes_train, y=y_train)

# Map each class to its weight. The keys must be the very classes the weights
# were computed for (those of y_train, not of the full y); otherwise a class
# absent from the training split would shift or overflow the index.
weight_dict = dict(zip(classes_train, poids))
weight_dict


<h3> LogisticRegression Multinomial

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from sklearn.calibration import calibration_curve

# Multinomial logistic regression with the class weights computed above.
# random_state is fixed because the 'saga' solver shuffles the data, so results
# would otherwise change from run to run.
model = LogisticRegression(multi_class='multinomial', solver='saga', class_weight=weight_dict, random_state=1)

# Training
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
report = classification_report(y_test, y_pred)
print(report)

# One-vs-rest ROC curve per class. Column i of predict_proba corresponds to
# model.classes_[i], so the positive label is taken from classes_ rather than
# assumed to be equal to the column index.
y_pred_prob = model.predict_proba(X_test)
n_classes = len(model.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i, class_label in enumerate(model.classes_):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_prob[:, i], pos_label=class_label)
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure()
for i, class_label in enumerate(model.classes_):
    plt.plot(fpr[i], tpr[i], label='Classe %d (AUC = %0.2f)' % (class_label, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC par classe')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
labels = model.classes_
plt.figure(figsize=(10, 8))
sns.heatmap(confusion, annot=True, cmap="Blues", fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel('Prédictions')
plt.ylabel('Vraies valeurs')
plt.title('Matrice de confusion')
plt.show()


<H3> Descente de gradient stochastique

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from sklearn.calibration import calibration_curve

# Logistic regression fitted by stochastic gradient descent.
# The logistic loss is named 'log_loss'; the former alias 'log' was deprecated
# and then removed from scikit-learn. random_state is fixed because SGD
# shuffles the training data.
model = SGDClassifier(loss='log_loss', max_iter=1000, class_weight=weight_dict, random_state=1)

# Training
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
report = classification_report(y_test, y_pred)
print(report)

# One-vs-rest ROC curve per class. Column i of predict_proba corresponds to
# model.classes_[i], so the positive label is taken from classes_ rather than
# assumed to be equal to the column index.
y_pred_prob = model.predict_proba(X_test)
n_classes = len(model.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i, class_label in enumerate(model.classes_):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_prob[:, i], pos_label=class_label)
    roc_auc[i] = auc(fpr[i], tpr[i])


plt.figure()
for i, class_label in enumerate(model.classes_):
    plt.plot(fpr[i], tpr[i], label='Classe %d (AUC = %0.2f)' % (class_label, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC par classe')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
labels = model.classes_
plt.figure(figsize=(10, 8))
sns.heatmap(confusion, annot=True, cmap="Blues", fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel('Prédictions')
plt.ylabel('Vraies valeurs')
plt.title('Matrice de confusion')
plt.show()


<H3> XGBoost

In [None]:
!pip install xgboost

In [None]:
# Cast the target to int first, then show the class distribution. The counts
# have to be the last expression of the cell, otherwise they are never displayed.
df1 = df1.astype({'ClassePrix': int})
df1['ClassePrix'].value_counts()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
import xgboost as xgb

# Model initialisation.
# 'multi:softprob' is used so that predict_proba yields per-class probabilities
# for the ROC curves; predict() still returns the most probable class.
model = xgb.XGBClassifier(objective='multi:softprob', num_class=len(set(y_train)))

# XGBClassifier has no `class_weights` argument (an unknown keyword does not
# weight anything), so the class weights are applied as one weight per
# training row instead.
train_sample_weight = np.array([weight_dict[label] for label in y_train])

# Training
model.fit(X_train, y_train, sample_weight=train_sample_weight)

# Predictions
y_pred = model.predict(X_test)

# Metrics
report = classification_report(y_test, y_pred)
print(report)

# One-vs-rest ROC curve per class. Column i of predict_proba corresponds to
# model.classes_[i], so the positive label is taken from classes_ rather than
# assumed to be equal to the column index.
y_pred_prob = model.predict_proba(X_test)
n_classes = len(model.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i, class_label in enumerate(model.classes_):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_prob[:, i], pos_label=class_label)
    roc_auc[i] = auc(fpr[i], tpr[i])


plt.figure()
for i, class_label in enumerate(model.classes_):
    plt.plot(fpr[i], tpr[i], label='Classe %d (AUC = %0.2f)' % (class_label, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC par classe')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
labels = model.classes_
plt.figure(figsize=(10, 8))
sns.heatmap(confusion, annot=True, cmap="Blues", fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel('Prédictions')
plt.ylabel('Vraies valeurs')
plt.title('Matrice de confusion')
plt.show()

# Save the fitted model
with open('modele_xgboost.pkl', 'wb') as file:
    pickle.dump(model, file)


<H4> charger le modèle sauvegardé 

In [None]:
# Reload the pickled model and predict on the test set again.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# was written by this notebook.
with open('modele_xgboost.pkl', mode='rb') as fh:
    model = pickle.load(fh)

y_pred = model.predict(X_test)


<H3> Optimized XGBoost (RandomSearch) 

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
import xgboost as xgb

# Hyperparameter search space.
# `scale_pos_weight` is deliberately not part of it: it is a single scalar meant
# for binary problems and cannot take a per-class dict. The class weights are
# passed as per-row sample weights to fit() below instead.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0.1, 0.2],
}

# Base XGBoost model. 'multi:softprob' is used so that predict_proba yields
# per-class probabilities for the ROC curves.
model = xgb.XGBClassifier(objective='multi:softprob', num_class=len(set(y_train)))

# One weight per training row, looked up from the class weights.
train_sample_weight = np.array([weight_dict[label] for label in y_train])

# Randomised search: 10 sampled configurations, 3-fold cross-validation.
# sample_weight is forwarded to the estimator's fit for every fold.
random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=3, random_state=1)
random_search.fit(X_train, y_train, sample_weight=train_sample_weight)

# Show the best hyperparameters found
print("Meilleurs hyperparamètres:")
print(random_search.best_params_)

# Predict with the best estimator
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Usual metrics (precision, recall, F1, ...)
report = classification_report(y_test, y_pred)
print(report)

# One-vs-rest ROC curve and AUC per class. Column i of predict_proba
# corresponds to best_model.classes_[i], so the positive label is taken from
# classes_ rather than assumed to be equal to the column index.
y_pred_prob = best_model.predict_proba(X_test)
n_classes = len(best_model.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i, class_label in enumerate(best_model.classes_):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_prob[:, i], pos_label=class_label)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot the ROC curves
plt.figure()
for i, class_label in enumerate(best_model.classes_):
    plt.plot(fpr[i], tpr[i], label='Classe %d (AUC = %0.2f)' % (class_label, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC par classe')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix, drawn with seaborn
confusion = confusion_matrix(y_test, y_pred)
labels = best_model.classes_
plt.figure(figsize=(10, 8))
sns.heatmap(confusion, annot=True, cmap="Blues", fmt="d", xticklabels=labels, yticklabels=labels)
plt.xlabel('Prédictions')
plt.ylabel('Vraies valeurs')
plt.title('Matrice de confusion')
plt.show()


In [None]:
import pickle

# Persist the tuned estimator. The path is the one already used for the
# baseline XGBoost model, so that file is overwritten.
with open('modele_xgboost.pkl', mode='wb') as fh:
    pickle.dump(best_model, fh)

In [None]:
### Best parameters, for Thomas
# Recorded output of the randomised search:
# Meilleurs hyperparamètres:
# {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.2}

### READ ME ###

### Slight improvement of the per-class AUCs, but not of the other metrics

#### RANDOMIZED SEARCH RESULTS ###
# NOTE(review): this report lists five classes (0-4), whereas ClassePrix as
# built in this notebook only takes the values 0-3 - presumably recorded with
# an earlier definition of the classes; confirm before relying on these numbers.

# Meilleurs hyperparamètres:
#{'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.2}
#              precision    recall  f1-score   support

#           0       0.38      0.11      0.18      3615
#           1       0.49      0.28      0.35     15256
#           2       0.58      0.84      0.69     35712
#           3       0.56      0.36      0.44     15366
#           4       0.79      0.63      0.70      7783

#    accuracy                           0.58     77732
#   macro avg       0.56      0.45      0.47     77732
#weighted avg       0.57      0.58      0.55     77732