In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, make_scorer, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from matplotlib import pyplot
import seaborn as sns

## Dataset avec suppression de lignes

In [4]:
# Load the cleaned dataset (variant built with row deletion, per the section
# title) and separate the 'MIS_Status' target column from the feature columns.
df = pd.read_csv("dataset_cleaned_0.9886.csv")
X, y = df.drop(columns=['MIS_Status']), df['MIS_Status']

In [None]:
# Display the (rows, columns) dimensions of the DataFrame loaded above.
df.shape

In [5]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [9]:
# Baseline XGBoost classifier with hand-picked hyper-parameters.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    booster='gbtree',
    max_depth=13,
    n_estimators=150,
    learning_rate=0.35,
    alpha=8,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Hard class predictions on the held-out split; reused by later cells.
y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

# Last expression of the cell: mean accuracy on the held-out split.
xgb_model.score(X_test, y_test)

[[ 8981   809]
 [  591 26128]]


0.96165329096935

In [42]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# ROC analysis needs a continuous score per sample. Feeding the hard class
# labels from `predict` collapses the curve to a single operating point and
# yields an "AUC" that is not the real area under the ROC curve, so use the
# predicted probability of the positive class (column 1) instead.
y_score = xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)

# Area under the ROC curve of `xgb_model` on the held-out split.
roc_auc1 = auc(fpr, tpr)

In [None]:
# Manual hyper-parameter sweep: fit one XGBoost model per candidate and keep
# the configuration with the best ROC AUC on the held-out split.
# The sweep currently has a single candidate; widen `candidates`
# (e.g. np.linspace(0, 1, 21)) and use the loop variable inside `params`
# to explore a range.
max_score = 0
best_params = {}
best_f1_macro = None  # F1-macro of the best candidate (None until one is kept)

# Named `candidates` rather than `list` so the builtin `list` is not shadowed
# for the rest of the kernel session.
candidates = range(0, 1)
for candidate in candidates:
    params = {
        'max_depth': 11,
        'alpha': 6,
        'learning_rate': 0.35,
        'lambda': 8,
        'n_estimators': 150,
    }

    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(X_train, y_train)

    # Probability of the positive class (column 1), used as the ROC score.
    y_prob = xgb_clf.predict_proba(X_test)[:, 1]
    roc_auc11 = roc_auc_score(y_test, y_prob)
    f1_macro = f1_score(y_test, xgb_clf.predict(X_test), average='macro')

    if max_score < roc_auc11:
        best_params = params
        max_score = roc_auc11
        # Remember the F1 that belongs to the best candidate, so the report
        # below is not silently taken from the last iteration.
        best_f1_macro = f1_macro

print(best_params)
print(max_score)
print(f'F1-macro: {best_f1_macro}')

In [None]:
# The legend reports `roc_auc11`, the AUC of `xgb_clf` computed from `y_prob`,
# so the curve must be drawn from those same probabilities. The global
# `fpr`/`tpr` belong to a different model (`xgb_model`) and would not match
# the displayed AUC.
fpr_clf, tpr_clf, _ = roc_curve(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr_clf, tpr_clf, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc11:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de faux positifs (FPR)')
plt.ylabel('Taux de vrais positifs (TPR)')
plt.title('Courbe ROC')
plt.legend(loc='lower right')
plt.show()

## Dataset sans suppression de lignes

In [31]:
# Load the second cleaned dataset (variant built without row deletion, per the
# section title). This rebinds `df`, `X` and `y` for every cell below.
df = pd.read_csv("dataset_cleaned_0.9832.csv")
X, y = df.drop(columns=['MIS_Status']), df['MIS_Status']

In [None]:
# Display the (rows, columns) dimensions of the DataFrame loaded above.
df.shape

In [32]:
# Same 90/10 split and seed as used for the first dataset, applied to the
# newly loaded X / y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)


In [33]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

fpr, tpr, _ = roc_curve(y_test, y_pred)

# Calcul de l'AUC
roc_auc1 = auc(fpr, tpr)


In [None]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, booster='gbtree', max_depth=13, alpha=8,learning_rate=0.35,n_estimators=150)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
xgb_model.score(X_test,y_test)

In [None]:
# ROC curve built from the `fpr` / `tpr` arrays, with the `roc_auc1` value
# shown in the legend.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc1:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Taux de faux positifs (FPR)')
ax.set_ylabel('Taux de vrais positifs (TPR)')
ax.set_title('Courbe ROC')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Manual hyper-parameter sweep: fit one XGBoost model per candidate and keep
# the configuration with the best ROC AUC on the held-out split.
# The sweep currently has a single candidate; widen `candidates`
# (e.g. np.linspace(0, 1, 21)) and use the loop variable inside `params`
# to explore a range.
max_score = 0
best_params = {}
best_f1_macro = None  # F1-macro of the best candidate (None until one is kept)

# Named `candidates` rather than `list` so the builtin `list` is not shadowed
# for the rest of the kernel session.
candidates = range(0, 1)
for candidate in candidates:
    params = {
        'max_depth': 11,
        'alpha': 6,
        'learning_rate': 0.35,
        'lambda': 8,
        'n_estimators': 150,
    }

    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(X_train, y_train)

    # Probabilities for the positive class (column 1), used as the ROC score.
    y_prob = xgb_clf.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)
    f1_macro = f1_score(y_test, xgb_clf.predict(X_test), average='macro')

    if max_score < roc_auc:
        best_params = params
        max_score = roc_auc
        # Remember the F1 that belongs to the best candidate, so the report
        # below is not silently taken from the last iteration.
        best_f1_macro = f1_macro

print(best_params)
print(max_score)
print(f'F1-macro: {best_f1_macro}')

In [None]:
# The legend reports `roc_auc`, the AUC of `xgb_clf` computed from `y_prob`,
# so the curve must be drawn from those same probabilities. The global
# `fpr`/`tpr` belong to a different model (`xgb_model`) and would not match
# the displayed AUC.
fpr_clf, tpr_clf, _ = roc_curve(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr_clf, tpr_clf, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de faux positifs (FPR)')
plt.ylabel('Taux de vrais positifs (TPR)')
plt.title('Courbe ROC')
plt.legend(loc='lower right')
plt.show()

## Feature Importance

In [None]:
# Bar chart of `xgb_model`'s feature importances, one bar per feature,
# positioned by feature index (no feature names on the axis).
model_importances = xgb_model.feature_importances_
bar_positions = range(len(model_importances))
pyplot.bar(bar_positions, model_importances)
pyplot.show()

In [None]:
# Print each training column next to its importance in `xgb_clf`,
# formatted to four decimals.
importances = xgb_clf.feature_importances_
feature_names = X_train.columns
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair every training column with its importance in `xgb_clf` and order the
# table from most to least important.
features = X_train.columns
importances = xgb_clf.feature_importances_
feat_imp_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart, one bar per feature; the tall figure leaves room for
# the feature labels on the y axis.
plt.figure(figsize=(20, 16))
sns.barplot(data=feat_imp_df, x='Importance', y='Feature', palette='viridis')
plt.title("Importance des features")
plt.show()


## SHAP

In [6]:
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# NOTE(review): `xid` is not referenced by any visible cell — presumably the
# index of a sample to explain with SHAP; confirm or remove.
xid=4549

In [None]:
# No visible cell defines `explainer` or `shap_values`, so this section fails
# with a NameError on a fresh kernel; build them here before first use.
# NOTE(review): explaining `xgb_clf` is an assumption (it is the model used by
# the detailed feature-importance cells) — swap in `xgb_model` if that is the
# model meant to be explained.
explainer = shap.TreeExplainer(xgb_clf)
# Calling the explainer returns one explanation per row of X_test, so
# `shap_values[i]` lines up with `X_test.iloc[i]`.
shap_values = explainer(X_test)

# Load the JS assets required to render interactive force plots.
shap.initjs()
# Force plot for the first row of the held-out split.
shap.force_plot(explainer.expected_value, shap_values[0].values, X_test.iloc[0])

In [None]:
# Force plot for the second row (position 1) of the held-out split.
# Relies on `explainer` and `shap_values` already existing in the kernel.
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[1].values,
    features=X_test.iloc[1],
)

In [None]:
# Force plot for the third row (position 2) of the held-out split.
# Relies on `explainer` and `shap_values` already existing in the kernel.
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[2].values,
    features=X_test.iloc[2],
)

In [None]:
# Summary plot of the SHAP values over the whole held-out split.
# Relies on `shap_values` already existing in the kernel.
shap.summary_plot(shap_values, features=X_test)

## Régression logistique

In [None]:
# Logistic-regression baseline on the same 90/10 split as the XGBoost models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# The 'saga' solver shuffles the data, so pin `random_state` to keep results
# reproducible, as every other stochastic step in this notebook does.
# NOTE(review): 'saga' converges reliably only on similarly scaled features;
# if a ConvergenceWarning appears, scale X and/or raise `max_iter`.
model = LogisticRegression(solver='saga', random_state=42)
model.fit(X_train, y_train)

# Hard class predictions on the held-out split (rebinds `y_pred`).
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [20]:
# ROC AUC of the logistic-regression model, scored from the predicted
# probabilities of the positive class (column 1) on the held-out split.
y_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)
print(f'AUC-ROC: {roc_auc}')


In [21]:
# Metrics for multi-metric model selection (e.g. GridSearchCV `scoring=`).
# AUC uses the built-in 'roc_auc' scorer: a bare `make_scorer(roc_auc_score)`
# would score the hard labels returned by `predict` instead of continuous
# scores, which is not the real ROC AUC.
scoring = {'F1_macro': make_scorer(f1_score, average='macro'),
           'AUC': 'roc_auc'}
