<a href="https://colab.research.google.com/github/Emperor228/Loan_Status_Prediction/blob/main/Loan_Status_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Import data and Analysis

In [2]:
X_full = pd.read_csv('/content/loan_statut.csv')

In [3]:
# Pre-processing
X_full.head(4)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y


In [4]:
X_full.shape

(614, 13)

In [5]:
X_full['Dependents'].value_counts()


Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [6]:
X_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:

  # Dropping nan values in target columns and splitting target and label data
X_full.dropna(axis=0, subset=["Loan_Status"], inplace=True)
y = X_full['Loan_Status']
# Convertir la colonne cible en valeurs numériques
y = y.map({'N': 0, 'Y': 1})

X_full.drop(['Loan_Status', 'Loan_ID'], axis=1, inplace=True)


  # Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

X_train_full = X_train_full.replace(to_replace='3+', value=4)
X_valid_full = X_valid_full.replace(to_replace='3+', value=4)

# Select categorical columns with relatively low cardinality
categorical_cols = [col for col in X_train_full.columns
                      if X_train_full[col].nunique() < 10 and
                      X_train_full[col].dtype == "object"]

                      # Select numerical columns
numerical_cols = [col for col in X_train_full.columns
                    if X_train_full[col].dtype in ["int64", 'float64']]

          # Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()



In [8]:
X_train.select_dtypes(include=["object"]).value_counts()

Gender  Married  Dependents  Education     Self_Employed  Property_Area
Male    Yes      0           Graduate      No             Semiurban        30
                                                          Rural            27
                                                          Urban            24
        No       0           Graduate      No             Semiurban        18
                                                          Urban            18
                                                                           ..
                 1           Not Graduate  Yes            Urban             1
        Yes      4           Graduate      Yes            Rural             1
                             Not Graduate  No             Urban             1
                                           Yes            Rural             1
                 2           Not Graduate  Yes            Semiurban         1
Name: count, Length: 94, dtype: int64

Pre-Processing & Define Pipeline

In [9]:
# Convertir toutes les colonnes catégorielles en chaînes de caractères
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_valid[categorical_cols] = X_valid[categorical_cols].astype(str)


# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical data
categorical_tranformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# print(X_train.head(10))

# Bundle preprocessing for bth numerical & categorical data
preprocessor = ColumnTransformer(
    transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_tranformer, categorical_cols)
])

# Définir les modèles à tester
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(random_state=42)
}

# Définir les paramètres de la grille de recherche pour chaque modèle
param_grids = {
    'RandomForest': {'model__n_estimators': [100, 200, 300]},
    'GradientBoosting': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.05, 0.1, 0.2]},
    'SVM': {'model__C': [0.1, 1, 10], 'model__gamma': ['scale', 'auto']},
    'KNN': {'model__n_neighbors': [3, 5, 7]},
    'XGBoost': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.05, 0.1, 0.2]}
}

# Définir la stratégie de validation croisée
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Entraîner et évaluer chaque modèle
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    param_grid = param_grids[name]

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_valid)
    test_accuracy = accuracy_score(y_valid, y_pred)

    results[name] = {
        'best_params': grid_search.best_params_,
        'test_accuracy': test_accuracy
    }

# Afficher les résultats
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"Test Accuracy: {result['test_accuracy']}")
    print()

# # Define Model
# # model = RandomForestClassifier(n_estimators=100, random_state=0)
# model = XGBClassifier(n_estimators=100, learning_rate=0.05, random_state=0)


# # Bundle preprocessing and modeling code in a pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# # Effectuer la validation croisée
# cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

# # Afficher les scores de validation croisée et leur moyenne
# print(f'Cross-validation scores: {cv_scores}')
# print(f'Average cross-validation score: {cv_scores.mean()}')

# # Ajuster le modèle sur les données d'entraînement
# clf.fit(X_train, y_train)

# # Prétraiter les données de validation et obtenir des prédictions
# preds = clf.predict(X_valid)

# # Afficher l'exactitude des prédictions
# print('Validation Accuracy:', accuracy_score(y_valid, preds))

Model: RandomForest
Best Parameters: {'model__n_estimators': 100}
Test Accuracy: 0.7723577235772358

Model: GradientBoosting
Best Parameters: {'model__learning_rate': 0.05, 'model__n_estimators': 100}
Test Accuracy: 0.8130081300813008

Model: SVM
Best Parameters: {'model__C': 0.1, 'model__gamma': 'scale'}
Test Accuracy: 0.7317073170731707

Model: KNN
Best Parameters: {'model__n_neighbors': 7}
Test Accuracy: 0.6422764227642277

Model: XGBoost
Best Parameters: {'model__learning_rate': 0.05, 'model__n_estimators': 100}
Test Accuracy: 0.8130081300813008

