<a href="https://colab.research.google.com/github/Domaakshithareddy/Loan-Prediction/blob/main/Improved_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.feature_selection import RFE

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

def preprocess(df, is_train=True, le=None, imputer=None):
    # Missing value handling
    for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])
    if 'Loan_Amount_Term' in df.columns :
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())
    if 'LoanAmount' in df.columns:
        df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

    if le is None:
        le = LabelEncoder()
        for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']:
            if col in df.columns:
                df[col] = le.fit_transform(df[col])
    else:
        for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']:
            if col in df.columns:
                unseen_labels = set(df[col]) - set(le.classes_)
                if unseen_labels:
                    df[col] = df[col].replace(list(unseen_labels), le.classes_[0])
                df[col] = le.transform(df[col])

    df['Dependents'] = df['Dependents'].replace('3+', 3)
    df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5)
        df[['Dependents']] = imputer.fit_transform(df[['Dependents']])
    else:
        df[['Dependents']] = imputer.transform(df[['Dependents']])
    if is_train:
        df['Loan_Status'] = le.fit_transform(df['Loan_Status'])
        return df, le, imputer
    else:
        return df, le, imputer

train_data, le, imputer = preprocess(train_data)
test_data, _, _ = preprocess(test_data, is_train=False, le=le, imputer=imputer)

train_data['TotalIncome'] = train_data['ApplicantIncome'] + train_data['CoapplicantIncome']
test_data['TotalIncome'] = test_data['ApplicantIncome'] + test_data['CoapplicantIncome']
train_data['LoanAmountPerTerm'] = train_data['LoanAmount'] / train_data['Loan_Amount_Term']
test_data['LoanAmountPerTerm'] = test_data['LoanAmount'] / test_data['Loan_Amount_Term']
train_data['LoanIncomeRatio'] = train_data['LoanAmount'] / train_data['TotalIncome']
test_data['LoanIncomeRatio'] = test_data['LoanAmount'] / test_data['TotalIncome']
train_data.replace([np.inf, -np.inf], 0, inplace=True)
test_data.replace([np.inf, -np.inf], 0, inplace=True)

features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
            'TotalIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
            'Property_Area', 'LoanAmountPerTerm', 'LoanIncomeRatio']
X = train_data[features]
y = train_data['Loan_Status']

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=False)
X_poly = poly.fit_transform(X)
X_test_poly = poly.transform(test_data[features])

# data augmentation
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_poly, y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test_poly)
rfe = RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=11)
rfe.fit(X_scaled, y_resampled)
X_selected = rfe.transform(X_scaled)
X_test_selected = rfe.transform(X_test_scaled)

# XGBoost model
xgb_classifier = XGBClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_selected, y_resampled)
best_xgb = grid_search.best_estimator_

X_train_val, X_val, y_train_val, y_val = train_test_split(X_selected, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)
y_val_pred = best_xgb.predict(X_val)

# Evaluation
print("Confusion Matrix (Validation Set):")
print(confusion_matrix(y_val, y_val_pred))
print(f"Accuracy of Random Forest classifier on validation set: {accuracy_score(y_val, y_val_pred):.2f}")
print("Classification Report (Validation Set):")
print(classification_report(y_val, y_val_pred))

# Feature Importance
poly_feature_names = poly.get_feature_names_out(features)
selected_poly_features = poly_feature_names[rfe.support_]
selected_original_features = []

for poly_feature in selected_poly_features:
    for original_feature in features:
        if original_feature in poly_feature:
            selected_original_features.append(original_feature)
            break

feature_importance = pd.DataFrame({'Feature': selected_original_features, 'Importance': best_xgb.feature_importances_})
print("Feature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Confusion Matrix (Validation Set):
[[24  0]
 [ 0 24]]
Accuracy of Random Forest classifier on validation set: 1.00
Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48

Feature Importance:
             Feature  Importance
2     Credit_History    0.450302
7          Education    0.139735
3    LoanIncomeRatio    0.106077
10  Loan_Amount_Term    0.054519
1        TotalIncome    0.050775
9        TotalIncome    0.047895
6          Education    0.041556
5             Gender    0.034873
4             Gender    0.034568
8      Self_Employed    0.028050
0          Education    0.011649


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.feature_selection import RFE

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

def preprocess(df, is_train=True, le=None, imputer=None):
    # Missing value handling
    for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])
    if 'Loan_Amount_Term' in df.columns :
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())
    if 'LoanAmount' in df.columns:
        df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

    if le is None:
        le = LabelEncoder()
        for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']:
            if col in df.columns:
                df[col] = le.fit_transform(df[col])
    else:
        for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']:
            if col in df.columns:
                unseen_labels = set(df[col]) - set(le.classes_)
                if unseen_labels:
                    df[col] = df[col].replace(list(unseen_labels), le.classes_[0])
                df[col] = le.transform(df[col])

    df['Dependents'] = df['Dependents'].replace('3+', 3)
    df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5)
        df[['Dependents']] = imputer.fit_transform(df[['Dependents']])
    else:
        df[['Dependents']] = imputer.transform(df[['Dependents']])
    if is_train:
        df['Loan_Status'] = le.fit_transform(df['Loan_Status'])
        return df, le, imputer
    else:
        return df, le, imputer

train_data, le, imputer = preprocess(train_data)
test_data, _, _ = preprocess(test_data, is_train=False, le=le, imputer=imputer)

train_data['TotalIncome'] = train_data['ApplicantIncome'] + train_data['CoapplicantIncome']
test_data['TotalIncome'] = test_data['ApplicantIncome'] + test_data['CoapplicantIncome']
train_data['LoanAmountPerTerm'] = train_data['LoanAmount'] / train_data['Loan_Amount_Term']
test_data['LoanAmountPerTerm'] = test_data['LoanAmount'] / test_data['Loan_Amount_Term']
train_data['LoanIncomeRatio'] = train_data['LoanAmount'] / train_data['TotalIncome']
test_data['LoanIncomeRatio'] = test_data['LoanAmount'] / test_data['TotalIncome']
train_data.replace([np.inf, -np.inf], 0, inplace=True)
test_data.replace([np.inf, -np.inf], 0, inplace=True)

features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
            'TotalIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
            'Property_Area', 'LoanAmountPerTerm', 'LoanIncomeRatio']
X = train_data[features]
y = train_data['Loan_Status']

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=False)
X_poly = poly.fit_transform(X)
X_test_poly = poly.transform(test_data[features])

# data augmentation
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_poly, y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test_poly)
rfe = RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=20)
X_selected = rfe.fit_transform(X_scaled, y_resampled)
X_test_selected = rfe.transform(X_test_scaled)

# XGBoost model
xgb_classifier = XGBClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_selected, y_resampled)
best_xgb = grid_search.best_estimator_

X_train_val, X_val, y_train_val, y_val = train_test_split(X_selected, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)
y_val_pred = best_xgb.predict(X_val)

# Evaluation
print("Confusion Matrix (Validation Set):")
print(confusion_matrix(y_val, y_val_pred))
print(f"Accuracy of Random Forest classifier on validation set: {accuracy_score(y_val, y_val_pred):.2f}")
print("Classification Report (Validation Set):")
print(classification_report(y_val, y_val_pred))

# Feature Importance
poly_feature_names = poly.get_feature_names_out(features)
selected_features = poly_feature_names[rfe.support_]
feature_importance = pd.DataFrame({'Feature': selected_features, 'Importance': best_xgb.feature_importances_})
print("Feature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Confusion Matrix (Validation Set):
[[24  0]
 [ 0 24]]
Accuracy of Random Forest classifier on validation set: 1.00
Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48

Feature Importance:
                              Feature  Importance
4                      Credit_History    0.411534
5                     LoanIncomeRatio    0.083190
18   Loan_Amount_Term LoanIncomeRatio    0.083167
11        Education LoanAmountPerTerm    0.057960
2                         TotalIncome    0.037391
10               Education LoanAmount    0.036519
14         TotalIncome Credit_History    0.035663
1                           Educatio