<a href="https://colab.research.google.com/github/Faseeh56/Credit_Risk_Analysis/blob/main/credit_risk_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries
import pandas as pd
import numpy as np

# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/credit_risk_dataset.csv'
df = pd.read_csv(file_path)

# Display basic info
print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [2]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Drop rows or fill missing values as needed (example: drop if very few)
df = df.dropna()

# Encode categorical variables (Label Encoding / One-Hot Encoding)
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Check cleaned data
df.head()


Missing values per column:
 person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,3,123.0,4,3,35000,16.02,1,0.59,1,3
1,21,9600,2,5.0,1,1,1000,11.14,0,0.1,0,2
2,25,9600,0,1.0,3,2,5500,12.87,1,0.57,0,3
3,23,65500,3,4.0,3,2,35000,15.23,1,0.53,0,2
4,24,54400,3,8.0,3,2,35000,14.27,1,0.55,1,4


In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separate features & target
X = df.drop('loan_status', axis=1)  # Assuming 'loan_status' is the target column
y = df['loan_status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check new class distribution
from collections import Counter
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_resampled))


Before SMOTE: Counter({0: 17992, 1: 4918})
After SMOTE: Counter({0: 17992, 1: 17992})


In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize models
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train models
rf_model.fit(X_train_resampled, y_train_resampled)
gb_model.fit(X_train_resampled, y_train_resampled)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on test data
rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)

# Evaluate models
print("Random Forest:")
print(classification_report(y_test, rf_preds))
print("\nGradient Boosting:")
print(classification_report(y_test, gb_preds))
print("\nXGBoost:")
print(classification_report(y_test, xgb_preds))


Parameters: { "use_label_encoder" } are not used.



Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4443
           1       0.79      0.76      0.78      1285

    accuracy                           0.90      5728
   macro avg       0.86      0.85      0.86      5728
weighted avg       0.90      0.90      0.90      5728


Gradient Boosting:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      4443
           1       0.74      0.74      0.74      1285

    accuracy                           0.88      5728
   macro avg       0.83      0.83      0.83      5728
weighted avg       0.88      0.88      0.88      5728


XGBoost:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      4443
           1       0.92      0.75      0.83      1285

    accuracy                           0.93      5728
   macro avg       0.93      0.87      0.89      5728
weighted avg       0.93      

In [5]:
# Comparing F1-Scores
rf_f1 = f1_score(y_test, rf_preds)
gb_f1 = f1_score(y_test, gb_preds)
xgb_f1 = f1_score(y_test, xgb_preds)

print(f"Random Forest F1-Score: {rf_f1:.4f}")
print(f"Gradient Boosting F1-Score: {gb_f1:.4f}")
print(f"XGBoost F1-Score: {xgb_f1:.4f}")

# Best Model
best_model_name = max([('Random Forest', rf_f1), ('Gradient Boosting', gb_f1), ('XGBoost', xgb_f1)], key=lambda x: x[1])[0]
print(f"\n✅ Best Performing Model: {best_model_name}")


Random Forest F1-Score: 0.7753
Gradient Boosting F1-Score: 0.7404
XGBoost F1-Score: 0.8266

✅ Best Performing Model: XGBoost
