In [1]:
# STEP 1: Install required libraries
!pip install -q pandas scikit-learn

# STEP 2: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# STEP 3: Upload your dataset
from google.colab import files
uploaded = files.upload()

# STEP 4: Load the CSV file
df = pd.read_csv(next(iter(uploaded)))

# STEP 5: Map loan_status to binary classes
good_loans = ['Fully Paid', 'Current']
df['loan_condition'] = df['loan_status'].apply(lambda x: 1 if x in good_loans else 0)

# STEP 6: Drop irrelevant columns
drop_cols = ['id', 'member_id', 'issue_date', 'last_credit_pull_date',
             'last_payment_date', 'next_payment_date', 'loan_status', 'emp_title']
df = df.drop(columns=drop_cols)

# STEP 7: Define X and y
X = df.drop(columns=['loan_condition'])
y = df['loan_condition']

# STEP 8: Preprocessing
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# STEP 9: Preprocess data
X_processed = preprocessor.fit_transform(X)

# STEP 10: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# STEP 11: Train Random Forest with class_weight
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# STEP 12: Train Logistic Regression with class_weight
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# STEP 13: Evaluate both models
print("🔍 Random Forest Performance")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

print("\n🔍 Logistic Regression Performance")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


Saving financial_loan (3) (1) (1).csv to financial_loan (3) (1) (1) (1).csv
🔍 Random Forest Performance
              precision    recall  f1-score   support

           0       1.00      0.50      0.67      1064
           1       0.93      1.00      0.96      6652

    accuracy                           0.93      7716
   macro avg       0.96      0.75      0.82      7716
weighted avg       0.94      0.93      0.92      7716

Confusion Matrix:
 [[ 537  527]
 [   1 6651]]

🔍 Logistic Regression Performance
              precision    recall  f1-score   support

           0       0.71      0.86      0.78      1064
           1       0.98      0.94      0.96      6652

    accuracy                           0.93      7716
   macro avg       0.84      0.90      0.87      7716
weighted avg       0.94      0.93      0.93      7716

Confusion Matrix:
 [[ 912  152]
 [ 377 6275]]


In [2]:
# STEP 1: Install necessary libraries
!pip install -q pandas scikit-learn imbalanced-learn xgboost

# STEP 2: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# STEP 3: Upload dataset
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(next(iter(uploaded)))

# STEP 4: Create target column (1 = good, 0 = bad)
df['loan_condition'] = df['loan_status'].apply(lambda x: 1 if x in ['Fully Paid', 'Current'] else 0)

# STEP 5: Drop unnecessary columns
drop_cols = ['id', 'member_id', 'issue_date', 'last_credit_pull_date',
             'last_payment_date', 'next_payment_date', 'loan_status', 'emp_title']
df = df.drop(columns=drop_cols)

# STEP 6: Define features and labels
X = df.drop(columns=['loan_condition'])
y = df['loan_condition']

# STEP 7: Preprocessing
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

X_processed = preprocessor.fit_transform(X)

# ===========================
# ✅ MODEL 1: Logistic Regression + SMOTE
# ===========================

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_processed, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train logistic regression
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("📊 Logistic Regression with SMOTE")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# ===========================
# ✅ MODEL 2: Tuned XGBoost with class imbalance handling
# ===========================

# Split original dataset (not SMOTE)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Compute class imbalance ratio
ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Train XGBoost
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    scale_pos_weight=ratio,       # handle imbalance
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\n📊 Tuned XGBoost Model")
print(classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


Saving financial_loan (3) (1) (1).csv to financial_loan (3) (1) (1) (2).csv
📊 Logistic Regression with SMOTE
              precision    recall  f1-score   support

           0       0.95      0.90      0.92      6656
           1       0.90      0.95      0.92      6642

    accuracy                           0.92     13298
   macro avg       0.92      0.92      0.92     13298
weighted avg       0.92      0.92      0.92     13298

Confusion Matrix:
 [[5961  695]
 [ 341 6301]]


Parameters: { "use_label_encoder" } are not used.




📊 Tuned XGBoost Model
              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1064
           1       0.98      0.98      0.98      6652

    accuracy                           0.97      7716
   macro avg       0.92      0.93      0.93      7716
weighted avg       0.97      0.97      0.97      7716

Confusion Matrix:
 [[ 946  118]
 [ 148 6504]]
