## IMPORTING NECESSARY LIBRARIES

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Train/validation splitting
from sklearn.model_selection import train_test_split

# Classification models
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from imblearn.over_sampling import SMOTE

# Preprocessing, hyperparameter tuning and cross-validation
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, ShuffleSplit

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

## LOADING DATA

In [2]:
# Load the preprocessed training frame plus the raw train/test application
# tables. All three paths are relative, so they resolve against the kernel's
# current working directory.
try:
    df = pd.read_csv('../data/preprocessed/preprocessed_loan_data.csv')
    df_test = pd.read_csv('../data/application_test.csv')
    df_main = pd.read_csv('../data/application_train.csv')
except FileNotFoundError as err:
    # Fail here instead of printing and continuing: every later cell needs
    # these frames, and swallowing the error only defers the failure to a
    # confusing NameError further down.
    raise FileNotFoundError(
        f"File not found: {err}. Check that the notebook is run from its own "
        "directory so the '../data' paths resolve."
    ) from err

In [3]:
# Label distribution in the raw application_train table.
df_main.loc[:, 'TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

## PREPROCESSING / SPLITTING DATA

In [4]:
# Separate the label column from the feature columns.
y = df['TARGET']
X = df.drop(columns='TARGET')

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# NOTE(review): X_test refers to df_test exactly as it was loaded. Rebinding
# the name df_test afterwards does not change what X_test points to -- confirm
# this is the intended content for X_test.
X_test = df_test

In [5]:
# Columns kept from the test table, in the order they should appear.
test_feature_columns = [
    # contract and applicant basics
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    # amounts
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    # applicant profile
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE',
    # day offsets
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
    # contact flags
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL',
    # occupation, household, region
    'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY',
    # application timing
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
    # address mismatch flags
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
    # employer and external scores
    'ORGANIZATION_TYPE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    # building information
    'YEARS_BEGINEXPLUATATION_AVG', 'FLOORSMAX_AVG',
    'YEARS_BEGINEXPLUATATION_MODE', 'FLOORSMAX_MODE',
    'YEARS_BEGINEXPLUATATION_MEDI', 'FLOORSMAX_MEDI',
    'TOTALAREA_MODE', 'EMERGENCYSTATE_MODE',
    # social circle and phone change
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'DAYS_LAST_PHONE_CHANGE',
]
# FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
test_feature_columns += [f'FLAG_DOCUMENT_{doc_no}' for doc_no in range(2, 22)]
# Credit bureau enquiry counts per look-back window
test_feature_columns += [
    f'AMT_REQ_CREDIT_BUREAU_{window}'
    for window in ('HOUR', 'DAY', 'WEEK', 'MON', 'QRT', 'YEAR')
]

# Narrow the test table to exactly those columns (KeyError if any is missing).
df_test = df_test.loc[:, test_feature_columns]

In [6]:
# Partition the training features by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numeric.
categorical_cols = X_train.select_dtypes(include='object').columns
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

In [7]:
# Integer-encode each categorical column with an encoder fitted on the
# training split only. The fitted encoders are kept in `label_encoders`,
# keyed by column name. X_train and X_valid are modified in place, so this
# cell is meant to run once per fresh split.

# Code assigned to validation labels that never occurred in the training split.
UNSEEN_CATEGORY_CODE = -1

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()

    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # LabelEncoder.transform raises ValueError on a label it was not fitted
    # on, which makes the cell depend on how the random split distributes rare
    # categories. Looking codes up in the fitted classes gives the same codes
    # for known labels and the sentinel for unknown ones.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_valid[col] = (
        X_valid[col].astype(str)
        .map(code_by_label)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype('int64')
    )

    label_encoders[col] = le

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_valid[numeric_cols] = scaler.transform(X_valid[numeric_cols])

## MODEL TRAINING

In [9]:
# Baseline classifiers to compare, keyed by display name. Every model gets
# the same seed (42); insertion order is the order they will be trained in.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=1000)
models['Random Forest'] = RandomForestClassifier(random_state=42, n_estimators=100)
models['LightGBM'] = lgb.LGBMClassifier(random_state=42)
models['CatBoost'] = CatBoostClassifier(random_state=42, verbose=0)

In [10]:
# Fit every baseline on the training split and report its per-class metrics
# on the validation split.
for name in models:
    model = models[name]
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    # Hard class predictions for the validation rows
    y_pred = model.predict(X_valid)

    print(f"Classification Report for {name}:")
    print(classification_report(y_true=y_valid, y_pred=y_pred))


Training Logistic Regression...
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.51      0.01      0.02      4965

    accuracy                           0.92     61503
   macro avg       0.71      0.50      0.49     61503
weighted avg       0.89      0.92      0.88     61503


Training Random Forest...
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.71      0.00      0.00      4965

    accuracy                           0.92     61503
   macro avg       0.82      0.50      0.48     61503
weighted avg       0.90      0.92      0.88     61503


Training LightGBM...
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028723 seconds.


In [11]:
# Label distribution in the preprocessed frame used for modelling.
df.loc[:, 'TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

In [12]:
# Class-weighted CatBoost model, early-stopped on validation F1.
#
# Early stopping is configured in exactly one place, the fit() call. The
# constructor used to pass early_stopping_rounds=100 as well, which
# contradicted the fit() value; the recorded log ("50 iterations wait")
# shows the fit() value is the one that takes effect.
model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.001,
    depth=6,
    eval_metric='F1',
    class_weights=[1, 10],  # presumably [weight for class 0, weight for class 1] -- confirm
    verbose=100,
    random_seed=42
)

# NOTE(review): the recorded run stopped with bestIteration = 11 and was shrunk
# to 12 trees. With learning_rate=0.001 that is very little fitting; consider a
# larger learning rate and/or a more patient stopping rule.
# NOTE(review): X_valid is used both to decide when to stop and for the report
# below, so the reported scores may be optimistic.
model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)

y_pred = model.predict(X_valid)

print(classification_report(y_valid, y_pred))

0:	learn: 0.6098355	test: 0.6095797	best: 0.6095797 (0)	total: 34.8ms	remaining: 1m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6223918468
bestIteration = 11

Shrink model to first 12 iterations.
              precision    recall  f1-score   support

           0       0.95      0.72      0.82     56538
           1       0.16      0.60      0.25      4965

    accuracy                           0.71     61503
   macro avg       0.55      0.66      0.53     61503
weighted avg       0.89      0.71      0.77     61503



In [13]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate CatBoost settings. All entries are lists, so the search draws
# n_iter distinct combinations out of the 2 * 2 * 3 * 3 * 3 = 108 possible ones.
params = {
    'depth': [6, 8],
    'learning_rate': [0.01, 0.05],
    'iterations': [300, 500, 1000],
    'l2_leaf_reg': [3, 5, 7],
    'class_weights': [[1, 10], [1, 20], [1, 30]]
}

cbc = CatBoostClassifier(
    eval_metric='F1',
    verbose=0,
    random_seed=42
)

# random_state pins which 20 combinations are sampled; without it every run
# evaluates a different subset and the selected model is not reproducible.
clf = RandomizedSearchCV(
    cbc,
    param_distributions=params,
    cv=3,
    scoring='f1',
    n_iter=20,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

In [14]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights for the training labels, shown as a
# {class label: weight} mapping.
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
print({label: weight for label, weight in zip(classes, weights)})


{np.int64(0): np.float64(0.5439092983356032), np.int64(1): np.float64(6.193554884189325)}


In [15]:
# Score the fitted search object on the validation split.
y_pred = clf.predict(X_valid)

print(classification_report(y_true=y_valid, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.95      0.80      0.87     56538
           1       0.20      0.55      0.29      4965

    accuracy                           0.78     61503
   macro avg       0.57      0.68      0.58     61503
weighted avg       0.89      0.78      0.82     61503

