In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [13]:
tr_path = '../Boosting/dataset/give-me-some-credit/cs-training.csv'
ts_path = '../Boosting/dataset/give-me-some-credit/cs-test.csv'

In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [11]:
print(df.isnull().sum())

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [17]:
def clean_credit_data(train_path, test_path, target_col="SeriousDlqin2yrs"):
    # Load train and test
    train = pd.read_csv(train_path).drop(columns=["Unnamed: 0"], errors="ignore")
    test = pd.read_csv(test_path).drop(columns=["Unnamed: 0"], errors="ignore")

    # Numeric columns
    numeric_cols = train.select_dtypes(include="number").columns.drop(target_col)

    # Fill missing values
    train[numeric_cols] = train[numeric_cols].fillna(train[numeric_cols].median())
    test[numeric_cols] = test[numeric_cols].fillna(train[numeric_cols].median())

    # Handle outliers
    for col in numeric_cols:
        Q1 = train[col].quantile(0.25)
        Q3 = train[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        train[col] = train[col].clip(lower=lower, upper=upper)
        test[col] = test[col].clip(lower=lower, upper=upper)

    # Encode target in train
    le = LabelEncoder()
    train[target_col] = le.fit_transform(train[target_col])

    # Encode
    if target_col in test.columns and test[target_col].notna().any():
        test.loc[test[target_col].notna(), target_col] = le.transform(
            test.loc[test[target_col].notna(), target_col]
        )

    # Align test columns with train columns
    test = test[train.columns.intersection(test.columns)]

    return train, test


In [20]:
df, df_ts = clean_credit_data(tr_path, ts_path)

In [26]:
X = df.drop("SeriousDlqin2yrs", axis=1)
y = df["SeriousDlqin2yrs"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

X_test = df_ts.drop("SeriousDlqin2yrs", axis=1, errors="ignore")
y_test = df_ts["SeriousDlqin2yrs"] if "SeriousDlqin2yrs" in df_ts.columns else None

In [27]:
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)
model.fit(X_train, y_train)

In [50]:
def explain_model(model, X_train, X_test, y_train, y_test, features=None):
    # Train evaluation
    print("Train Results:")
    y_pred_tr = model.predict(X_train)
    print(classification_report(y_train, y_pred_tr))
    print(confusion_matrix(y_train, y_pred_tr))

    # Test evaluation only if labels exist
    if y_test is not None and y_test.notna().any():
        print("Test Results:")
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
    else:
        print("Test set has no labels — skipping test evaluation")
        # You can still predict
        y_pred = model.predict(X_test)


In [51]:
explain_model(model,  X_train, X_test, y_train, y_test)

Train Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97    111930
           1       0.57      0.00      0.00      8070

    accuracy                           0.93    120000
   macro avg       0.75      0.50      0.48    120000
weighted avg       0.91      0.93      0.90    120000

[[111921      9]
 [  8058     12]]
Test set has no labels — skipping test evaluation
Predictions on test set:
[0 0 0 0 0 0 0 0 0 0]


In [40]:
y_pred_test = model.predict(X_test)

In [41]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [49]:
explain_model(model,  X_train, X_test, y_train, y_test)

Train Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97    111930
           1       0.57      0.00      0.00      8070

    accuracy                           0.93    120000
   macro avg       0.75      0.50      0.48    120000
weighted avg       0.91      0.93      0.90    120000

[[111921      9]
 [  8058     12]]
Test set has no labels — skipping test evaluation


In [57]:
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
print("Cross-validation accuracy:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation accuracy: [0.00370828 0.00370828 0.00246914 0.0037037  0.00370599]
Mean accuracy: 0.003459078903539551


In [65]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [66]:
model.fit(X_tr, y_tr)

In [67]:
y_val_pred = explain_model(model, X_tr, X_val, y_tr, y_val)

Train Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97     89584
           1       0.65      0.00      0.00      6416

    accuracy                           0.93     96000
   macro avg       0.79      0.50      0.48     96000
weighted avg       0.91      0.93      0.90     96000

[[89578     6]
 [ 6405    11]]
Test Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     22346
           1       0.50      0.00      0.00      1654

    accuracy                           0.93     24000
   macro avg       0.72      0.50      0.48     24000
weighted avg       0.90      0.93      0.90     24000

[[22345     1]
 [ 1653     1]]


In [68]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64
