In [2]:
import pandas as pd

# Load training data
X_train = pd.read_csv("../data/train/X_train.csv")
y_train = pd.read_csv("../data/train/y_train.csv")

# Load test data
X_test = pd.read_csv("../data/test/X_test.csv")
y_test = pd.read_csv("../data/test/y_test.csv")

# Convert y_train and y_test to Series if needed
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# Combine X and y for train set
df_train = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes (assuming 1 = default, 0 = no default)
df_A = df_train[df_train["Default"] == 1]  # All default cases
df_B = df_train[df_train["Default"] == 0].sample(df_A.shape[0], random_state=23)  # Sample equal non-default cases

# Concatenate to create a balanced dataset
df_balanced = pd.concat([df_A, df_B], axis=0, ignore_index=True)

# Check new dataset shape
print("Balanced dataset shape:", df_balanced.shape)
print(df_balanced.head(2))


Balanced dataset shape: (47444, 23)
   Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
0   20   99464      248557          318              74               3   
1   32   20755      222550          404              75               1   

   InterestRate  LoanTerm  DTIRatio  Education  ...  MaritalStatus_Married  \
0         19.45        60      0.45          1  ...                  False   
1         15.17        24      0.85          0  ...                  False   

   MaritalStatus_Single  HasMortgage_Yes  HasDependents_Yes  \
0                 False            False              False   
1                 False            False              False   

   LoanPurpose_Business  LoanPurpose_Education  LoanPurpose_Home  \
0                 False                  False             False   
1                  True                  False             False   

   LoanPurpose_Other  HasCoSigner_Yes  Default  
0              False             True        1  
1         

In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report

# Load training data
X_train = pd.read_csv("../data/train/X_train.csv")
y_train = pd.read_csv("../data/train/y_train.csv")

# Load test data
X_test = pd.read_csv("../data/test/X_test.csv")
y_test = pd.read_csv("../data/test/y_test.csv")

# Convert y_train and y_test to Series if needed
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# Merge X_train and y_train to create a single DataFrame
df_train = pd.concat([X_train, y_train], axis=1)

# Separate classes
df_A = df_train[df_train["Default"] == 1]  # All default cases
df_B = df_train[df_train["Default"] == 0].sample(df_A.shape[0], random_state=23)  # Sample equal non-default cases

# Concatenate to create a balanced dataset
df_balanced = pd.concat([df_A, df_B], axis=0, ignore_index=True)

# **Ensure X_train and X_test have the same feature columns**
common_features = X_train.columns.intersection(X_test.columns)

# Select only these common features
X_train_balanced = df_balanced[common_features]
y_train_balanced = df_balanced["Default"]

X_test = X_test[common_features]  # Ensure test set has the same features

# **Scaling**
sc_exp = StandardScaler()
X_train_balanced = sc_exp.fit_transform(X_train_balanced)
X_test = sc_exp.transform(X_test)

# **PCA (Ensure consistent feature count)**
pca = PCA(n_components=min(X_train_balanced.shape[1], 22))  # Use the lower of the two
X_train_balanced = pca.fit_transform(X_train_balanced)
X_test = pca.transform(X_test)

# **XGBoost Classifier**
params = {
    'objective': 'binary:logistic',  
    'eval_metric': 'logloss',       
    'max_depth': 3,                    
    'learning_rate': 0.1,         
    'n_estimators': 100              
}
xgb_classifier = xgb.XGBClassifier(**params)

# **Cross-validation**
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_cv_scores = cross_val_score(xgb_classifier, X_train_balanced, y_train_balanced, cv=kf, scoring="accuracy")

# **Fitting the model**
xgb_classifier.fit(X_train_balanced, y_train_balanced)

# **Predictions**
xgb_prediction_exp = xgb_classifier.predict(X_test)

# **Evaluation Metrics**
print("Classification Report :")
print(classification_report(y_test, xgb_prediction_exp))


Classification Report :
              precision    recall  f1-score   support

           0       0.95      0.68      0.79     45139
           1       0.22      0.70      0.34      5931

    accuracy                           0.68     51070
   macro avg       0.58      0.69      0.57     51070
weighted avg       0.86      0.68      0.74     51070



In [5]:
# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_balanced, y_train_balanced)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy_logReg = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_logReg:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6728
              precision    recall  f1-score   support

           0       0.94      0.68      0.78     45139
           1       0.21      0.65      0.32      5931

    accuracy                           0.67     51070
   macro avg       0.57      0.66      0.55     51070
weighted avg       0.85      0.67      0.73     51070



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Train model with balanced class weights
clf = LogisticRegression(class_weight='balanced', random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

# Evaluate
y_pred = clf.predict(X_test)
# Evaluate model
accuracy_logRegopt = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_logRegopt:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.6728
              precision    recall  f1-score   support

           0       0.94      0.68      0.78     45139
           1       0.21      0.65      0.32      5931

    accuracy                           0.67     51070
   macro avg       0.57      0.66      0.55     51070
weighted avg       0.85      0.67      0.73     51070



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
