In [1]:
import os
from google.colab import drive

# Mount Google Drive
drive.mount("/content/drive")

%cd "/content/drive/MyDrive/CS5780/Credit-Risk-Scoring"

# Verify the current working directory
print("Current working directory:", os.getcwd())

# List the contents of the folder
folder_contents = os.listdir()
print(folder_contents)


Mounted at /content/drive
/content/drive/MyDrive/CS5780/Credit-Risk-Scoring
Current working directory: /content/drive/MyDrive/CS5780/Credit-Risk-Scoring
['main.ipynb', 'LICENSE', '.gitignore', 'README.md', 'submission.csv', 'data']


In [2]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [3]:
data = pd.read_csv('data/data_devsample.csv')
df = pd.DataFrame(data)

# Define features and target variable
X = df.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y = df['TARGET']

# Load scoring data
df_test = pd.read_csv('data/data_to_score.csv')
X_new = df_test.drop(['SK_ID_CURR'], axis=1)

# Preprocessing: Numerical and Categorical pipeline
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [4]:
X.shape

(80000, 195)

In [5]:
df.isnull().sum()

SK_ID_CURR                          0
TARGET                              0
NAME_CONTRACT_TYPE                  0
CODE_GENDER                         0
FLAG_OWN_CAR                        0
                                ...  
LAST_TRANSACTION_TIME_MONTHS    30353
TIME                                0
BASE                                0
DAY                                 0
MONTH                               0
Length: 197, dtype: int64

In [6]:
# Replace inf/-inf with NaN in numeric columns
for col in numerical_features:
    X[col] = X[col].replace([np.inf, -np.inf], np.nan)

# Optional: Clip numeric columns to a maximum value, limit the values of each column to a specified range.
max_value_threshold = 1e6  # Example threshold
for col in numerical_features:
    X[col] = X[col].clip(upper=max_value_threshold)

# Preprocessing: Numerical and Categorical pipeline
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

In [7]:
# Create an XGBoost classifier pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])
model

In [8]:
# # Create a logistic regression pipeline
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', LogisticRegression())])
# model

In [9]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Define hyperparameter grid for XGBoost
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5],
    # 'classifier__min_child_weight': [1, 3, 5],
    # 'classifier__gamma': [0, 0.1, 0.2],
    # 'classifier__subsample': [0.8, 1.0],
    # 'classifier__colsample_bytree': [0.8, 1.0],
    # 'classifier__reg_alpha': [0, 0.1, 0.5],
    # 'classifier__reg_lambda': [1, 1.5, 2],
    # Add other hyperparameters you want to tune
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Perform hyperparameter tuning with GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_roc_auc = best_model.score(X_test, y_test)
print("Test ROC-AUC:", test_roc_auc)

Best Hyperparameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Test ROC-AUC: 0.9180625


In [13]:
# # Train the model
# model.fit(X_train, y_train)

# Evaluate the model using Gini coefficient (2 * AUC - 1)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
gini_coefficient = 2 * roc_auc - 1
print(f'ROC: {roc_auc}')
print(f'Gini Coefficient: {gini_coefficient}')

ROC: 0.7407381505831856
Gini Coefficient: 0.4814763011663712


In [14]:
# Perform cross-validation
cv_scores = cross_val_score(best_model, X_test, y_test, cv=5, scoring='roc_auc')  # Use appropriate scoring metric (ROC-AUC in this case)
print("Cross-validated ROC-AUC scores:", cv_scores)
print("Mean ROC-AUC:", np.mean(cv_scores))

Cross-validated ROC-AUC scores: [0.71453918 0.73169647 0.71251462 0.72304471 0.70731894]
Mean ROC-AUC: 0.7178227839675192


### Predict on testing data

In [15]:
# Replace inf/-inf with NaN in numeric columns
for col in numerical_features:
    X_new[col] = X_new[col].replace([np.inf, -np.inf], np.nan)

# Optional: Clip numeric columns to a maximum value
max_value_threshold = 1e6  # Example threshold
for col in numerical_features:
    X_new[col] = X_new[col].clip(upper=max_value_threshold)

In [16]:
X_new.shape

(100000, 195)

In [17]:
# Generate predictions and save to CSV
predictions = best_model.predict_proba(X_new)[:, 1]
output = pd.DataFrame({'SK_ID_CURR': df_test['SK_ID_CURR'], 'SCORE': predictions})
output.to_csv('submission.csv', index=False)