In [2]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl (819 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Downloading torchmetrics-1.6.1-py3-none-any.whl (927 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.9 pytorch-lightning-2.5.0.post0 torchmetrics-1.6.1


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import scipy.sparse
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
import pytorch_lightning as pl

In [5]:
# Set global parameters
sns.set(style='darkgrid')
pd.set_option('display.precision', 2)
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings('ignore')

In [6]:
# Read csv file and print our 5 rows
df = pd.read_csv('/content/drive/MyDrive/DATA1/credit_score_cleaned_train.csv')
df.loc[0:2].style

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,credit_history_age,total_emi_per_month,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,payment_of_min_amount,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,89,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan']",3,7,11.27,4,Good,809.98,26.82262,No,80.4153,High_spent_Small_value_payments,312.49408,2
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,93,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan']",-1,7,11.27,4,Good,809.98,31.94496,No,118.28022,Low_spent_Large_value_payments,284.62915,2
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,91,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan']",3,7,11.27,4,Good,809.98,28.609352,No,81.699524,Low_spent_Medium_value_payments,331.20987,2


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96696 entries, 0 to 96695
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        96696 non-null  object 
 1   customer_id               96696 non-null  object 
 2   month                     96696 non-null  object 
 3   name                      96696 non-null  object 
 4   age                       96696 non-null  int64  
 5   ssn                       96696 non-null  object 
 6   occupation                96696 non-null  object 
 7   annual_income             96696 non-null  float64
 8   monthly_inhand_salary     96696 non-null  float64
 9   credit_history_age        96696 non-null  int64  
 10  total_emi_per_month       96696 non-null  float64
 11  num_bank_accounts         96696 non-null  int64  
 12  num_credit_card           96696 non-null  int64  
 13  interest_rate             96696 non-null  int64  
 14  num_of

In [8]:
unique_classes_list = df['credit_score'].unique()
print(f'The unique classes in the target column are: {unique_classes_list}')

The unique classes in the target column are: [2 1 0]


In [9]:
# Check for NaN values in each column
nan_columns = df.isna().sum()

# Filter columns with NaN values
nan_columns = nan_columns[nan_columns > 0]

print(f"Columns with NaN values:\n{nan_columns}")
print(f"Number of columns with NaN values: {nan_columns.count()}")

Columns with NaN values:
Series([], dtype: int64)
Number of columns with NaN values: 0


In [10]:
# Global variables
SEED = 777
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e7e99502bf0>

In [11]:
# Create features and target
X = df.drop(['credit_score'], axis=1)
y = df['credit_score']

In [12]:
# Define numerical and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object, 'category']).columns.tolist()

In [13]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = preprocessor.fit_transform(X)

In [14]:
# Upscaling data
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

In [16]:
# Balancing labels using SMOTE
smote = SMOTE(random_state=SEED)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=SEED)

In [18]:
# Save the split data to disk
scipy.sparse.save_npz('/content/drive/MyDrive/DATA1/X_train.npz', X_train)
scipy.sparse.save_npz('/content/drive/MyDrive/DATA1/X_test.npz', X_test)
pd.DataFrame(y_train).to_csv('/content/drive/MyDrive/DATA1/y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('/content/drive/MyDrive/DATA1/y_test.csv', index=False)

In [19]:
# Convert to PyTorch tensors
class CreditScoreDataset(Dataset):
    def __init__(self, X_path, y_path):
        self.X = scipy.sparse.load_npz(X_path)
        self.y = pd.read_csv(y_path).values.flatten()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        X = torch.tensor(self.X[idx].toarray(), dtype=torch.float32).squeeze()
        y = torch.tensor(self.y[idx], dtype=torch.long)
        return X, y

In [20]:
# Create datasets and dataloaders
train_dataset = CreditScoreDataset(
    '/content/drive/MyDrive/DATA1/X_train.npz',
    '/content/drive/MyDrive/DATA1/y_train.csv'
)
test_dataset = CreditScoreDataset(
    '/content/drive/MyDrive/DATA1/X_test.npz',
    '/content/drive/MyDrive/DATA1/y_test.csv'
)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [21]:
class CreditScoreModel(pl.LightningModule):
    def __init__(self, input_dim, output_dim):
        super(CreditScoreModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x.to_dense())))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=0.001, weight_decay=1e-5)
        scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss'
            }
        }

In [25]:
# Initialize the model
input_dim = 131635 # number of encoded cols
output_dim = len(unique_classes_list)

model = CreditScoreModel(input_dim, output_dim)

In [26]:
# Initialize PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=5)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [27]:
# Train the model
trainer.fit(model, train_loader, test_loader)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | fc1       | Linear           | 33.7 M | train
1 | bn1       | BatchNorm1d      | 512    | train
2 | fc2       | Linear           | 32.9 K | train
3 | bn2       | BatchNorm1d      | 256    | train
4 | fc3       | Linear           | 8.3 K  | train
5 | bn3       | BatchNorm1d      | 128    | train
6 | fc4       | Linear           | 195    | train
7 | relu      | ReLU             | 0      | train
8 | dropout   | Dropout          | 0      | train
9 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
33.7 M    Trainable params
0         Non-trainable params
33.7 M    Total params
134.964   Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [28]:
# Evaluation loop
model.eval()  # Set the model to evaluation mode
all_labels = []
all_predictions = []

with torch.no_grad():  # Disable gradient calculation
    for inputs, labels in test_loader:
        outputs = model(inputs)  # Forward pass
        _, predicted = torch.max(outputs.data, 1)  # Get the predicted class
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

    # Calculate F1 score
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    print(f'F1 Score of the model on the test data: {f1}')

F1 Score of the model on the test data: 0.9959857662667928
