In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load data
train_data = pd.read_csv('training_loan_data.csv')
test_data = pd.read_csv('testing_loan_data.csv')

# Inspect data
print("Training Data Overview:")
print(train_data.info())
print(train_data.head())

# Data Dictionary (based on dict_data.json)
data_dict = {
    "annual_inc": "The annual income provided by the borrower during application.",
    "bc_util": "Ratio of total current balance to high credit/credit limit for all bankcard accounts.",
    "desc": "Loan description provided by the borrower",
    "dti": "A ratio calculated using the borrower’s total monthly debt payments divided by self-reported monthly income.",
    "emp_length": "Employment length in years.",
    "home_ownership": "The home ownership status provided by the borrower.",
    "id": "A unique assigned ID for the loan listing.",
    "inq_last_6mths": "The number of inquiries by creditors during the past 6 months.",
    "int_rate": "Interest Rate on the loan",
    "loan_amnt": "The listed amount of the loan applied for by the borrower.",
    "member_id": "A unique assigned Id for the borrower member.",
    "mths_since_last_major_derog": "Months since most recent 90-day or worse rating",
    "mths_since_recent_inq": "Months since most recent inquiry.",
    "percent_bc_gt_75": "Percentage of all bankcard accounts > 75% of limit.",
    "purpose": "A category provided by the borrower for the loan request.",
    "revol_util": "Revolving line utilization rate.",
    "term": "The number of payments on the loan.",
    "tot_cur_bal": "Total current balance of all accounts",
    "tot_hi_cred_lim": "Total high credit/credit limit",
    "total_bc_limit": "Total bankcard high credit/credit limit",
    "internal_score": "A third-party vendor's risk score",
    "bad_flag": "Target variable, indicates if the loan is bad or not"
}

# Data Cleaning and Preprocessing
def preprocess_data(df):
    df = df.copy()

    # Handle missing values
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Unknown')
        else:
            df[col] = df[col].fillna(df[col].median())

    # Convert percentages to numeric
    for col in ['int_rate', 'revol_util']:
        if col in df.columns:
            df[col] = df[col].str.replace('%', '').astype(float) / 100

    # Extract numeric values from strings
    if 'term' in df.columns:
        df['term'] = df['term'].str.extract(r'(\d+)').astype(float)

    if 'emp_length' in df.columns:
        df['emp_length'] = df['emp_length'].replace({'< 1 year': 0.5, '10+ years': 10}).str.extract(r'(\d+)').astype(float)

    return df

train_data_cleaned = preprocess_data(train_data)
test_data_cleaned = preprocess_data(test_data)

# Feature Selection
features = [
    'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
    'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
    'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
    'total_bc_limit', 'tot_hi_cred_lim', 'tot_cur_bal', 'internal_score'
]

X = train_data_cleaned[features]
y = train_data_cleaned['bad_flag']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing Pipeline
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# Neural Network Definition
class LoanPredictionNN(nn.Module):
    def __init__(self, input_dim):
        super(LoanPredictionNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Model Training
input_dim = X_train.shape[1]
model = LoanPredictionNN(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, epochs=20):
    for epoch in range(epochs):
        # Forward pass
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch+1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

train_model(model, criterion, optimizer)

# Save the updated model and notebook content for delivery.

  train_data = pd.read_csv('training_loan_data.csv')
  test_data = pd.read_csv('testing_loan_data.csv')


Training Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199122 entries, 0 to 199121
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Data derived from a public source  199122 non-null  object
 1   Unnamed: 1                         189458 non-null  object
 2   Unnamed: 2                         199122 non-null  object
 3   Unnamed: 3                         189458 non-null  object
 4   Unnamed: 4                         189458 non-null  object
 5   Unnamed: 5                         181532 non-null  object
 6   Unnamed: 6                         189458 non-null  object
 7   Unnamed: 7                         189458 non-null  object
 8   Unnamed: 8                         82005 non-null   object
 9   Unnamed: 9                         189458 non-null  object
 10  Unnamed: 10                        180420 non-null  object
 11  Unnamed: 11                 

ValueError: could not convert string to float: 'Unknown'