In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import sys

# Read the dataset into a pandas DataFrame
data = pd.read_csv('accepted_2007_to_2018Q4.csv', low_memory=False)

# Your code for data preprocessing and feature engineering...
def separate_features(dataframe):
    categorical_features = []
    continuous_features = []

    for column in dataframe.columns:
        if dataframe[column].dtype == 'object' or dataframe[column].dtype.name == 'category':
            categorical_features.append(column)
        elif dataframe[column].dtype == 'int64' or dataframe[column].dtype == 'float64':
            continuous_features.append(column)
        else:
            print(f'Unhandled data type in column "{column}": {dataframe[column].dtype}')

    return categorical_features, continuous_features



#print("Categorical Features:", cat_features)
#print("Continuous Features:", cont_features)


data = data[data['loan_status'].isin(['Fully Paid','Charged Off','Defaulted'])]
leakage_features = [
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'debt_settlement_flag',
    'debt_settlement_flag_date',
    'settlement_status',
    'settlement_date',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
    'last_fico_range_high',
    'last_fico_range_low',
    'id',
    'url',
    'emp_title',
    'title'
]

# Drop the leakage features
data = data.drop(leakage_features, axis=1)

# Calculate the percentage of missing values for each feature
missing_values = data.isnull().sum() / len(data) * 100

# Identify columns with more than 90% missing values
columns_to_drop = missing_values[missing_values > 90].index

# Drop the identified columns
data = data.drop(columns_to_drop, axis=1)

categorical_features, continuous_features = separate_features(data)
categorical_features.remove('loan_status')


data[continuous_features] = data[continuous_features].fillna(-1)

# Fill categorical NAs with "missing" and convert all elements to strings
data[categorical_features] = data[categorical_features].fillna('missing').astype(str)

data.isna().sum().sum()


for feature in categorical_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

# Preprocess loan status labels with custom encoding
custom_encoding = {
    'Fully Paid': 0,
    'Charged Off': 1,
    'Default': 1,
    
}


data['loan_status'] = data['loan_status'].replace(custom_encoding)


# Split the dataset into features (X) and labels (y)
X = data.drop('loan_status', axis=1)
y = data['loan_status']

#Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Split the dataset into training and temporary sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Apply RandomUnderSampler to balance the training dataset
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Split the temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create the Logistic Regression model
logit_model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
logit_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = logit_model.predict(X_test)
y_pred_val = logit_model.predict(X_val)


# Calculate the test accuracy
val_accuracy = accuracy_score(y_val, y_pred_val)
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1_score = f1_score(y_test, y_pred)
test_auc_roc = roc_auc_score(y_test, y_pred)

# Print the metrics
print(f"Validation accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")
print(f"Test F1-score: {test_f1_score}")
print(f"Test AUC-ROC score: {test_auc_roc}")


Validation accuracy: 0.6631891534293212
Test accuracy: 0.6635868312879559
Test precision: 0.3290411546022902
Test recall: 0.6561809026551961
Test F1-score: 0.4382981892197138
Test AUC-ROC score: 0.6608097499807484
