## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import InputLayer, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

## Load Data

In [3]:
df = pd.read_csv('/Users/alexlei/Downloads/accepted_2007_to_2018Q4.csv', index_col=0, low_memory=False)

## Drop Columns

In [4]:
# Drop unnecessary columns
df = df.drop(df.loc[:, 'hardship_flag':'settlement_term'].columns, axis=1)
df = df.drop(['Total amount funded in policy code 1: 1465324575', 'Total amount funded in policy code 2: 521953170'])

In [5]:
df = df.loc[df['loan_status'].notnull()]

In [6]:
df['fico_range_high'].fillna(df['fico_range_high'].mean(), inplace=True)

In [7]:
df['fico_range_low'].fillna(df['fico_range_low'].mean(), inplace=True)

In [8]:
# Fill Loan Status Null values

In [9]:
df['loan_status'].value_counts()

Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: loan_status, dtype: int64

## Assigning X & y

In [10]:
mapping_loan_status = {
    'Fully Paid': 1,
    'Does not meet the credit policy. Status:Fully Paid': 1,
    'Current': 0.75,
    'Late (16-30 days)': 0.5,
    'Late (31-120 days)': 0.25,
    'In Grace Period': 0.25,
    'Charged Off': 0,
    'Default': 0,
    'Does not meet the credit policy. Status:Charged Off': 0
}

# Convert loan_status to numerical values based on the mapping
df['loan_status_numeric'] = df['loan_status'].map(mapping_loan_status)

In [11]:
target_list = [0 if i in ['Default', 'Charged Off'] else 1 for i in df['loan_status']]

df['TARGET'] = target_list
df['TARGET'].value_counts()

1    1992069
0     268599
Name: TARGET, dtype: int64

In [12]:
df['TARGET'].isnull().any()

False

In [13]:
temp = df[['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 
    'installment', 'annual_inc', 'delinq_2yrs', 'fico_range_low', 
    'fico_range_high', 'inq_last_6mths', 'total_pymnt', 'total_rec_prncp',
   'tot_cur_bal', 'mort_acc']]

In [14]:
X = temp
y = df['TARGET']

### Imputers

In [15]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [16]:
imputer = imp_mean.fit(X)
X = imputer.transform(X)

In [17]:
# Assign weights based on the mapping
sample_weights = np.where(df['loan_status_numeric'] == 1, 1, df['loan_status_numeric'])

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, sample_weights_train, sample_weights_test = train_test_split(X, y, sample_weights, 
                                                                                               test_size=0.2, random_state=42, 
                                                                                               stratify=df['TARGET'], shuffle=True)

## Implementing a Scaler

In [18]:
# Transform X - Min-Max Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Creating a classification report for the model

In [19]:
from sklearn.metrics import classification_report
#print(classification_report(y_test,y_pred))

In [20]:
def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")
        
    elif train==False:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

## Creating LogisticRegression Model

In [21]:
# Create and train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

#### Prediction

In [22]:
y_pred = model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print('The Accuracy for Test Set is {}'.format(test_acc*100))

The Accuracy for Test Set is 89.93926579288441


## Autoencoder

In [23]:
# Define the # of input features (# of columns in 'df')
num_features = X_train.shape[1]
latent_dim = 10

In [24]:
# Define the encoder architecture
input_data = Input(shape=(num_features,))

encoded = Dense(20, activation='relu')(input_data)
encoded = Dropout(0.3)(encoded)
encoded = Dense(latent_dim, activation='sigmoid')(encoded)

In [25]:
# Latent space representation
latent_representation = Dense(latent_dim, activation='sigmoid')(encoded)

In [26]:
# # Define encoder model
# encoder_model = Model(input_data, encoded)
encoder_model = Model(input_data, latent_representation)

In [27]:
# Decoder layers
decoded = Dense(20, activation='relu')(encoded)

# Output layer
decoded = Dense(num_features, activation='sigmoid')(decoded)

In [28]:
# Define autoencoder model
autoencoder = Model(input_data, decoded)
#autoencoder_model = Model(encoder_model.input, decoder_output)

# Compile autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [29]:
# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=3, batch_size=10, shuffle=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2ef6f8bd0>

In [30]:
# Extract the learned features from the encoder
encoder = Model(input_data, encoded)
encoded_data = encoder.predict(X)



In [31]:
encoder.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 14)]              0         
                                                                 
 dense (Dense)               (None, 20)                300       
                                                                 
 dropout (Dropout)           (None, 20)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                210       
                                                                 
Total params: 510 (1.99 KB)
Trainable params: 510 (1.99 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
# Calculate reconstruction error for each column
reconstruction_errors = np.mean(np.square(X - autoencoder.predict(X)), axis=0)



In [33]:
# Identify the column with the smallest reconstruction error (potential feature)
best_feature_index = np.argmin(reconstruction_errors)
best_feature_name = df.columns[best_feature_index]

In [34]:
print(f"The best feature to use as input is: {best_feature_name}")

The best feature to use as input is: installment


## Scoring the Model

In [35]:
y_pred = model.predict(X_test)

In [36]:
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print(f1_score)

0.8767031528426911


In [37]:
accuracy_score = metrics.accuracy_score(y_test, y_pred)
print(accuracy_score)

0.8993926579288442


In [38]:
input_sample = temp.sample(1)
input_sample

Unnamed: 0_level_0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,total_pymnt,total_rec_prncp,tot_cur_bal,mort_acc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10099608,12000.0,12000.0,12000.0,16.99,298.17,35000.0,0.0,660.0,664.0,0.0,16569.083808,12000.0,128051.0,2.0


In [39]:
prediction = model.predict(input_sample.values)
prediction

array([1])

In [40]:
from sklearn.decomposition import PCA

In [41]:
pca = PCA(n_components=4)

principalComponents = pca.fit_transform(X)

principalDf = pd.DataFrame(data = principalComponents,
              columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4'])

In [42]:
finalDf = principalDf.merge(df[['TARGET']], left_on='principal component 1', right_on='TARGET')

  finalDf = principalDf.merge(df[['TARGET']], left_on='principal component 1', right_on='TARGET')


In [43]:
finalDf

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,TARGET


In [44]:
principalDf.isnull().any()

principal component 1    False
principal component 2    False
principal component 3    False
principal component 4    False
dtype: bool

In [45]:
pca.explained_variance_ratio_

array([0.69547696, 0.29449333, 0.00811679, 0.00184805])

In [46]:
# Make an instance of the Model
pca = PCA(.95)

In [47]:
pca.fit(X_train)

In [48]:
train_X = pca.transform(X_train)
test_X = pca.transform(X_test)

In [49]:
Xfinal = principalDf
yfinal = df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(Xfinal,yfinal,test_size=0.3)

In [50]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X=X_train, y=y_train)

In [51]:
logistic.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [52]:
score = logistic.score(X_test,y_test)

In [53]:
score

0.877791392227378

### Using Random Forest Classifier

In [54]:
rf_clf.fit(X_train, y_train)

NameError: name 'rf_clf' is not defined

In [None]:
rf_clf.score(X_test, y_test)

### Using Logistic Regression model

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

## Pipeline

In [39]:
# Define PCA pipeline for training data
pca_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values using mean imputation
    ('scaler', MinMaxScaler()),  # Scale features to a range (default is [0, 1])
    ('pca', PCA(n_components=4))  # Apply PCA for dimensionality reduction
])

# Fit PCA pipeline to training data
pca_pipeline.fit(X_train)

# Transform testing data using PCA
X_test_pca = pca_pipeline.transform(X_test)

# Fit a Linear Regression model to transformed training data
model = LinearRegression()
model.fit(pca_pipeline.transform(X_train), y_train)

# Reverse PCA transformation on predicted values
y_pred = model.predict(X_test_pca)

# Reverse PCA transformation on actual values for calculating mean squared error
X_test_reverse_pca = pca_pipeline.named_steps['pca'].inverse_transform(X_test_pca)

# Calculate mean squared error
mse = mean_squared_error(X_test, X_test_reverse_pca)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0003167254687457965
