In this post I will share an example that uses Principal Component Analysis (PCA) as a dimensionality-reduction tool to prepare data for logistic regression prediction in Python.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [14]:
# Load the merged dataset and clean it in a single chain:
#   1. discard any leftover "Unnamed..." columns,
#   2. drop the customerid and month_approved_loan columns,
#   3. remove rows that contain missing values.
df = (
    pd.read_csv('/content/finalmerged.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['customerid', 'month_approved_loan'])
    .dropna()
)
df.head()  # Inspect the first 5 rows

Unnamed: 0,longitude_gps,latitude_gps,bank_branch_clients,loannumber,loanamount,totaldue,termdays,good_bad_flag,referred,realage,...,employment_status_client_Contract,employment_status_client_Permanent,employment_status_client_Retired,employment_status_client_Self-Employed,employment_status_client_Student,employment_status_client_Unemployed,employment_status_client_Unknown,is_late_for_firstpay,exceeds_loan_term_days,TipoInteres
0,3.319219,6.528604,0,2,10000.0,13000.0,30,1,1,48,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,3.325598,7.119403,0,2,10000.0,13000.0,30,0,0,36,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,5.7461,5.563174,0,4,10000.0,13000.0,30,1,0,35,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
3,5.7461,5.563174,0,4,10000.0,13000.0,30,1,0,35,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
4,5.7461,5.563174,0,4,10000.0,13000.0,30,1,0,35,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [15]:
# Target: the good_bad_flag column.
y = df['good_bad_flag']
# Features: every other column, standardized (zero mean, unit variance per
# column) so that no single feature dominates the PCA because of its scale.
x = StandardScaler().fit_transform(df.drop(columns=['good_bad_flag']))

We will start by using only the first 2 leading principal components, and then explore 3 principal components and 4 principal components.

In [29]:
# Project the standardized features onto the two leading principal components.
pca = PCA(n_components=2)
PC = pca.fit_transform(x)
principalDF = pd.DataFrame(data=PC, columns=['pc1', 'pc2'])

# Attach the target POSITIONALLY rather than with pd.concat(axis=1).
# concat aligns on index labels: principalDF has a fresh 0..n-1 index, whereas
# df keeps its original row labels after dropna(). If any rows were dropped,
# label alignment pairs scores with the wrong targets and introduces NaNs
# (visible as good_bad_flag being upcast to float). The rows of PC are in the
# same order as the rows of df, so a positional assignment is the correct join.
finalDf = principalDF.assign(good_bad_flag=df['good_bad_flag'].to_numpy())
finalDf.head()

Unnamed: 0,pc1,pc2,good_bad_flag
0,-3.802878,0.544418,1.0
1,-3.563865,1.558228,0.0
2,-3.364829,0.92797,1.0
3,-3.364829,0.92797,1.0
4,-3.124634,-0.763975,1.0


To assess how much weight each feature will carry in later predictions, we can construct a loadings table. The loadings show how much each of our original features has contributed to each of the "new features" — the principal components.

In [31]:
# Loadings: each principal axis (a column of components_.T) scaled by the
# square root of the variance explained by that component.
PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Row labels for the loadings: the feature names, i.e. every column except the
# target, in the same order used to build `x`. Excluding the target by name
# (instead of popping a hard-coded position) keeps the labels correct even if
# the column order of df changes.
components = [col for col in df.columns if col != 'good_bad_flag']

loadingdf = pd.DataFrame(PCloadings, columns=('PC1', 'PC2'))
loadingdf["variable"] = components
loadingdf


Unnamed: 0,PC1,PC2,variable
0,0.005484,0.014028,longitude_gps
1,-0.011942,-0.088894,latitude_gps
2,0.031941,0.103802,bank_branch_clients
3,0.845344,-0.089238,loannumber
4,0.879723,-0.323109,loanamount
5,0.874249,-0.335063,totaldue
6,0.538451,-0.265529,termdays
7,-0.324813,-0.050836,referred
8,0.003539,0.015882,realage
9,-0.877865,0.29246,TipoInteresAhora


Now we can plot the loadings and see which of them have high weightings in both principal component 1 and 2:

In [38]:
# Scatter each original feature by its loading on PC1 (x) and PC2 (y),
# labelling every point with the feature name.
fig = ex.scatter(
    x=loadingdf['PC1'],
    y=loadingdf['PC2'],
    text=loadingdf['variable'],
)
fig.update_traces(textposition='bottom center')
fig.update_layout(height=600, width=500, title_text='loadings plot')

# Draw the two reference axes through the origin: first the vertical line
# (x = 0, y from -1 to 1), then the horizontal line (y = 0, x from -1 to 1).
for segment in (
    dict(x0=0, y0=-1, x1=0, y1=1),
    dict(x0=-1, y0=0, x1=1, y1=0),
):
    fig.add_shape(
        type="line",
        line=dict(color="RoyalBlue", width=3),
        **segment,
    )

fig.show()

It is clear that is_late_for_firstpay and exceeds_loan_term_days are two heavily weighted features.