### Import packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Set-up

In [None]:
# Name of the label column inside the CSV.
target = 'default payment next month'

# Location of the modelling data set (CSV pinned to a specific commit hash).
infile = 'https://bitbucket.org/vishal_derive/vcu-data-mining/raw/3d740375d8d00c80e83dacbadc8b5e70cd2bfe48/data/credit_default_model_data.csv'

### Read data

In [None]:
# Load the full table, then separate the label column from the predictors.
raw = pd.read_csv(infile)

X = raw.drop(columns=[target])
y = raw[target]

# Only X and y are used from here on; release the combined frame.
del raw

In [None]:
# Preview the first five rows of the predictor frame.
X.head(n=5)

In [None]:
# Keep every column except the right-most one.
# NOTE(review): this rebinds X to a narrower frame, so re-running the cell
# removes one more column each time -- run it exactly once per kernel session.
X = X.iloc[:, :-1]

In [None]:
# Dimensions (rows, columns) of the predictor frame.
X.shape

In [None]:
# Mean of the target column.
# NOTE(review): presumably the target is coded 0/1, in which case this is the
# share of positive cases -- confirm the coding against the data.
y.mean()

### Train-Test partition

In [None]:
# Two-stage split with a shared seed: first set aside 20,000 rows, then cut
# that hold-out into a test part and a 10,000-row validation part.
split_seed = 314

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=20000, random_state=split_seed
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=10000, random_state=split_seed
)

# Row counts of the three partitions: (train, test, validation).
tuple(len(part) for part in (X_train, X_test, X_valid))

### Logistic Regression model

In [None]:
# Unfitted logistic-regression classifier: L-BFGS solver, at most 1000
# iterations, fixed seed.
logit = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=314,
)

Standardize the input data.

In [None]:
# Learn the scaling statistics from the training rows only, so nothing about
# the test rows leaks into the transformation.
X_scaler = StandardScaler().fit(X_train.astype(float))

# Apply the same fitted transformation to both partitions.
X_train_std = X_scaler.transform(X_train.astype(float))
X_test_std = X_scaler.transform(X_test.astype(float))

Fit the model and get model scores.

In [None]:
# Train on the standardized training partition.
logit.fit(X_train_std, y_train)

# Column 1 of predict_proba holds the probability of the second class in
# logit.classes_; keep it as the model score for each partition.
logit_scores_train, logit_scores_test = (
    logit.predict_proba(features)[:, 1]
    for features in (X_train_std, X_test_std)
)

ROC Curve

In [None]:
# ROC points and area under the curve on the test partition.
logit_fpr_test, logit_tpr_test, _ = roc_curve(y_test, logit_scores_test)
auc_logit = roc_auc_score(y_test, logit_scores_test)

sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(9, 9))

ax.plot(logit_fpr_test, logit_tpr_test, color='royalblue', lw=2, linestyle='-',
        label=f'Test (AUC = {auc_logit:0.3f})')

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate', fontsize=14)
ax.set_ylabel('True Positive Rate', fontsize=14)
ax.set_title('Default Risk Model: Logistic Regression', fontsize=16)
ax.legend(loc="lower right", fontsize=14);

### Principal Component Analysis (PCA)

In [None]:
# Fit PCA on the raw (unscaled) training features.
pca = PCA(random_state=314)
pca.fit(X_train)

# Scree plot. Components are numbered from 1 so the x-axis matches its label,
# and the positions are derived from the fitted model rather than from X, so
# x and y always have the same length.
component_numbers = np.arange(1, len(pca.explained_variance_ratio_) + 1)

plt.figure().set_size_inches(9, 6)

sns.lineplot(x=component_numbers, y=pca.explained_variance_ratio_, linewidth=3, color='tomato')
plt.xlabel('Number of Components', fontsize = 14)
plt.ylabel('Explained Variance', fontsize = 14);

Cumulative Variance Explained

In [None]:
# Running total of the explained-variance ratios of the fitted PCA.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Position k on the x-axis is the total for the first k components, so the
# axis starts at 1 (not 0) and matches the "Number of Components" label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure().set_size_inches(9, 6)

sns.lineplot(x=component_counts, y=cumulative_variance, linewidth=3, color='tomato')
plt.xlabel('Number of Components', fontsize = 14)
plt.ylabel('Cumulative Explained Variance', fontsize = 14);

Very few principal components appear to explain most of the variance in the data. The top ten principal components explain almost 100% of all variance!

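This picture is misleading: PCA is sensitive to the scale of the features, so columns with large numeric ranges dominate the variance. We need to *standardize* the data before fitting PCA -- that is, run PCA on the standardized data.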

In [None]:
# Refit the same PCA object, this time on the standardized training features.
pca.fit(X_train_std)

# Scree plot. Components are numbered from 1 so the x-axis matches its label,
# and the positions are derived from the fitted model rather than from X.
component_numbers = np.arange(1, len(pca.explained_variance_ratio_) + 1)

plt.figure().set_size_inches(9, 6)

sns.lineplot(x=component_numbers, y=pca.explained_variance_ratio_, linewidth=3, color='tomato')

plt.xlabel('Number of Components', fontsize = 14)
plt.ylabel('Explained Variance', fontsize = 14);

Cumulative Variance Explained

In [None]:
# Running total of the explained-variance ratios of the fitted PCA.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Position k on the x-axis is the total for the first k components, so the
# axis starts at 1 (not 0) and matches the "Number of Components" label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure().set_size_inches(9, 6)

sns.lineplot(x=component_counts, y=cumulative_variance, linewidth=3, color='tomato')

plt.xlabel('Number of Components', fontsize = 14)
plt.ylabel('Cumulative Explained Variance', fontsize = 14);

In [None]:
# Share of variance retained by the first 30 components. Arrays are 0-indexed,
# so the 30-component total sits at index 29; index 30 would already include
# a 31st component.
np.cumsum(pca.explained_variance_ratio_)[30 - 1]

Let's keep the top 30 principal components. By doing so, we will retain 97% of the total variance.

In [None]:
# Number of leading principal components passed on to the model.
components_to_keep = 30

# Fresh, unfitted PCA restricted to that many components.
pca = PCA(
    n_components=components_to_keep,
    random_state=314,
)

### Pipelines

In [None]:
# Chain scaling -> PCA -> logistic regression into a single estimator.
# Pipeline.fit refits its steps in place, so the scaler and the classifier are
# cloned first: the already-fitted `X_scaler` and `logit` keep their state and
# the cells that score with them stay re-runnable after this one.
pl = Pipeline(steps=[('scaling', clone(X_scaler)),
                     ('pca', pca),
                     ('logit', clone(logit))])

# fit the pipeline (fit returns the fitted pipeline itself)
pl_fit = pl.fit(X_train.astype(float), y_train)

# calculate model scores (predicted probabilities of the second class)
pl_scores_test = pl_fit.predict_proba(X_test.astype(float))[:, 1]

ROC Curve

In [None]:
# ROC points and AUC of the pipeline scores on the test partition.
pl_fpr_test, pl_tpr_test, _ = roc_curve(y_test, pl_scores_test)
auc_pl = roc_auc_score(y_test, pl_scores_test)

fig, ax = plt.subplots(figsize=(9, 6))

# Plain logistic model first, then the PCA-based pipeline on the same axes.
ax.plot(logit_fpr_test, logit_tpr_test, color='royalblue', lw=2,
        label=f'Logistic (AUC = {auc_logit:0.3f})')
ax.plot(pl_fpr_test, pl_tpr_test, color='tomato', lw=2,
        label=f'PCA + Logistic (AUC = {auc_pl:0.3f})')

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate', fontsize=14)
ax.set_ylabel('True Positive Rate', fontsize=14)
ax.set_title('Default Risk Model: Logit vs. PCA+Logit', fontsize=16)
ax.legend(loc="lower right", fontsize=14);

Why didn't the performance of the model improve?

What would happen if we keep *all* principal components?

In [None]:
# PCA without a component limit, i.e. every principal component is kept.
pca = PCA(random_state=314)

# define the pipeline
# Pipeline.fit refits its steps in place, so the scaler and the classifier are
# cloned first: the already-fitted `X_scaler` and `logit` keep their state and
# the cells that score with them stay re-runnable after this one.
pl = Pipeline(steps=[('scaling', clone(X_scaler)),
                     ('pca', pca),
                     ('logit', clone(logit))])

# fit the pipeline (fit returns the fitted pipeline itself)
pl_fit = pl.fit(X_train.astype(float), y_train)

# calculate model scores (predicted probabilities of the second class)
pl_scores_test = pl_fit.predict_proba(X_test.astype(float))[:, 1]

# ROC Curve
pl_fpr_test, pl_tpr_test, _ = roc_curve(y_test, pl_scores_test)
auc_pl = roc_auc_score(y_test, pl_scores_test)

plt.figure().set_size_inches(9, 6)

# Plain logistic model first, then the PCA-based pipeline on the same axes.
plt.plot(logit_fpr_test, logit_tpr_test, color='royalblue', lw=2, label=f'Logistic (AUC = {auc_logit:0.3f})')

plt.plot(pl_fpr_test, pl_tpr_test, color='tomato', lw=2, label=f'PCA + Logistic (AUC = {auc_pl:0.3f})')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize = 14)
plt.ylabel('True Positive Rate', fontsize = 14)
plt.title('Default Risk Model: Logit vs. PCA+Logit', fontsize = 16)
plt.legend(loc="lower right", fontsize = 14);