In [14]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
import numpy as np
import pandas as pd
from data_preprocessing import preprocess_data

loan_data = "data/accepted_2007_to_2018Q4.csv.gz"
unemployment_rate_data = ["data/unemployment_rate_0.csv", "data/unemployment_rate_1.csv", "data/unemployment_rate_2.csv", "data/unemployment_rate_3.csv", "data/unemployment_rate_4.csv"]

data = preprocess_data(loan_data, unemployment_rate_data)
print(f'Initial data shape: {data.shape}')
print(data.to_string())

# Drop non-numeric columns and the target column `default` from features (but keep for reference)
if 'default' in data.columns:
    y = data['default']
else:
    y = None
numeric_columns = data.columns[data.dtypes.apply(lambda x: np.issubdtype(x, np.number))]

# ISSUES WITH MEMORY USAGE
data = data[numeric_columns] 
# non_numeric = [c for c in data.columns if c not in numeric_columns]
# data.drop(columns=non_numeric, inplace=True)

# Remove IDs
for col in ['id','member_id']:
    if col in data.columns:
        data = data.drop(columns=[col])
# Ensure target not included in features
if 'default' in data.columns:
    data = data.drop(columns=['default'])

print(f'Numeric feature count: {data.shape[1]}')

# Impute missing values with median
imp = SimpleImputer(strategy='median')
X_imp = imp.fit_transform(data)

# Scale features
scaler = StandardScaler()
# Run PCA
batch_size = 50000
for i in range(0, X_imp.shape[0], batch_size):
    scaler.partial_fit(X_imp[i:i+batch_size])

# Transform in batches
X_scaled = np.zeros_like(X_imp, dtype=np.float32)  # Saves memory vs float64
for i in range(0, X_imp.shape[0], batch_size):
    X_scaled[i:i+batch_size] = scaler.transform(X_imp[i:i+batch_size])

n_components = min(50, X_scaled.shape[1])  # Choose desired number of components (50 is typical)
ipca = IncrementalPCA(n_components=n_components)

# Fit in chunks
for i in range(0, X_scaled.shape[0], batch_size):
    ipca.partial_fit(X_scaled[i:i+batch_size])

# Transform in chunks
X_pca_list = []
for i in range(0, X_scaled.shape[0], batch_size):
    X_pca_list.append(ipca.transform(X_scaled[i:i+batch_size]))

X_pca = np.vstack(X_pca_list)

# Explained variance
explained = ipca.explained_variance_ratio_
cum_explained = np.cumsum(explained)
print('10 Explained variance ratio ')
print(explained[:10])
print('10 Cum variance')
print(cum_explained[:10])

# Loadings DataFrame (features x components)
loadings = pd.DataFrame(ipca.components_.T, index=data.columns, columns=[f'PC{i+1}' for i in range(ipca.n_components_)])
importance = (loadings.abs() * explained).sum(axis=1)
importance = importance.sort_values(ascending=False)
impl_df = pd.DataFrame({'feature': importance.index, 'importance': importance.values})
impl_df.to_csv('pca_feature_importance.csv', index=False)

#print('Top 20 features by PCA importance:')
# print(impl_df.head(20))
print('Print all features by PCA importance to -> pca_feature_importance.csv:')

top_feats = impl_df['feature'].head(10).tolist()
print('Loadings for top features (first 5 PCs):')
print(loadings.loc[top_feats, loadings.columns[:5]])

# Components to reach 95% variance
n_95 = np.searchsorted(cum_explained, 0.95) + 1
print(f'Number of components to reach 95% variance: {n_95}')

Loan data loaded successfully.
Unemployment rate data loaded and merged successfully.
Loan and unemployment data merged successfully.
Data preprocessing completed successfully.
Initial data shape: (2260668, 98)


MemoryError: 

### Logistic Regression on 50 features with highest variance (according to PCA)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report)
import numpy as np
import pandas as pd

desired = 51
if 'n_95' in globals() and isinstance(n_95, int) and n_95 > 0:
    n_use = min(desired, ipca.n_components_, n_95)
else:
    n_use = min(desired, ipca.n_components_)

print(f'Using {n_use} components for model input')

# Build feature matrix from PCA-transformed data X_pca
if 'X_pca' not in globals():
    X_pca = ipca.transform(X_scaled)
X = X_pca[:, :n_use]

try:
    y = data['default'] 
except Exception:
    # fall back to checking globals for y from earlier cells
    if 'y' in globals() and y is not None:
        pass
    else:
        raise RuntimeError('Target vector `y` not found in notebook namespace. Ensure you saved the original target before overwriting `data`.')


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Fit logistic regression with balanced class weights
clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, 'predict_proba') else None


# Metrics
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred, zero_division=0),
    'recall': recall_score(y_test, y_pred, zero_division=0),
    'f1': f1_score(y_test, y_pred, zero_division=0),
}

if y_proba is not None:
    metrics['roc_auc'] = roc_auc_score(y_test, y_proba)

print('Evaluation metrics:')
print(metrics)

print('Classification report:')
print(classification_report(y_test, y_pred, zero_division=0))

print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))

# Save metrics and model coefficients
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv('pca_logistic_metrics.csv', index=False)
coef_df = pd.DataFrame({'component': [f'PC{i+1}' for i in range(n_use)], 'coef': clf.coef_.ravel()[:n_use]})
coef_df.to_csv('pca_logistic_coefficients.csv', index=False)
print('Saved metrics to pca_logistic_metrics.csv and coefficients to pca_logistic_coefficients.csv')

Using 50 components for model input




Evaluation metrics:
{'accuracy': 0.9592886179760868, 'precision': 0.8039651070578906, 'recall': 0.9205712398877332, 'f1': 0.858325957283048, 'roc_auc': 0.9842582346650408}
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98    391564
           1       0.80      0.92      0.86     60570

    accuracy                           0.96    452134
   macro avg       0.90      0.94      0.92    452134
weighted avg       0.96      0.96      0.96    452134

Confusion matrix:
[[377968  13596]
 [  4811  55759]]
Saved metrics to pca_logistic_metrics.csv and coefficients to pca_logistic_coefficients.csv


Loan data loaded successfully.
Unemployment rate data loaded and merged successfully.
Loan and unemployment data merged successfully.
Data preprocessing completed successfully.

Top 20 Features by Point-Biserial Correlation:
                    feature  abs_pointbiserial_corr
35     last_fico_range_high                0.609548
36      last_fico_range_low                0.556323
32               recoveries                0.488174
33  collection_recovery_fee                0.463735
94     debt_settlement_flag                0.314878
7                 sub_grade                0.233739
29          total_rec_prncp                0.233454
6                     grade                0.229600
4                  int_rate                0.211744
34          last_pymnt_amnt                0.192632
25                out_prncp                0.157300
26            out_prncp_inv                0.157285
27              total_pymnt                0.145531
28          total_pymnt_inv                0.14