In [None]:
import pandas as pd
from data_preprocessing import preprocess_data
from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

# Input files: the loan archive plus the five numbered unemployment-rate CSVs.
loan_data = "data/accepted_2007_to_2018Q4.csv.gz"
unemployment_rate_data = [f"data/unemployment_rate_{part}.csv" for part in range(5)]

# Project helper; called with the loan path and the list of unemployment paths.
# Its result is used as a DataFrame that contains a 'default' column.
data = preprocess_data(loan_data, unemployment_rate_data)

# Separate the 'default' column (target) from all remaining columns (features).
# Both are copied so later steps never write back into `data`.
y = data['default'].copy()
X = data.drop(columns=['default']).copy()
# Compute Z-scores (population std, ddof=0) for every feature column.
z = (X - X.mean()) / X.std(ddof=0)

# |z| cut-off beyond which a value is treated as an outlier.
Z_THRESHOLD = 4.0

# Columns with at most two distinct values (0/1 flags, constants) are not
# screened: the rare value of a flag always has a very large |z|, so screening
# such a column would drop every row of the minority category rather than
# remove genuine outliers.
distinct_counts = X.nunique()

# Per-column outlier counts (name kept because the reporting code reads it).
clipped_counts = {}
rows_to_drop = pd.Series(False, index=X.index)  # True for rows holding any outlier

for col in X.columns:
    if distinct_counts[col] <= 2:
        continue

    # NaN z-scores compare False, so missing values are never flagged here.
    col_outliers = z[col].abs() > Z_THRESHOLD
    n_outliers = int(col_outliers.sum())

    if n_outliers > 0:
        clipped_counts[col] = n_outliers
        # Mark rows for dropping instead of replacing values
        rows_to_drop = rows_to_drop | col_outliers

# Drop all rows with any outlier after checking all columns
X_clipped = X[~rows_to_drop]
y_clipped = y[~rows_to_drop]  # same mask keeps the target aligned with the features

# Summarise what the outlier screen flagged and how many rows it removes.
if not clipped_counts:
    print("No outliers detected beyond |z| > 4")
else:
    # One row per affected column, largest count first.
    outlier_report = (
        pd.DataFrame
        .from_dict(clipped_counts, orient='index', columns=['num_clipped'])
        .sort_values(by='num_clipped', ascending=False)
    )
    print("\n Outlier Clipping Summary (|z| > 4):")
    print(outlier_report.head(20))

    # Row-level total: a row counts once even if several columns flagged it.
    total_clipped = rows_to_drop.sum()
    dropped_pct = (total_clipped / len(X)) * 100
    print(f"\n Total rows dropped: {total_clipped} out of {len(X)} "
          f"({dropped_pct:.4f}% of data)")

# Fill remaining missing values with each column's median, then standardise.
# Both steps are fitted on the outlier-filtered features.
imputer = SimpleImputer(strategy='median')
X_imp = imputer.fit_transform(X_clipped)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

# # Number of components PCA
# n = 20
# pca = PCA(n_components=n)

# # Fit and transform
# X_pca = pca.fit_transform(X_scaled)

# import numpy as np
# loading_strength = np.abs(pca.components_[0])  # first PC
# top_features = np.argsort(loading_strength)[::-1][:10]  # top 10 features
# print("Top 10 features contributing to the first principal component:")
# print(data.columns[top_features])

# #K-means
# from sklearn.cluster import KMeans

# km = KMeans(n_clusters=5, n_init='auto', random_state=0)
# labels = km.fit_predict(X_pca)


# # Add cluster labels to the original data
# data['cluster'] = labels

# data['cluster'].value_counts()



Loan data loaded successfully.
Unemployment rate data loaded and merged successfully.
Loan and unemployment data merged successfully.
Data preprocessing completed successfully.


  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()
  X_clipped.loc[high_mask | low_mask, col] = X[col].mean()



 Outlier Clipping Summary (|z| > 4):
                            num_clipped
application_type                 120710
disbursement_method               78122
collections_12_mths_ex_med        37438
last_fico_range_low               37326
debt_settlement_flag              34246
num_accts_ever_120_pd             32592
delinq_2yrs                       29347
recoveries                        27522
collection_recovery_fee           26637
mo_sin_rcnt_rev_tl_op             26342
last_pymnt_amnt                   25573
bc_open_to_buy                    23282
total_rec_int                     22189
mths_since_recent_bc              21542
tax_liens                         20992
total_bal_ex_mort                 20445
mo_sin_rcnt_tl                    19822
mths_since_rcnt_il                19105
total_bc_limit                    18460
revol_bal                         18450

 Total values clipped: 1182105 out of 219284796 (0.5391% of data)
