In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Load data
df = pd.read_csv("../data/credit_customers (1).csv")

# 2) Separate features and target
# `drop` returns a new frame, so the transformations below never modify `df`
# and re-running this cell always starts again from the raw table.
X = df.drop(columns=["class"])   # features
y = df["class"]                  # target (still 'good'/'bad' here)

# 3) Split numeric vs categorical columns
# Select by dtype family instead of exact dtype names: any numeric width
# (e.g. int32 / float32) is scaled, and every remaining column is treated as
# categorical. The two groups are complements, so no feature can be left
# both unscaled and unencoded.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

print("Numeric columns:", list(num_cols))
print("Categorical columns:", list(cat_cols))

# 4) Scale numeric features (Z-score standardization)
# NOTE(review): the scaler is fit on every row. If a train/test split happens
# later in the notebook, fit on the training rows only so test-set statistics
# do not leak into the features -- confirm against the downstream cells.
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# 5) Encode categorical features (One-Hot Encoding)
# drop_first=True removes one level per feature to avoid perfectly collinear
# dummy columns.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

print("\nAfter transformation:")
print(X.head())


Numeric columns: ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
Categorical columns: ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']

After transformation:
   duration  credit_amount  installment_commitment  residence_since       age  \
0 -1.236478      -0.745131                0.918477         1.046987  2.766456   
1  2.248194       0.949817               -0.870183        -0.765977 -1.191404   
2 -0.738668      -0.416562               -0.870183         0.140505  1.183312   
3  1.750384       1.634247               -0.870183         1.046987  0.831502   
4  0.256953       0.566664                0.024147         1.046987  1.535122   

   existing_credits  num_dependents  checking_status_<0  \
0          1.027079       -0.428290                True   