In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the cleaned customer-churn dataset.
# The path is written with forward slashes: in a normal (non-raw) string literal
# backslashes start escape sequences, and sequences such as "\P" or "\d" are
# invalid escapes that Python warns about. Windows accepts "/" as a separator,
# so this points at the same file.
df = pd.read_csv("C:/Projects/Customer_churn_project/data/processed/cleaned_customer_churn.csv")

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0,567.0,33.0,9.0,0.0,2.0,1.0,0.0,156792.89,0.0,False,False,True
1,1,628.0,38.0,3.0,0.0,1.0,1.0,1.0,51987.99,1.0,False,False,False
2,2,635.0,29.0,3.0,0.0,2.0,1.0,1.0,113079.19,0.0,False,False,False
3,3,681.0,28.0,6.0,0.0,2.0,1.0,0.0,14081.64,0.0,False,False,True
4,4,587.0,27.0,5.0,0.0,2.0,1.0,0.0,158958.9,0.0,False,False,False


In [None]:
# Feature selection by absolute Pearson correlation with the target.
# The target column itself always passes the threshold (its self-correlation
# is 1), which is why "Exited" is still present in the reduced frame.
corr_matrix = df.corr()
cor_target = corr_matrix["Exited"].abs()
relevant_features = list(cor_target.index[cor_target > 0.05])
print(f"Selected columns: {relevant_features}")

# Keep only the selected columns for the remaining steps
df = df[relevant_features]

Selected columns: ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'Exited', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']


In [3]:
# Skewness check
# Report the skewness of every feature (the target is excluded), most skewed first.
skewed_feats = df.drop("Exited", axis=1).apply(lambda x: x.skew()).sort_values(ascending=False)
print("Skewed features:\n", skewed_feats)

# Log(1+x) transformation
# Only continuous, non-negative features are transformed:
#   * a binary indicator (one-hot / boolean column) looks "skewed" whenever its
#     classes are imbalanced, but log1p would merely remap {0, 1} to {0, ln 2},
#     which adds nothing and is undone by any later linear rescaling;
#   * log1p is undefined for values <= -1 and would silently introduce NaN / -inf.
SKEW_THRESHOLD = 1
log_cols = [
    col
    for col in skewed_feats.index
    if skewed_feats[col] > SKEW_THRESHOLD
    and df[col].nunique() > 2
    and df[col].min() >= 0
]

# Work on an explicit copy so the column assignments below cannot raise
# SettingWithCopyWarning if df is a selection taken from another frame.
df = df.copy()
for col in log_cols:
    df[col] = np.log1p(df[col])

Skewed features:
 Geography_Germany    1.677829
Geography_Spain      1.361398
Age                  0.843488
Balance              0.800401
CreditScore          0.063609
NumOfProducts        0.057803
IsActiveMember       0.016002
Gender_Male         -0.263072
dtype: float64


In [4]:
# Scaling: every feature column is standardised with StandardScaler
# (this includes the indicator columns); the target is not scaled.
# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# happens downstream, consider fitting on the training portion only — confirm.
y = df["Exited"]
X = df.drop(columns="Exited")

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Final dataset: scaled features with the target appended as the last column.
# Both sides carry a fresh 0..n-1 index, so rows line up by position.
df_final = X_scaled.join(y.reset_index(drop=True))
df_final.head()

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,IsActiveMember,Geography_Germany,Geography_Spain,Gender_Male,Exited
0,-1.250157,-0.578313,-0.709168,0.778743,-0.992032,-0.466404,-0.529021,0.877089,0.0
1,-0.410819,0.03559,-0.709168,-1.123104,1.008032,-0.466404,-0.529021,-1.140135,1.0
2,-0.314501,-1.069436,-0.709168,0.778743,1.008032,-0.466404,-0.529021,-1.140135,0.0
3,0.318442,-1.192216,-0.709168,0.778743,-0.992032,-0.466404,-0.529021,0.877089,0.0
4,-0.974964,-1.314997,-0.709168,0.778743,-0.992032,-0.466404,-0.529021,-1.140135,0.0


In [5]:
# Save the dataset
# The destination is defined once so the directory that is created, the file
# that is written and the path that is reported cannot drift apart. Forward
# slashes are used because backslashes in a normal string literal are parsed
# as escape sequences ("\P", "\d", ... are invalid escapes that Python warns about).
output_path = "C:/Projects/Customer_churn_project/data/processed/featured_customer_churn.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)
print(f"Saved: {output_path}")
print("Final shape:", df_final.shape)


Saved: C:/Projects/Customer_churn_project/data/processed/featured_customer_churn.csv
Final shape: (15000, 9)
