# In [None]:
# File: notebooks/churn_preprocessing.ipynb

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Load training and testing data
# The two splits live in separate CSV files; paths are relative to the
# directory the notebook is run from.
TRAIN_CSV = "../data/customer_churn_dataset-training-master.csv"
TEST_CSV = "../data/customer_churn_dataset-testing-master.csv"

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Quick sanity check on (rows, columns) of each split.
for label, frame in (("Train shape:", train_data), ("Test shape:", test_data)):
    print(label, frame.shape)

# 2. Separate features (X) and target (y)
# Every column except the target is kept as a feature; the loaded frames
# themselves are left untouched because drop() returns a new frame.
# NOTE(review): rows whose target value is missing are not removed here --
# confirm the target column has no NaN before training.
TARGET_COL = 'Churn'

X_train, y_train = train_data.drop(columns=TARGET_COL), train_data[TARGET_COL]
X_test, y_test = test_data.drop(columns=TARGET_COL), test_data[TARGET_COL]

# 3. Handle missing values
# A blanket fillna(0) would write the integer 0 into text columns, producing
# mixed str/int columns that LabelEncoder cannot sort and therefore rejects
# with a TypeError. Fill by column type instead: 0 for numeric columns and an
# explicit placeholder category for text columns.
MISSING_CATEGORY = "Missing"

text_cols = X_train.select_dtypes(include=['object']).columns
fill_values = {
    col: MISSING_CATEGORY if col in text_cols else 0
    for col in X_train.columns
}

# The fill rule is derived from the training frame and applied unchanged to
# the test frame so both splits are treated identically. fillna() returns new
# frames, so no separate copy() is needed before modifying them later.
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# 4. Encode categorical columns
# Text (object-dtype) columns are replaced by integer codes. Each column gets
# its own encoder, fitted on the training split only and kept in
# label_encoders so the same mapping can be reused or inverted later.
categorical_cols = X_train.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_cols))

label_encoders = {}
for column in categorical_cols:
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(X_train[column])
    # transform() raises ValueError if the test split contains a category
    # that was not present in the training split.
    test_codes = encoder.transform(X_test[column])
    X_train[column] = train_codes
    X_test[column] = test_codes
    label_encoders[column] = encoder

# 5. Scale numeric columns
# By this point the label-encoded categorical columns hold integer codes, so
# a dtype filter alone would select them too. Standardising arbitrary category
# codes is not meaningful, so they are excluded and only the columns that were
# numeric in the raw data are scaled.
dtype_numeric = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_cols = dtype_numeric[~dtype_numeric.isin(categorical_cols)]
print("Numeric columns:", list(numeric_cols))

# Fit on the training split only and reuse the same mean/variance for the
# test split, so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
if not numeric_cols.empty:
    # StandardScaler rejects an input with zero features, so skip scaling
    # entirely when every feature is categorical.
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Preprocessing done. Data ready for model training.")
