# Customer Churn Prediction (Kaggle Telco Dataset)

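This notebook walks through a concise churn-prediction pipeline: load the data, explore it (EDA), preprocess it, train models, and evaluate them. Saving the trained models and reports is handled by the `src/train.py` script (see the last section).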

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Printed only after every import above has run, so seeing it confirms they all resolved.
print('Libraries imported')

In [None]:
# Read the raw Telco churn CSV (path is relative to the working directory)
# and preview its first rows as the cell output.
raw_csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Quick structural overview: column dtypes and non-null counts first,
# then the number of null entries per column.
df.info()
missing_per_column = df.isnull().sum()
print('\nMissing values:\n', missing_per_column)

In [None]:
# Project-local helpers; these imports require the notebook to run from a
# location where the `src` package is importable.
from src.preprocess import preprocess
# NOTE(review): the two plotting/saving helpers below are not used in the cells
# visible here — presumably used by later cells or kept for convenience; confirm.
from src.utils import save_confusion_matrix, save_feature_importance

# Run the project's preprocessing on the raw frame. The result must contain a
# 'Churn' column, since the training cell drops/selects it by that name.
# NOTE(review): what `preprocess` does (encoding, imputation, whether it mutates
# `df` in place) is not visible from this notebook — verify in src/preprocess.py.
pdf = preprocess(df)
# Preview the first rows of the preprocessed frame as the cell output.
pdf.head()

In [None]:
# Split the preprocessed frame into features/target, scale the numeric
# features, fit both classifiers, and report their test-set accuracy.
X = pdf.drop(columns='Churn')
y = pdf['Churn']

# Hold out 20% for testing; stratifying on the target keeps the class
# proportions of `y` (approximately) the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies before writing scaled values back. Assigning into the
# frames returned by the split can otherwise raise pandas'
# SettingWithCopyWarning (depending on library versions), and the copies make
# it unambiguous that `pdf` itself is never modified.
X_train = X_train.copy()
X_test = X_test.copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
# Fit the scaler on the training split only, then reuse its statistics for the
# test split so no information from the test data leaks into training.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

lr = LogisticRegression(max_iter=300)
lr.fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Report both fitted models; previously the logistic regression was trained
# but its score was never shown.
print('LR acc:', accuracy_score(y_test, lr.predict(X_test)))
print('RF acc:', accuracy_score(y_test, rf.predict(X_test)))

## Save models and reports

Run the `src/train.py` script to save models and reports to the `models/` and `reports/` folders.