# Model Building

Trying out a few models to see what sticks.
- Logistic Regression (Baseline)
- Random Forest
- XGBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Import our new script!
import sys
sys.path.append('..')
from src.preprocessing import get_processed_data

In [None]:
# Load the Telco churn CSV through the project's preprocessing helper.
# NOTE(review): the cleaning itself lives in src.preprocessing and is not
# visible from this notebook -- confirm what get_processed_data returns.
df = get_processed_data('../data/Telco-Customer-Churn.csv')

# Prepping for ML
# Drop the customer identifier so it is not one-hot encoded as a feature.
df = df.drop(columns='customerID')

# Encode the target: 1 for 'Yes', 0 for anything else (vectorised instead
# of a row-wise apply).
df['Churn'] = (df['Churn'] == 'Yes').astype(int)

# One-hot encode the remaining non-numeric columns; drop_first drops the
# first level of each so the dummies are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)

X = df.drop(columns='Churn')
y = df['Churn']

# Stratify on the target so the train and test sets keep the same churn
# rate; an unstratified split of an imbalanced target can shift it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape)

## 1. Logistic Regression (Baseline)

In [None]:
# Baseline model. class_weight='balanced' is used because of the class
# imbalance; max_iter is raised to 1000 iterations for the solver.
lr = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1.
y_pred = lr.predict(X_test)
lr_report = classification_report(y_test, y_pred)
print(lr_report)

## 2. Random Forest

In [None]:
# Random forest of 100 trees with balanced class weights; the seed is fixed
# so the fitted forest is reproducible between runs.
rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100,
).fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

## 3. XGBoost

In [None]:
# The logistic regression and random forest cells both compensate for the
# class imbalance via class_weight='balanced'. Give XGBoost the analogous
# treatment so the comparison is like-for-like: weight the positive class
# by the negative/positive ratio of the training labels.
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
pos_weight = n_neg / n_pos if n_pos else 1.0  # guard against no positives

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=pos_weight,
    random_state=42,  # same seed as the split and the random forest
)
xgb.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1.
y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

## Compare ROC AUC

In [None]:
# ROC AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba), one line per fitted model.
for label, model in (("Logistic", lr), ("RF", rf), ("XGB", xgb)):
    print(f"{label}: {roc_auc_score(y_test, model.predict_proba(X_test)[:,1])}")

## Save Best Model

In [None]:
# Persist the fitted XGBoost model. Create the target directory first so
# the dump does not fail on a fresh checkout where ../models is missing.
import os

model_path = '../models/model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(xgb, model_path)