# COGS 118B Final Project
### Authors:
- Michelle Tran
- Christopher Ly

<strong style="color:red">Just for reference for working on GitHub:</strong>
<p style="color:red">Be sure to clear the notebook's output before pushing to the repo; this keeps the commit history clean. You can do so with the menu sequence below:</p>

`Cell > All Output > Clear`

In [None]:
import sys
sys.path.insert(0,'src')
import utils

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. src/utils.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Apply one seaborn theme (dark grid, rainbow palette) to every plot below.
sns.set(style='darkgrid', palette='rainbow')

## Load in data

In [None]:
# Path to the raw CSV, relative to the notebook's working directory.
fp = 'data/telco.csv'

In [None]:
# Read the raw CSV and discard the customerID column before any analysis.
raw_df = pd.read_csv(fp)
df = raw_df.drop(columns=['customerID'])
df.head()

## Clean data

# Remember which columns are object-typed in the raw data and show the
# distinct values each one takes.
categorical_cols = df.select_dtypes(include=[object]).columns

for column_name in categorical_cols:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

# Project helper from src/utils.py; presumably recodes the string columns
# (Churn is treated as numeric further down) -- confirm in utils.convert.
df = utils.convert(df)

In [None]:
# Drop customers with zero tenure, then renumber the rows from 0.
# NOTE(review): presumably these rows carry blank TotalCharges, which would
# make the strict numeric conversion below fail -- confirm against the CSV.
# Alternatives that were considered and left disabled: coercing bad values to
# NaN, replacing tenure 0 with 1, and filling TotalCharges from MonthlyCharges.
zero_tenure_rows = df.index[df['tenure'] == 0]
df = df.drop(zero_tenure_rows).reset_index(drop=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

# Re-print the distinct values of the originally categorical columns.
for column_name in categorical_cols:
    print(column_name, df[column_name].unique())

## Exploratory Data Analysis

In [None]:
# Project helper from src/utils.py; presumably undoes utils.convert so the
# plots below can use readable labels (they compare Churn to 'Yes'/'No') --
# confirm in utils.revert.
df = utils.revert(df)

In [None]:
# ref: https://stackoverflow.com/questions/33179122/seaborn-countplot-with-frequencies
ax = sns.countplot(x=df['Churn'])
plt.title('Churn rate/occurrence')
ax.set_ylabel('Count')
ax.set_ylim(0, len(df))

for p in ax.patches:
    x=p.get_bbox().get_points()[:,0]
    y=p.get_bbox().get_points()[1,1]
    ax.annotate('{:.2f}%'.format(100.*y/len(df)), (x.mean(), y), 
            ha='center', va='bottom') # set the alignment of the text
    
plt.show()

In [None]:
# Project plotting helper from src/utils.py (presumably one plot per
# categorical feature -- confirm). The trailing semicolon suppresses the
# cell's return-value echo.
utils.plot_cat(df);

In [None]:
# Overlay the tenure distributions of churned and retained customers.
# Drawing order ('Yes' first, then 'No') must match the legend labels below.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'tenure'])
plt.title('Density of Tenure in Months for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Tenure (Months)')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the monthly-charge distributions of churned and retained customers.
# Drawing order ('Yes' first, then 'No') must match the legend labels below.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'MonthlyCharges'])
plt.title('Density of Monthly Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Monthly Charges')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the total-charge distributions of churned and retained customers.
# Drawing order ('Yes' first, then 'No') must match the legend labels below.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'TotalCharges'])
plt.title('Density of Total Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Total Charges')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Re-apply the project's conversion helper after the EDA; the modelling cells
# below treat Churn as a numeric column. (Exact recoding lives in
# src/utils.py -- confirm there.)
df = utils.convert(df)

In [None]:
# Preprocessing objects: standardise features, one-hot encode categoricals.
scaler = StandardScaler()
# handle_unknown='ignore': categories unseen at fit time encode as all zeros
# instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Object-typed columns: these are the ones that get one-hot encoded.
df_categorical = df.select_dtypes(include=object)
df_categorical.head(2)

In [None]:
# Numeric predictors only: Churn is the target and TotalCharges is
# deliberately left out of the feature set.
numeric_part = df.select_dtypes(include=np.number)
df_numerical = numeric_part.drop(columns=['Churn', 'TotalCharges'])
df_numerical.head(2)

In [None]:
# One-hot encode the categorical columns into a dense DataFrame.
encoded = ohe.fit_transform(df_categorical)
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; prefer the new name and fall back on old releases.
feature_namer = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
X = pd.DataFrame(
    encoded.toarray(),  # ndarray rather than the discouraged np.matrix from .todense()
    columns=feature_namer(df_categorical.columns),
    # Carry the source index so the later column-wise concat with the numeric
    # features aligns row-for-row even if df's index is not 0..n-1.
    index=df_categorical.index,
)

In [None]:
# Full design matrix: numeric features followed by the one-hot columns.
# concat(axis=1) aligns on the row index, so both frames must share it.
X = pd.concat([df_numerical,X], axis=1)
X.head(2)

In [None]:
# Target vector: the Churn column.
y = df['Churn']

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so every score below is reproducible across
# runs; stratify=y keeps the churn / no-churn ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test-set information leaks into it.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# ref: https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(3)

## Logistic Regression Model

In [None]:
# Baseline logistic regression with default hyper-parameters (liblinear solver).
mdl = LogisticRegression(solver='liblinear')
# fit() returns the estimator itself, so `result` and `mdl` are the same object.
result = mdl.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline model on the held-out split, printed as a percentage.
score = mdl.score(X_test, y_test)
print('Accuracy Score %.2f' %(score*100))

In [None]:
# Hyper-parameter grid for logistic regression: regularisation strength C,
# penalty type, stopping tolerance, and iteration cap.
param_grid = {'C': [0.001,0.005,0.01,0.05,0.1,0.5,1,10], 
              'penalty' : ['l1', 'l2'],
              'tol' : [1e-4],
              'max_iter' : [100,500,1000]}

In [None]:
# Exhaustive 10-fold cross-validated search over param_grid on the training
# split, using all CPU cores (n_jobs=-1).
gs = GridSearchCV(mdl, param_grid, n_jobs=-1, cv=10)
gs.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
gs.best_params_

In [None]:
# Mean cross-validation score achieved by that best combination.
gs.best_score_

In [None]:
# Test accuracy of the tuned model.
# set_params() only changes hyper-parameters; it does not refit, so scoring
# `mdl` after set_params would still use the baseline fit's coefficients.
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training split as best_estimator_, so score that.
print('Accuracy Score %.2f' %(gs.best_estimator_.score(X_test, y_test)*100))

In [None]:
# Pair each feature name with its fitted coefficient on `mdl`, then order the
# pairs from most negative to most positive weight.
to_sort = dict(zip(X.columns.values, mdl.coef_.ravel()))
sort = dict(sorted(to_sort.items(), key=lambda pair: pair[1]))

In [None]:
# Horizontal bar chart of the sorted logistic-regression weights.
plt.figure(figsize=(12,12))
plt.suptitle("Weights of logistic regression model")
feature_labels = list(sort.keys())
feature_weights = list(sort.values())
sns.barplot(x=feature_weights, y=feature_labels)

## SVM Model

In [None]:
# Baseline support-vector classifier (default kernel, gamma='auto').
mdl2 = SVC(gamma='auto')
# fit() returns the estimator itself, so `result2` and `mdl2` are the same object.
result2 = mdl2.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline SVC on the held-out split, printed as a percentage.
score2 = result2.score(X_test, y_test)
print('Accuracy Score %.2f' %(score2*100))

In [None]:
# Hyper-parameter grid for the SVC: regularisation strength C, kernel type,
# gamma heuristic, and solver iteration cap (-1 means no limit).
param_grid2 = {'C': [0.001,0.005,0.01,0.05,0.1,0.5,1,10], 
               'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
               'gamma': ['auto', 'scale'],
               'max_iter' : [-1,100,500,1000]}

In [None]:
# Exhaustive 10-fold cross-validated search over param_grid2 on the training
# split, using all CPU cores (n_jobs=-1).
gs2 = GridSearchCV(result2, param_grid2, n_jobs=-1, cv=10)
gs2.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
gs2.best_params_

In [None]:
# Mean cross-validation score achieved by that best combination.
gs2.best_score_

In [None]:
# Test accuracy of the tuned SVC.
# set_params() does not refit: scoring `mdl2` after set_params would combine
# the new kernel/gamma settings with support vectors learned under the old
# ones. GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training split as best_estimator_, so score that.
print('Accuracy Score %.2f' %(gs2.best_estimator_.score(X_test, y_test)*100))

## AdaBoost Model

In [None]:
# Baseline AdaBoost classifier with default hyper-parameters.
mdl3 = AdaBoostClassifier()
# fit() returns the estimator itself, so `result3` and `mdl3` are the same object.
result3 = mdl3.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline AdaBoost model on the held-out split, as a percentage.
score3 = result3.score(X_test, y_test)
print('Accuracy Score %.2f' %(score3*100))

In [None]:
# Hyper-parameter grid for AdaBoost: number of boosting rounds and the
# learning rate.
param_grid3 = {'n_estimators': [10,25,50,100,250], 
               'learning_rate' : [0.01,0.05,0.1,0.5,1,10]}

In [None]:
# Exhaustive 10-fold cross-validated search over param_grid3 on the training
# split, using all CPU cores (n_jobs=-1).
gs3 = GridSearchCV(result3, param_grid3, n_jobs=-1, cv=10)
gs3.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
gs3.best_params_

In [None]:
# Mean cross-validation score achieved by that best combination.
gs3.best_score_

In [None]:
# Test accuracy of the tuned AdaBoost model.
# set_params() does not refit, so scoring `mdl3` after set_params would still
# use the ensemble trained with the default settings. GridSearchCV
# (refit=True by default) has already retrained the best configuration on the
# full training split as best_estimator_, so score that.
print('Accuracy Score %.2f' %(gs3.best_estimator_.score(X_test, y_test)*100))

In [None]:
# Pair each feature name with its AdaBoost importance on `mdl3`, then order
# the pairs from least to most important.
to_sort = dict(zip(X.columns.values, mdl3.feature_importances_))
sort = dict(sorted(to_sort.items(), key=lambda pair: pair[1]))

In [None]:
# Horizontal bar chart of every feature's AdaBoost importance.
plt.figure(figsize=(12,12))
plt.suptitle("Weights of AdaBoost model")
feature_labels = list(sort.keys())
feature_weights = list(sort.values())
sns.barplot(x=feature_weights, y=feature_labels)

In [None]:
# Same ranking as before, restricted to features with non-zero importance.
to_sort = dict(zip(X.columns.values, mdl3.feature_importances_))
nonzero_pairs = [(name, weight) for name, weight in to_sort.items() if weight != 0]
# sorted() is stable, so filtering before sorting yields the same order as
# sorting first and filtering afterwards.
sort = dict(sorted(nonzero_pairs, key=lambda pair: pair[1]))

In [None]:
# Horizontal bar chart of the non-zero AdaBoost importances only.
plt.figure(figsize=(12,8))
plt.suptitle("Weights of AdaBoost model")
feature_labels = list(sort.keys())
feature_weights = list(sort.values())
sns.barplot(x=feature_weights, y=feature_labels)

## ANN Model (if time permits)

## Discussion