# COGS 118B Final Project
### Authors:
- Michelle Tran
- Christopher Ly

<strong style="color:red">Just for reference for working on GitHub:</strong>
<p style="color:red">Be sure to clear the notebook's output before pushing to the repo; this keeps the commit history clean. You can do this by following the menu sequence below:</p>

`Cell > All Output > Clear`

In [None]:
import sys
sys.path.insert(0,'src')
import utils

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import f1_score, fbeta_score

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Use seaborn's 'darkgrid' theme for the plots drawn in this notebook.
sns.set(style='darkgrid')#, palette='rainbow')

## Load in data

In [None]:
# Relative path of the CSV file that is read into `df`.
fp = 'data/telco.csv'

In [None]:
# Load the raw table, then remove the customerID column before any analysis.
df = pd.read_csv(fp)
df = df.drop(columns=['customerID'])
df.head()

In [None]:
# Snapshot of the dataset before any cleaning; it becomes the first row of `summary`.
churn_col = df['Churn']
bf_etl = {
    'Num of features': len(df.columns) - 1,  # Churn is the label, not a feature
    'Num of datapoints': len(df),
    'Num of not churned': sum(churn_col == 'No'),
    'Num of churned': sum(churn_col == 'Yes'),
    '% churned': np.mean(churn_col == 'Yes') * 100,
}
summary = pd.DataFrame(bf_etl, index=['Before etl'])
summary

## Clean data

In [None]:
# Remove every row whose tenure is 0, then renumber the remaining rows from 0.
df = df.drop(df[df['tenure']==0].index).reset_index(drop=True)
# Parse TotalCharges as numbers. errors='coerce' is left disabled, so a value
# that cannot be parsed raises here instead of silently turning into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])#, errors='coerce')
df.head(2)
# Disabled alternative to dropping rows: set zero tenure to 1 and fill missing
# TotalCharges with MonthlyCharges.
# df['tenure'] = df['tenure'].replace({0: 1})

# df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'])

## Exploratory Data Analysis

In [None]:
# NOTE(review): utils.revert comes from the project's utils module and is not shown
# here; presumably it maps encoded columns back to readable labels for the plots
# that follow — confirm against the helper.
df = utils.revert(df)

In [None]:
# ref: https://stackoverflow.com/questions/33179122/seaborn-countplot-with-frequencies
# Bar chart of the Churn classes, with each bar labelled by its share of all rows.
n_rows = len(df)
ax = sns.countplot(x=df['Churn'])
plt.title('Churn rate/occurrence')
ax.set_ylabel('Count')
ax.set_ylim(0, n_rows)

for bar in ax.patches:
    corners = bar.get_bbox().get_points()  # [[x0, y0], [x1, y1]] of the bar
    bar_top = corners[1, 1]
    # Centre the percentage horizontally on the bar, sitting just above its top.
    ax.annotate(f'{100. * bar_top / n_rows:.2f}%',
                (corners[:, 0].mean(), bar_top),
                ha='center', va='bottom')

plt.show()

In [None]:
# Project helper from utils (not shown here); judging by its name it plots the
# categorical columns of df — TODO confirm. The trailing ';' suppresses the
# cell's return value in the notebook output.
utils.plot_cat(df);

In [None]:
# Overlay the tenure distributions of churned ('Yes') and retained ('No') customers.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'tenure'])
plt.title('Density of Tenure in Months for Churn and No Churn')
plt.xlabel('Tenure (Months)')
plt.ylabel('Probability Density')
plt.legend(['Churn', 'No Churn'])  # same order as the curves drawn above
plt.show()

In [None]:
# Overlay the MonthlyCharges distributions of churned ('Yes') and retained ('No') customers.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'MonthlyCharges'])
plt.title('Density of Monthly Charges for Churn and No Churn')
plt.xlabel('Monthly Charges')
plt.ylabel('Probability Density')
plt.legend(['Churn', 'No Churn'])  # same order as the curves drawn above
plt.show()

In [None]:
# Overlay the TotalCharges distributions of churned ('Yes') and retained ('No') customers.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'TotalCharges'])
plt.title('Density of Total Charges for Churn and No Churn')
plt.xlabel('Total Charges')
plt.ylabel('Probability Density')
plt.legend(['Churn', 'No Churn'])  # same order as the curves drawn above
plt.show()

## Data Transformation

In [None]:
# Columns that each describe one optional internet add-on service.
addOns = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]


def internet_add_ons(r):
    """Return how many internet add-on services the row `r` has enabled.

    `r` is any mapping-like row (e.g. a DataFrame row) indexable by column
    name. A row whose 'InternetService' is 'No' always counts as 0; otherwise
    the result is the number of `addOns` columns whose value is 'Yes'.
    """
    if r['InternetService'] == 'No':
        return 0
    return sum(1 for service in addOns if r[service] == 'Yes')

In [None]:
# Add a per-row count of enabled internet add-ons by applying internet_add_ons
# to every row (axis=1).
df['InternetAddOns'] = df.apply(internet_add_ons, axis=1)

# Replaces other distplots, requires `pip install --upgrade seaborn`
# NOTE(review): utils.plot_num is not shown here; presumably it plots the numerical columns — confirm.
utils.plot_num(df);

In [None]:
# Overlay the InternetAddOns distributions of churned ('Yes') and retained ('No') customers.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'InternetAddOns'])
plt.title('Density of Internet Add Ons for Churn and No Churn')
plt.xlabel('Internet Add Ons')
plt.ylabel('Probability Density')
plt.legend(['Churn', 'No Churn'])  # same order as the curves drawn above
plt.show()

In [None]:
# NOTE(review): utils.convert is not shown here. Later cells treat Churn as a
# numeric 0/1 column, so it presumably encodes labels numerically — confirm.
df = utils.convert(df)

In [None]:
# Standardizer for the feature matrix; it is fitted on the training split further down.
scaler = StandardScaler()
# One-hot encoder for the categorical columns. handle_unknown='ignore' makes
# transform tolerate categories not seen during fit instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Categorical inputs: every object-typed column except the individual add-on
# columns listed in addOns.
df_categorical = df.select_dtypes(include=object).drop(columns=addOns)
# Numerical inputs: every numeric column except the Churn label and TotalCharges.
df_numerical = df.select_dtypes(include=np.number).drop(columns=['Churn', 'TotalCharges'])

In [None]:
# One-hot encode the categorical columns. toarray() gives a plain ndarray
# (todense() returns the discouraged np.matrix type).
encoded = ohe.fit_transform(df_categorical).toarray()
# get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; prefer the new name and fall back so older installs keep working.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_cols = ohe.get_feature_names_out(df_categorical.columns)
else:
    encoded_cols = ohe.get_feature_names(df_categorical.columns)
X = pd.DataFrame(encoded, columns=encoded_cols)
# Feature matrix: numerical columns first, then the one-hot columns.
X = pd.concat([df_numerical, X], axis=1)
# Target vector.
y = df['Churn']
X.head(2)

In [None]:
# Same statistics as the 'Before etl' row, recomputed on the transformed data.
af_etl = {
    'Num of features': X.shape[1],
    'Num of datapoints': X.shape[0],
    'Num of not churned': sum(df['Churn']==0),
    'Num of churned': sum(df['Churn']),
    '% churned': np.mean(df['Churn'])*100
}
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two rows the
# same way on every pandas version. `summary` is deliberately not reassigned,
# so re-running this cell never accumulates duplicate rows.
pd.concat([summary, pd.DataFrame(af_etl, index=['After etl'])])

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported afterwards, reproducible across runs.
# stratify=y keeps the churn ratio the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Fit the scaler on the training rows only, then apply that same scaling to
# the test rows so no test-set statistics influence training.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Class balance of the train and test splits, one table row per split.
split_labels = (y_train, y_test)
af_spt = {
    'Num of datapoints': [X_train.shape[0], X_test.shape[0]],
    'Num of not churned': [sum(labels == 0) for labels in split_labels],
    'Num of churned': [sum(labels) for labels in split_labels],
    '% churned': [np.mean(labels) * 100 for labels in split_labels],
}
pd.DataFrame(af_spt, index=['Train data', 'Test data'])

In [None]:
# ref: https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
# Pairwise correlation between the numerical feature columns.
corr = df_numerical.corr()
# Render the matrix as a colour-graded table with display precision 3.
# NOTE(review): Styler.set_precision may not exist on newer pandas releases — confirm against the installed version.
corr.style.background_gradient(cmap='coolwarm').set_precision(3)

In [None]:
# Settings shared by the three model sections.
res_ix = ['Logistic Regression', 'SVM', 'AdaBoost']  # row labels of the final results table
ave = 'binary'             # average parameter for F1 score
beta = 5                   # beta parameter for Fβ score
scr = 'balanced_accuracy'  # scoring parameter for grid search

In [None]:
# Results table, filled in by the model sections: one column per metric, one
# slot per model. Each metric gets its own list so slots are updated independently.
res = {
    metric: [0, 0, 0]
    for metric in (
        'Accuracy before grid search',
        'Accuracy after',
        'F1 Score',
        'Fβ Score',
    )
}

## Logistic Regression Model

In [None]:
# Baseline logistic regression with default hyper-parameters (liblinear solver).
mdl = LogisticRegression(solver='liblinear')
# fit returns the estimator itself, so `result` refers to the same object as `mdl`.
result = mdl.fit(X_train, y_train)
# Test-set accuracy of the baseline, in percent.
score = mdl.score(X_test, y_test)*100

In [None]:
# Hyper-parameter grid for the logistic regression (liblinear supports l1 and l2).
param_grid = {
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10],
    'penalty': ['l1', 'l2'],
    'tol': [1e-4],
    'max_iter': [100, 500, 1000],
}
# 10-fold cross-validated search on the training split, using all CPU cores.
gs = GridSearchCV(mdl, param_grid, n_jobs=-1, cv=10, scoring=scr)
gs.fit(X_train, y_train)
# Use the estimator that GridSearchCV refit on the full training split with the
# best parameters. Calling set_params on the old model only changes its
# hyper-parameters and keeps the coefficients of the baseline fit, so every
# "after grid search" metric would describe the untuned model.
mdl = gs.best_estimator_

In [None]:
# Best parameter combination found by the search and its cross-validated score.
gs.best_params_, gs.best_score_

In [None]:
# Store the logistic regression metrics in slot 0 of the results table.
lr_pred = mdl.predict(X_test)  # predict once and reuse for both F-scores
res['Accuracy before grid search'][0] = score
res['Accuracy after'][0] = 100 * mdl.score(X_test, y_test)
res['F1 Score'][0] = f1_score(y_test, lr_pred, average=ave)
res['Fβ Score'][0] = fbeta_score(y_test, lr_pred, average=ave, beta=beta)

In [None]:
# Horizontal bar chart of the logistic regression coefficients, smallest first.
weight_by_feature = dict(zip(X.columns.values, mdl.coef_.reshape((-1,))))
ranked = sorted(weight_by_feature.items(), key=lambda pair: pair[1])
feature_names = [name for name, _ in ranked]
feature_weights = [weight for _, weight in ranked]

plt.figure(figsize=(12,12))
plt.suptitle("Weights of logistic regression model")
sns.barplot(x=feature_weights, y=feature_names);

## SVM Model

In [None]:
# Baseline support vector classifier with gamma='auto' and otherwise default settings.
mdl2 = SVC(gamma='auto')
# fit returns the estimator itself, so `result2` refers to the same object as `mdl2`.
result2 = mdl2.fit(X_train, y_train)
# Test-set accuracy of the baseline, in percent.
score2 = result2.score(X_test, y_test)*100

In [None]:
# Hyper-parameter grid for the support vector classifier.
param_grid2 = {
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'max_iter': [100, 500, 1000],
}
# 10-fold cross-validated search on the training split, using all CPU cores.
gs2 = GridSearchCV(result2, param_grid2, n_jobs=-1, cv=10, scoring=scr)
gs2.fit(X_train, y_train)
# Use the estimator that GridSearchCV refit on the full training split with the
# best parameters. set_params alone would leave the baseline fit in place, so
# the "after grid search" metrics would not reflect the tuned model.
mdl2 = gs2.best_estimator_

In [None]:
# Best parameter combination found by the SVM search and its cross-validated score.
gs2.best_params_, gs2.best_score_

In [None]:
# Store the SVM metrics in slot 1 of the results table.
svm_pred = mdl2.predict(X_test)  # predict once and reuse for both F-scores
res['Accuracy before grid search'][1] = score2
res['Accuracy after'][1] = 100 * mdl2.score(X_test, y_test)
res['F1 Score'][1] = f1_score(y_test, svm_pred, average=ave)
res['Fβ Score'][1] = fbeta_score(y_test, svm_pred, average=ave, beta=beta)

## AdaBoost Model

In [None]:
# Baseline AdaBoost classifier with default hyper-parameters.
mdl3 = AdaBoostClassifier()
# fit returns the estimator itself, so `result3` refers to the same object as `mdl3`.
result3 = mdl3.fit(X_train, y_train)
# Test-set accuracy of the baseline, in percent.
score3 = result3.score(X_test, y_test)*100

In [None]:
# Hyper-parameter grid for the AdaBoost classifier.
param_grid3 = {
    'n_estimators': [10, 25, 50, 100, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 10],
}
# 10-fold cross-validated search on the training split, using all CPU cores.
gs3 = GridSearchCV(result3, param_grid3, n_jobs=-1, cv=10, scoring=scr)
gs3.fit(X_train, y_train)
# Use the estimator that GridSearchCV refit on the full training split with the
# best parameters. set_params alone would leave the baseline fit in place, so
# the "after" metrics and feature importances would not reflect the tuned model.
mdl3 = gs3.best_estimator_

In [None]:
# Best parameter combination found by the AdaBoost search and its cross-validated score.
gs3.best_params_, gs3.best_score_

In [None]:
# Store the AdaBoost metrics in slot 2 of the results table.
ada_pred = mdl3.predict(X_test)  # predict once and reuse for both F-scores
res['Accuracy before grid search'][2] = score3
res['Accuracy after'][2] = 100 * mdl3.score(X_test, y_test)
res['F1 Score'][2] = f1_score(y_test, ada_pred, average=ave)
res['Fβ Score'][2] = fbeta_score(y_test, ada_pred, average=ave, beta=beta)

In [None]:
# Horizontal bar chart of every AdaBoost feature importance, smallest first.
importance_by_feature = dict(zip(X.columns.values, mdl3.feature_importances_))
ranked = sorted(importance_by_feature.items(), key=lambda pair: pair[1])
feature_names = [name for name, _ in ranked]
importances = [value for _, value in ranked]

plt.figure(figsize=(12,12))
plt.suptitle("Weights of AdaBoost model")
sns.barplot(x=importances, y=feature_names);

In [None]:
# Same chart as the full importance plot, restricted to features whose
# importance is non-zero.
importance_by_feature = dict(zip(X.columns.values, mdl3.feature_importances_))
ranked = [
    (name, value)
    for name, value in sorted(importance_by_feature.items(), key=lambda pair: pair[1])
    if value != 0
]
feature_names = [name for name, _ in ranked]
importances = [value for _, value in ranked]

plt.figure(figsize=(12,8))
plt.suptitle("Weights of AdaBoost model")
sns.barplot(x=importances, y=feature_names);

## Discussion

In [None]:
# Final comparison table: one row per model (res_ix), one column per metric in res.
pd.DataFrame(res, index=res_ix)

In [None]:
# Percentage of test rows on which logistic regression (mdl) and the SVM (mdl2) predict different labels.
np.mean(mdl.predict(X_test) != mdl2.predict(X_test))*100

In [None]:
# Percentage of test rows on which logistic regression (mdl) and AdaBoost (mdl3) predict different labels.
np.mean(mdl.predict(X_test) != mdl3.predict(X_test))*100

In [None]:
# Percentage of test rows on which the SVM (mdl2) and AdaBoost (mdl3) predict different labels.
np.mean(mdl2.predict(X_test) != mdl3.predict(X_test))*100