# COGS 118B Final Project
### Authors:
- Michelle Tran
- Christopher Ly

<strong style="color:red">Just for reference for working on GitHub:</strong>
<p style="color:red">Be sure to clear the notebook's output before pushing to the repo; this keeps the commit history clean. You can do this by following the sequence below:</p>

`Cell > All Output > Clear`

In [None]:
# Core data and plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Project-local helpers (not referenced by name in the cells shown here).
import src.utils
import seaborn as sns
sns.set(style='darkgrid', palette='rainbow')
# Preprocessing objects are instantiated once here and reused by later cells.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
# Model selection utilities and the baseline classifier.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Exploratory Data Analysis

In [None]:
# Relative path of the CSV file loaded into `df` by the next cell.
fp = 'data/telco.csv'

In [None]:
# Load the raw CSV into a DataFrame and preview its first rows.
df = pd.read_csv(fp)
df.head()

In [None]:
# Inspect the column dtypes as parsed by read_csv, before any cleaning.
df.dtypes

In [None]:
# The customer identifier is dropped so it is not treated as a feature.
df = df.drop(columns=['customerID'])
# Recode the 0/1 flag as 'No'/'Yes' so the column becomes a string category.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Re-check the dtypes after the cleaning step above.
df.dtypes

In [None]:
# Preview the rows whose TotalCharges is missing after numeric coercion.
df[df['TotalCharges'].isnull()].head()

In [None]:
# Rows with a missing TotalCharges are treated as first-month customers:
# tenure is set to 1 and the total is filled with one monthly charge.
missing_total = df['TotalCharges'].isnull()
df.loc[missing_total, 'tenure'] = 1
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is not guaranteed to modify `df`
# (pandas Copy-on-Write), which would silently leave the NaNs in place.
# The previous blanket replacement of every 0 with 1 in these rows was dropped:
# tenure is already set explicitly above, and replacing zeros in all other
# columns of those rows could only corrupt unrelated values.
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'])
# Sanity check: per-column count of remaining missing values.
df.isnull().sum()

In [None]:
# Bar chart of how many rows fall into each Churn class.
sns.countplot(x='Churn', data=df)
plt.title('No Churn and Churn')
plt.show()

In [None]:
# Share of each churn class among the rows labelled 'No' or 'Yes'.
no_churn_rows = df[df['Churn'] == 'No']
churn_rows = df[df['Churn'] == 'Yes']
count_no_churn = len(no_churn_rows)
count_churn = len(churn_rows)
labelled_total = count_no_churn + count_churn
pct_of_no_churn = count_no_churn/labelled_total
pct_of_churn = count_churn/labelled_total
print('Percent of No Churn %.2f' %(pct_of_no_churn*100))
print('Percent of Churn %.2f' %(pct_of_churn*100))

In [None]:
# One count plot per column, split by Churn, laid out on a 3x2 grid.
# Columns are listed in row-major order so each lands on the same axes as before.
fig, axes = plt.subplots(3,2, figsize=(15,12))
grid_columns = [
    ['gender', 'SeniorCitizen'],
    ['Partner', 'Dependents'],
    ['PhoneService', 'PaperlessBilling'],
]
for row_idx, row_columns in enumerate(grid_columns):
    for col_idx, column in enumerate(row_columns):
        sns.countplot(x=column, hue='Churn', data=df, ax=axes[row_idx][col_idx])
plt.show()

In [None]:
# One count plot per column, split by Churn, laid out on a 5x2 grid.
# Columns are listed in row-major order so each lands on the same axes as before.
fig, axes = plt.subplots(5,2, figsize=(15,25))
grid_columns = [
    ['MultipleLines', 'InternetService'],
    ['OnlineSecurity', 'OnlineBackup'],
    ['DeviceProtection', 'TechSupport'],
    ['StreamingTV', 'StreamingMovies'],
    ['Contract', 'PaymentMethod'],
]
for row_idx, row_columns in enumerate(grid_columns):
    for col_idx, column in enumerate(row_columns):
        sns.countplot(x=column, hue='Churn', data=df, ax=axes[row_idx][col_idx])
plt.show()

In [None]:
# Overlay the tenure distributions of the two churn classes.
# Plot order ('Yes' first, then 'No') must match the legend labels below.
for churn_value in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_value, 'tenure'])
plt.title('Density of Tenure in Months for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Tenure (Months)')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the MonthlyCharges distributions of the two churn classes.
# Plot order ('Yes' first, then 'No') must match the legend labels below.
for churn_value in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_value, 'MonthlyCharges'])
plt.title('Density of Monthly Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Monthly Charges')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the TotalCharges distributions of the two churn classes.
# Plot order ('Yes' first, then 'No') must match the legend labels below.
for churn_value in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_value, 'TotalCharges'])
plt.title('Density of Total Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Total Charges')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Encode the target as 0 (no churn) / 1 (churn) for the classifiers below.
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

In [None]:
# Confirm the dtypes after encoding Churn; the next cells split columns by dtype.
df.dtypes

In [None]:
# Object-dtype columns are treated as categorical features (one-hot encoded later).
df_categorical = df.select_dtypes(include=[object])
df_categorical.head()

In [None]:
# Integer/float columns are the numerical features; the encoded target
# 'Churn' is numeric too, so it is removed from the feature frame.
numeric_with_target = df.select_dtypes(include=[int, float])
df_numerical = numeric_with_target.drop(columns=['Churn'])
df_numerical.head()

In [None]:
# One-hot encode the categorical columns into a dense DataFrame.
encoded = ohe.fit_transform(df_categorical)
# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_columns = ohe.get_feature_names_out(df_categorical.columns)
else:
    encoded_columns = ohe.get_feature_names(df_categorical.columns)
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing df_categorical's index keeps the rows aligned with the numerical
# columns when the two frames are concatenated afterwards.
X = pd.DataFrame(encoded.toarray(), columns=encoded_columns, index=df_categorical.index)

In [None]:
# Full feature matrix: numerical columns followed by the one-hot encoded columns.
# Column-wise concat aligns rows on the index of both frames.
X = pd.concat([df_numerical,X], axis=1)
X.head()

In [None]:
# Target vector: the Churn column (encoded as 0/1 by an earlier cell).
y = df['Churn']

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score derived from it) reproducible across notebook runs, and
# stratify=y keeps the churn / no-churn ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression Model

In [None]:
# Fit the logistic regression (default hyper-parameters) on the training split.
mdl = logreg.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out test split and print it as a percentage.
score = mdl.score(X_test, y_test)
print('Accuracy Score %.2f' %(score*100))

In [None]:
# Cross-validate the logistic regression on the training split (cv=5).
training_scores = cross_val_score(logreg, X_train, y_train, cv=5)
print('CV Accuracy Scores', training_scores)

In [None]:
# Hyper-parameter grid for the logistic regression search.
# The solver is pinned to 'liblinear' because the grid includes the 'l1'
# penalty, which LogisticRegression's default 'lbfgs' solver does not support;
# without it every 'l1' candidate fails to fit during the grid search.
param_grid = {'C': [0.001,0.005,0.01,0.05,0.1,0.5,1,10],
             'penalty' : ['l1', 'l2'],
             'solver' : ['liblinear'],
             'tol' : [1e-4],
             'max_iter' : [100,500,1000]}

In [None]:
# Grid search over param_grid with cv=10 on the training split; n_jobs=-1
# lets the search run candidates in parallel.
gs = GridSearchCV(mdl, param_grid, n_jobs=-1, cv=10)
gs.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the logistic regression grid search.
gs.best_params_

In [None]:
# Cross-validated score achieved by the selected parameter combination.
gs.best_score_

In [None]:
# Test-set accuracy of the tuned model. The grid search refits the best
# parameter combination on the training data (refit is left at its default)
# and exposes that model as best_estimator_. Calling set_params on the
# already-fitted `mdl` only rewrites its hyper-parameter attributes without
# refitting, so its score would not reflect the tuned parameters.
print('Accuracy Score %.2f' %(gs.best_estimator_.score(X_test, y_test)*100))

In [None]:
# Pair every feature column with its learned coefficient, then order the
# pairs from the most negative to the most positive coefficient.
flat_coefficients = mdl.coef_.ravel()
to_sort = dict(zip(X.columns.values, flat_coefficients))
sort = dict(sorted(to_sort.items(), key=lambda pair: pair[1]))

In [None]:
# Horizontal bar chart: coefficient value on x, feature name on y,
# in the sorted order prepared by the previous cell.
feature_names = list(sort.keys())
feature_weights = list(sort.values())
plt.figure(figsize=(12,12))
plt.suptitle("Weights of logistic regression model")
sns.barplot(x=feature_weights, y=feature_names)

## SVM Model

In [None]:
from sklearn.svm import SVC as svc

In [None]:
# Support vector classifier with default hyper-parameters, fitted on the training split.
mdl2 = svc()
result2 = mdl2.fit(X_train, y_train)

In [None]:
# Score the fitted SVM on the held-out test split and print it as a percentage.
score2 = result2.score(X_test, y_test)
print('Accuracy Score %.2f' %(score2*100))

In [None]:
# Cross-validate the SVM on the training split (cv=5).
training_scores2 = cross_val_score(mdl2, X_train, y_train, cv=5)
print('CV Accuracy Scores', training_scores2)

In [None]:
# Hyper-parameter grid for the SVM search: regularisation strength,
# kernel type and iteration cap (-1 is one of the candidate values).
param_grid2 = dict(
    C=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    max_iter=[-1, 100, 500, 1000],
)

In [None]:
# Grid search over param_grid2 with cv=10 on the training split; n_jobs=-1
# lets the search run candidates in parallel.
gs2 = GridSearchCV(result2, param_grid2, n_jobs=-1, cv=10)
gs2.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the SVM grid search.
gs2.best_params_

In [None]:
# Cross-validated score achieved by the selected SVM parameter combination.
gs2.best_score_

In [None]:
# Test-set accuracy of the tuned SVM. The grid search refits the best
# parameter combination on the training data (refit is left at its default)
# and exposes that model as best_estimator_. Calling set_params on the
# already-fitted `mdl2` changes its hyper-parameters without refitting, so
# its predictions would mix the old fit with the new settings.
print('Accuracy Score %.2f' %(gs2.best_estimator_.score(X_test, y_test)*100))

## AdaBoost Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier as ada

In [None]:
# AdaBoost classifier with default hyper-parameters, fitted on the training split.
mdl3 = ada()
result3 = mdl3.fit(X_train, y_train)

In [None]:
# Score the fitted AdaBoost model on the held-out test split and print it as a percentage.
score3 = result3.score(X_test, y_test)
print('Accuracy Score %.2f' %(score3*100))

In [None]:
# Cross-validate the AdaBoost model on the training split (cv=5).
training_scores3 = cross_val_score(mdl3, X_train, y_train, cv=5)
print('CV Accuracy Scores', training_scores3)

In [None]:
# Hyper-parameter grid for the AdaBoost search: ensemble size and learning rate.
param_grid3 = dict(
    n_estimators=[10, 25, 50, 100, 250],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1, 10],
)

In [None]:
# Grid search over param_grid3 with cv=10 on the training split; n_jobs=-1
# lets the search run candidates in parallel.
gs3 = GridSearchCV(result3, param_grid3, n_jobs=-1, cv=10)
gs3.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the AdaBoost grid search.
gs3.best_params_

In [None]:
# Cross-validated score achieved by the selected AdaBoost parameter combination.
gs3.best_score_

In [None]:
# Test-set accuracy of the tuned AdaBoost model. The grid search refits the
# best parameter combination on the training data (refit is left at its
# default) and exposes that model as best_estimator_. Calling set_params on
# the already-fitted `mdl3` only rewrites its hyper-parameter attributes
# without refitting, so its score would not reflect the tuned parameters.
print('Accuracy Score %.2f' %(gs3.best_estimator_.score(X_test, y_test)*100))

In [None]:
# Pair every feature column with the importance reported by the AdaBoost
# model, then order the pairs from least to most important.
to_sort = dict(zip(X.columns.values, mdl3.feature_importances_))
sort = dict(sorted(to_sort.items(), key=lambda pair: pair[1]))

In [None]:
# Horizontal bar chart: importance value on x, feature name on y,
# in the sorted order prepared by the previous cell.
feature_names = list(sort.keys())
feature_weights = list(sort.values())
plt.figure(figsize=(12,12))
plt.suptitle("Weights of AdaBoost model")
sns.barplot(x=feature_weights, y=feature_names)

In [None]:
# Same ranking as before, but features with an importance of exactly 0 are
# left out so the follow-up chart only shows features the model used.
to_sort = dict(zip(X.columns.values, mdl3.feature_importances_))
ranked_pairs = sorted(to_sort.items(), key=lambda pair: pair[1])
sort = dict((name, weight) for name, weight in ranked_pairs if weight != 0)

In [None]:
# Horizontal bar chart of the non-zero importances: value on x, feature name on y.
feature_names = list(sort.keys())
feature_weights = list(sort.values())
plt.figure(figsize=(12,8))
plt.suptitle("Weights of AdaBoost model")
sns.barplot(x=feature_weights, y=feature_names)

## ANN Model (if time permits)

## Discussion