# COGS 118B Final Project
### Authors:
- Michelle Tran
- Christopher Ly

<strong style="color:red">For reference when working on GitHub:</strong>
<p style="color:red">Be sure to clear the notebook's output before pushing to the repo; this keeps the commit history clean. You can do this by following the sequence below:</p>

`Cell > All Output > Clear`

In [None]:
# Third-party libraries: numerics, tabular data, plotting and modelling.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project-local helpers.
import src.utils

# Global seaborn theme used by every figure drawn in this notebook.
sns.set(style='darkgrid', palette='rainbow')

# Shared preprocessing / model instances reused by the modelling cells.
scaler = StandardScaler()
logreg = LogisticRegression()

## Exploratory Data Analysis

In [None]:
# Relative path of the CSV file that is loaded into `df` with pd.read_csv.
fp = 'data/telco.csv'

In [None]:
# Read the CSV at `fp` into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=fp)
df.head(n=5)

In [None]:
# Show the dtype pandas inferred for each column of the freshly loaded table.
df.dtypes

In [None]:
# The customerID column is removed from the table.
df = df.drop(columns=['customerID'])
# Recode the 0/1 SeniorCitizen flag as 'No'/'Yes' strings.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
# Convert TotalCharges to numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Show the column dtypes of the current `df` again.
df.dtypes

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Display the rows whose TotalCharges value is missing.
df[df['TotalCharges'].isna()]

In [None]:
# todo: replace dropping with fill of monthly charges
# Keep only the rows that have a TotalCharges value.
df = df[df['TotalCharges'].notna()]

In [None]:
# Count the missing values in each column of the current `df`.
df.isna().sum()

In [None]:
# Bar chart of how many rows fall into each Churn category.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title('No Churn and Churn')
plt.show()

In [None]:
# Number of rows in each Churn class, counted from boolean masks.
count_no_churn = int((df['Churn'] == 'No').sum())
count_churn = int((df['Churn'] == 'Yes').sum())

# Share of each class among the rows labelled 'No' or 'Yes'.
total_labelled = count_no_churn + count_churn
pct_of_no_churn = count_no_churn / total_labelled
pct_of_churn = count_churn / total_labelled

print('Percent of No Churn %.2f' % (pct_of_no_churn * 100))
print('Percent of Churn %.2f' % (pct_of_churn * 100))

In [None]:
# Count plots, split by Churn, for six columns laid out on a 3x2 grid.
# The panels are filled row by row, in the order the columns are listed.
grid_columns = [
    'gender', 'SeniorCitizen',
    'Partner', 'Dependents',
    'PhoneService', 'PaperlessBilling',
]
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
for column, panel in zip(grid_columns, axes.ravel()):
    sns.countplot(x=column, hue='Churn', data=df, ax=panel)
plt.show()

In [None]:
# Count plots, split by Churn, for ten columns laid out on a 5x2 grid.
# The panels are filled row by row, in the order the columns are listed.
grid_columns = [
    'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
]
fig, axes = plt.subplots(5, 2, figsize=(15, 25))
for column, panel in zip(grid_columns, axes.ravel()):
    sns.countplot(x=column, hue='Churn', data=df, ax=panel)
plt.show()

In [None]:
# Overlay the tenure distributions of the 'Yes' and 'No' churn groups on one
# axes; the groups are drawn in the same order as the legend labels below.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider migrating to histplot/kdeplot once the seaborn version is pinned.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'tenure'])
plt.title('Density of Tenure in Months for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Tenure (Months)')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the MonthlyCharges distributions of the 'Yes' and 'No' churn groups
# on one axes; the groups are drawn in the same order as the legend labels.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider migrating to histplot/kdeplot once the seaborn version is pinned.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'MonthlyCharges'])
plt.title('Density of Monthly Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Monthly Charges')
plt.ylabel('Probability Density')
plt.show()

In [None]:
# Overlay the TotalCharges distributions of the 'Yes' and 'No' churn groups
# on one axes; the groups are drawn in the same order as the legend labels.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider migrating to histplot/kdeplot once the seaborn version is pinned.
for churn_label in ('Yes', 'No'):
    sns.distplot(df.loc[df['Churn'] == churn_label, 'TotalCharges'])
plt.title('Density of Total Charges for Churn and No Churn')
plt.legend(['Churn', 'No Churn'])
plt.xlabel('Total Charges')
plt.ylabel('Probability Density')
plt.show()

## Logistic Regression Model

In [None]:
# todo: ohe on both datasets
# Feature matrix: one-hot encode the categorical columns. The target column
# is removed *before* encoding, so no Churn_* indicator can end up among the
# features and the step does not depend on both classes being present.
X = pd.get_dummies(df.drop(columns=['Churn']))
# Target vector: explicit 'No' -> 0 / 'Yes' -> 1 mapping with an integer
# dtype. Series.replace relied on implicit object->int downcasting, which is
# deprecated in pandas 2.2 and can leave an object-dtype target that the
# classifier rejects. Any label other than 'No'/'Yes' maps to NaN and makes
# the astype call raise instead of slipping through unnoticed.
y = df['Churn'].map({'No': 0, 'Yes': 1}).astype('int64')

In [None]:
# Hold out 20% of the rows for testing.
# - stratify=y keeps the churn / no-churn ratio the same in both splits, so
#   the minority class is represented in the test set in proportion.
# - random_state pins the shuffle so every score below is reproducible from
#   run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit the logistic regression on the training split. scikit-learn's fit()
# returns the estimator itself, so `result` is the fitted `logreg`.
logreg.fit(X_train, y_train)
result = logreg

In [None]:
# Score the fitted model on the held-out test split and report it as a
# percentage.
score = result.score(X=X_test, y=y_test)
print('Accuracy Score %.2f' % (score * 100))

In [None]:
# 5-fold cross-validation of the logistic regression on the training split.
# NOTE(review): if X_train was already standardised on the whole training
# split, each fold's validation part has influenced the scaling; wrap
# scaler + model in a Pipeline if strict fold isolation is wanted.
training_scores = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=5)
print('CV Accuracy Scores', training_scores)

In [None]:
# Candidate values of LogisticRegression's `C` parameter for the grid search.
param_grid = dict(C=[0.1, 0.3, 0.5, 0.7, 1])

In [None]:
# Exhaustive search over the `C` values in `param_grid`, scored with 5-fold
# cross-validation on the training split. GridSearchCV is imported here
# because the notebook's import cell does not bring it into scope.
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(logreg, param_grid, cv=5)
# Building the search object alone does nothing; fit() runs the search.
grid_search.fit(X_train, y_train)
print('Best Parameters', grid_search.best_params_)
print('Best CV Accuracy %.2f' % (grid_search.best_score_ * 100))

## SVM Model

## AdaBoost Model

## ANN Model (if time permits)

## Discussion