In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import acquire
import prepare
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from acquire import get_titanic_data
from prepare import prep_titanic_data_beta

# New dataframe test: acquire the Titanic data and unpack the six pieces returned by
# prep_titanic_data_beta (features and target for train, validate and test).
X_train, X_validate, X_test, y_train, y_validate, y_test = prep_titanic_data_beta(get_titanic_data())

In [3]:
# Report the shape of every split: first the feature frames, then the targets.
for train_part, validate_part, test_part in (
    (X_train, X_validate, X_test),
    (y_train, y_validate, y_test),
):
    print("train: ", train_part.shape, ", validate: ", validate_part.shape, ", test: ", test_part.shape)

train:  (498, 5) , validate:  (214, 5) , test:  (179, 5)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [4]:
# Fit a logistic regression on the training split.
logit = LogisticRegression(C=1, random_state=123, solver='newton-cg')
# y_train is a single-column frame of shape (n, 1); scikit-learn expects a 1-D target.
# Flatten it explicitly rather than relying on an implicit conversion whose warning
# is silenced by warnings.filterwarnings("ignore").
y_train_1d = np.ravel(y_train)
logit.fit(X_train, y_train_1d)
# Make a prediction with training data
y_pred = logit.predict(X_train)
# Estimate class probabilities with training data
y_pred_proba = logit.predict_proba(X_train)
# Compute accuracy (mean accuracy of the predictions on the training split)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit.score(X_train, y_train_1d)))

Accuracy of Logistic Regression classifier on training set: 0.79


In [None]:
# Cleaned Data for Exploration
# Load the Telco data through the project's acquire module
# (cached=True presumably reads a local copy instead of querying SQL; confirm in acquire.py),
# then unpack the train / validate / test frames returned by prepare.prep_telco_data.
df = acquire.get_telco_data(cached = True)
train, validate, test = prepare.prep_telco_data(df)

In [None]:
# Column names, dtypes and non-null counts of the training split
train.info()

In [None]:
# Summary statistics with one row per described column, plus a max - min "range" column.
train_stats = train.describe().T.assign(
    range=lambda described: described['max'] - described['min']
)
train_stats

In [None]:
# Compute the correlation matrix (all train).
# Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises in corr() when the
# frame still holds text columns, while older versions dropped them silently.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (it mirrors the lower one)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Pairwise plots of the train columns, colored by churn
sns.pairplot(train, hue='churn')
plt.show()

In [None]:
# Pairwise plots of the train columns, colored by contract_type
sns.pairplot(train, hue='contract_type')

In [None]:
# Overall churn rate in train; the bar charts below draw it as a dashed reference line
churn_rate = train.churn.mean()

In [None]:
# Number of training rows in each churn class
train.churn.value_counts().plot.bar()
plt.xlabel('Churn')

In [None]:
# Mean churn per gender as bars; the dashed line marks the overall churn rate.
rate_by_gender = train.groupby('gender')['churn'].mean()
rate_by_gender.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per multiple_lines level as bars; the dashed line marks the overall churn rate.
rate_by_multiple_lines = train.groupby('multiple_lines')['churn'].mean()
rate_by_multiple_lines.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per internet_service level as bars; the dashed line marks the overall churn rate.
rate_by_internet_service = train.groupby('internet_service')['churn'].mean()
rate_by_internet_service.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per phone_service level as bars; the dashed line marks the overall churn rate.
rate_by_phone_service = train.groupby('phone_service')['churn'].mean()
rate_by_phone_service.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per internet_service_type as bars; the dashed line marks the overall churn rate.
rate_by_internet_type = train.groupby('internet_service_type')['churn'].mean()
rate_by_internet_type.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per online_security level as bars; the dashed line marks the overall churn rate.
rate_by_online_security = train.groupby('online_security')['churn'].mean()
rate_by_online_security.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per online_backup level as bars; the dashed line marks the overall churn rate.
rate_by_online_backup = train.groupby('online_backup')['churn'].mean()
rate_by_online_backup.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per contract_type as bars; the dashed line marks the overall churn rate.
rate_by_contract_type = train.groupby('contract_type')['churn'].mean()
rate_by_contract_type.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Mean churn per payment_type as bars; the dashed line marks the overall churn rate.
rate_by_payment_type = train.groupby('payment_type')['churn'].mean()
rate_by_payment_type.plot.bar(alpha=.8)
plt.ylabel('Churn Rate')
x_left, x_right = plt.xlim()
plt.hlines(churn_rate, x_left, x_right, ls='--', alpha=.8)

In [None]:
# Re-check column names, dtypes and non-null counts of train
train.info()

In [None]:
# Row counts for every churn x payment_type combination
pd.crosstab(train.churn, train.payment_type)

In [None]:
# Mean churn for each contract_type (rows) x internet_service_type (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='internet_service_type')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x payment_type (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='payment_type')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x streaming_movies (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='streaming_movies')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x streaming_tv (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='streaming_tv')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x tech_support (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='tech_support')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x device_protection (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='device_protection')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x online_backup (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='online_backup')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x online_security (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='online_security')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x multiple_lines (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='multiple_lines')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

In [None]:
# Mean churn for each contract_type (rows) x paperless_billing (columns) combination.
churn_grid = train.pivot_table(values='churn', index='contract_type', columns='paperless_billing')
sns.heatmap(churn_grid, cmap='Blues', annot=True)

## Observations
 - In month-to-month contracts, the churn rate for customers with multiple phone lines is 0.48
 - In month-to-month contracts, the churn rate for customers with no online security is 0.5
 - In month-to-month contracts, the churn rate for customers with no tech support is 0.5
 - In month-to-month contracts, the churn rate for customers paying by electronic check is 0.53
 - In month-to-month contracts, the churn rate for customers with fiber optic internet service is 0.54

In [None]:
# Re-check column names, dtypes and non-null counts of train
train.info()

In [None]:

# One point per training row: monthly_charges within each internet_service_type, colored by churn
plt.figure(figsize=(13, 7))
sns.swarmplot(data=train, y='monthly_charges', x='internet_service_type', hue='churn')

In [None]:
# Compute the correlation matrix (all train).
# Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises in corr() when the
# frame still holds text columns, while older versions dropped them silently.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (it mirrors the lower one)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Narrow train to the demographic / contract columns under study
train = train[['churn', 'gender', 'senior_citizen', 'partner', 'dependents', 'month_to_month_contract', 'internet_service_type']]

# Compute a correlation matrix and convert to long-form.
# The matrix is built from the narrowed train frame: the previous version used the raw
# `df`, which ignored the column selection above and mixed in validate/test rows.
# Only numeric/bool columns are kept because corr() cannot handle text columns.
corr_mat = (
    train.select_dtypes(include=['number', 'bool'])
    .corr()
    .stack()
    .reset_index(name="correlation")
)

# Draw each cell as a scatter point with varying size and color
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), edgecolor=".7",
    height=10, sizes=(50, 250), size_norm=(-.2, .8),
)

# # Tweak the figure to finalize
# g.set(xlabel="", ylabel="", aspect="equal")
# g.despine(left=True, bottom=True)
# g.ax.margins(.02)
# for label in g.ax.get_xticklabels():
#     label.set_rotation(90)
# for artist in g.legend.legendHandles:
#     artist.set_edgecolor(".7")


In [None]:
# Compute the correlation matrix.
# Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises in corr() when the
# frame still holds text columns, while older versions dropped them silently.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (it mirrors the lower one)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# 2nd heatmap, not sure which to use...
# Unmasked alternative with default colors; reuses `corr` from the previous cell,
# so that cell must run first.
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corr, annot=True, linewidths=.5, ax=ax)
plt.show()

In [None]:
# Sample variances of the churn and month_to_month_contract columns, shown as a tuple
train.churn.var(), train.month_to_month_contract.var()


Null Hypothesis:

$H_0$: Churn and month to month contracts are independent (not dependent)

$H_a$: churn and month to month contracts are dependent


In [None]:
# Chi-square test of independence: churn vs. month-to-month contract

# Significance level and the hypothesis under test
alpha = .05
null_hypothesis = "Churn and month to month contracts are independent"

# Contingency table of observed counts
observed = pd.crosstab(train.churn, train.month_to_month_contract)

# chi2_contingency yields the statistic, p-value, degrees of freedom and expected counts
chi2, p, degf, expected = stats.chi2_contingency(observed)

reject_null = p < alpha
if reject_null:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis")

print(p)


Null Hypothesis:

$H_0$: Churn and being a senior citizen are independent (not dependent)

$H_a$: Churn and being a senior citizen  are dependent


In [None]:
# Chi-square test of independence: churn vs. senior citizen

# Significance level and the hypothesis under test
alpha = .05
null_hypothesis = "Churn and being a senior citizen are independent"

# Contingency table of observed counts
observed = pd.crosstab(train.churn, train.senior_citizen)

# chi2_contingency yields the statistic, p-value, degrees of freedom and expected counts
chi2, p, degf, expected = stats.chi2_contingency(observed)

reject_null = p < alpha
if reject_null:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis")

print(p)

In [None]:
# Cleaned Data for Exploration
# Reload and re-split so `train` has all prepared columns again; the earlier cell
# that narrowed `train` to a column subset overwrote it.
df = acquire.get_telco_data(cached = True)
train, validate, test = prepare.prep_telco_data(df)
train.head()

In [None]:
# 2nd set of data to examine: churn plus the billing and add-on service columns
second_pass_columns = [
    'churn',
    'paperless_billing',
    'streaming_movies',
    'streaming_tv',
    'tech_support',
    'device_protection',
    'online_backup',
    'online_security',
    'multiple_lines',
    'monthly_charges',
]
train = train[second_pass_columns]


In [None]:
# Compute the correlation matrix.
# Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises in corr() when the
# frame still holds text columns, while older versions dropped them silently.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (it mirrors the lower one)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Null Hypothesis:

$H_0$: Churn and paperless billing are independent (not dependent)

$H_a$: Churn and paperless billing are dependent

In [None]:
# Chi-square test of independence: churn vs. paperless billing

# Significance level and the hypothesis under test
alpha = .05
null_hypothesis = "Churn and paperless billing are independent"

# Contingency table of observed counts
observed = pd.crosstab(train.churn, train.paperless_billing)

# chi2_contingency yields the statistic, p-value, degrees of freedom and expected counts
chi2, p, degf, expected = stats.chi2_contingency(observed)

reject_null = p < alpha
if reject_null:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis")

print(p)

In [None]:

# Two-sample t-test: do churned and retained customers differ in mean monthly charges?
# The previous call passed the churn flag itself and monthly_charges as the two samples,
# which compares the mean of a 0/1 indicator with the mean of a dollar amount and says
# nothing about how the two relate.
churned_charges = train.loc[train.churn == 1, 'monthly_charges']
retained_charges = train.loc[train.churn == 0, 'monthly_charges']

# equal_var=False (Welch's t-test): no check has been made that the two groups
# share a variance, so do not assume it.
t, p = stats.ttest_ind(churned_charges, retained_charges, equal_var=False)
print(f'''
t = {t:.4f}
p = {p:.8f}
''')


Null Hypothesis:

$H_0$: Churn and streaming tv are independent (not dependent)

$H_a$: Churn and streaming tv are dependent


In [None]:
# Chi-square test of independence: churn vs. streaming tv

# Significance level and the hypothesis under test
alpha = .05
null_hypothesis = "Churn and streaming tv are independent"

# Contingency table of observed counts
observed = pd.crosstab(train.churn, train.streaming_tv)

# chi2_contingency yields the statistic, p-value, degrees of freedom and expected counts
chi2, p, degf, expected = stats.chi2_contingency(observed)

reject_null = p < alpha
if reject_null:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis")

print(p)


In [None]:
#titanic = acquire.get_titanic_data(cached=True)

In [None]:
#train, validate, test = prepare.prep_titanic(titanic)

In [None]:
#train.info()

In [None]:
#validate.info()

In [None]:
#test.info()

# Messing around with titanic data for exploratory data analysis

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic_data

In [None]:
# Acquire and prepare the Titanic data.
# NOTE: this rebinds `df`, which previously held the Telco data.
df = prep_titanic_data(get_titanic_data())

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Row-by-row boolean mask: True where age is missing
df.age.isnull()

In [None]:
# survival_rate = train.survived.mean()

In [None]:
# train.survived.value_counts().plot.bar()
# plt.xlabel('Survived')

In [None]:
# train.groupby('sex').survived.mean().plot.bar(alpha=.8)
# plt.ylabel('Survival Rate')
# plt.hlines(survival_rate, *plt.xlim(), ls='--', alpha=.8)

In [None]:
# Survival rate per parch value, with the overall survival rate as a dashed line.
# Uses the prepared Titanic frame `df`: at this point `train` holds Telco data, and the
# cell that defined survival_rate is commented out, so the rate is computed here.
# NOTE(review): assumes prep_titanic_data keeps the 'parch' and 'survived' columns; confirm.
survival_rate = df.survived.mean()
df.groupby('parch').survived.mean().plot.bar(alpha=.8)
plt.ylabel('Survival Rate')
plt.hlines(survival_rate, *plt.xlim(), ls='--', alpha=.8)

In [None]:
# # Exploring 2 categorical variables, but now we're treating survived as a category, not a number.
# pd.crosstab(train.survived, train.sex)

In [None]:
# pd.crosstab(train.survived, train.alone)

In [None]:
# train.groupby('survived').age.describe()

In [None]:
# train.groupby('survived').fare.describe()

In [None]:
# plt.figure(figsize=(13, 7))
# sns.swarmplot(data=train, y='sex', x='age', hue='survived')

# Messing around with the TELCO project dataset

In [None]:
# We need to import the Telco data from SQL
# Connect to the telco_churn database
# Define a function that builds a SQL connection URL from personal credentials
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import acquire
import prepare
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from env import host, user, password

def get_db_url(database, user=user, host=host, password=password):
    """Build a SQLAlchemy connection URL for a MySQL database using the pymysql driver.

    Parameters
    ----------
    database : str
        Name of the database to connect to.
    user, host, password : str
        Connection credentials; they default to the values imported from env.

    Returns
    -------
    str
        URL of the form 'mysql+pymysql://user:password@host/database'.
    """
    from urllib.parse import quote_plus

    # Percent-encode the credentials: characters such as '@', ':' or '/' in a user name
    # or password would otherwise be read as URL delimiters and corrupt the URL.
    # Credentials without special characters produce the same URL as before.
    safe_user = quote_plus(str(user))
    safe_password = quote_plus(str(password))
    url = f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{database}'
    return url

# Connection URL for the telco_churn database
url = get_db_url('telco_churn')

# Customers joined to their contract-type, internet-service-type and payment-type lookup tables.
# NOTE: `select *` across these joins returns each *_type_id join column from both sides.
query = '''
select * 
from customers as c
join contract_types as ct
on ct.contract_type_id = c.contract_type_id
join internet_service_types as i_s
on i_s.internet_service_type_id = c.internet_service_type_id
join payment_types as pt
on pt.payment_type_id = c.payment_type_id;
'''
# Run the query and load the result into a DataFrame
df = pd.read_sql(query, url)

In [None]:
# #Cleaned Data

# Delete columns 'customer_id', contract_type_id, internet_service_type_id, payment_type_id.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
# once the columns are already gone.
id_columns = ['customer_id', 'contract_type_id', 'internet_service_type_id', 'payment_type_id']
df = df.drop(columns=id_columns, errors='ignore')

# Replace partner, dependents, churn, phone_service, paperless_billing with 1/0 values.
# The result is assigned back to the frame: `df.col.replace(..., inplace=True)` acts on a
# Series pulled out of the frame and, under pandas Copy-on-Write, does not update `df`.
# That failure would be invisible here because warnings are suppressed.
yes_no_columns = ['partner', 'dependents', 'churn', 'phone_service', 'paperless_billing']
for column in yes_no_columns:
    df[column] = df[column].replace({'Yes': 1, 'No': 0})
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# gender = df.gender.str.get_dummies()
# df = pd.concat([df, gender], axis=1)
# df.rename(columns = {'Female': 'is_female', 'Male': 'is_male'}, inplace = True)
# df.drop(columns = ['gender'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.multiple_lines.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_multiple_lines', 'Yes': 'yes_multiple_lines'}, inplace = True)
# df.drop(columns = ['multiple_lines'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.online_security.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_online_security', 'Yes': 'yes_online_security'}, inplace = True)
# df.drop(columns = ['online_security'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.online_backup.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_online_backup', 'Yes': 'yes_online_backup'}, inplace = True)
# df.drop(columns = ['online_backup'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.device_protection.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_device_protection', 'Yes': 'yes_device_protection'}, inplace = True)
# df.drop(columns = ['device_protection'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.tech_support.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_tech_support', 'Yes': 'yes_tech_support'}, inplace = True)
# df.drop(columns = ['tech_support'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.streaming_tv.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_streaming_tv', 'Yes': 'yes_streaming_tv'}, inplace = True)
# df.drop(columns = ['streaming_tv', 'No internet service'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.streaming_movies.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'No': 'no_streaming_movies', 'Yes': 'yes_streaming_movies'}, inplace = True)
# df.drop(columns = ['streaming_movies'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.contract_type.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'Month-to-month': 'month_to_month_contract', 'One year': 'one_year_contract', 'Two year': 'two_year_contract'}, inplace = True)
# df.drop(columns = ['contract_type'], inplace = True)
# Add dummy variables as new columns in dataframe and rename them, delete origional
#multiple = df.internet_service_type.str.get_dummies()
#df = pd.concat([df, multiple], axis=1)
#df.rename(columns = {'DSL': 'dsl', 'Fiber optic': 'fiber_optic'}, inplace = True)
# Flag rows whose internet_service_type is anything other than the literal string 'None',
# stored as 1/0 in a new internet_service column.
has_internet = df['internet_service_type'].ne('None')
result = has_internet.astype(int)
df['internet_service'] = result
#df.internet_service.replace(['Yes', 'No'], [1,0], inplace = True)
#df.drop(columns = ['internet_service_type','None'], inplace = True)
# # Add dummy variables as new columns in dataframe and rename them, delete origional
# multiple = df.payment_type.str.get_dummies()
# df = pd.concat([df, multiple], axis=1)
# df.rename(columns = {'Bank transfer (automatic)': 'auto_bank_transfer', 'Credit card (automatic)': 'auto_credit_card', 'Electronic check': 'e_check', 'Mailed check': 'mail_check'}, inplace = True)
# df.drop(columns = ['payment_type'], inplace = True)
# # Change total_charges to float from object
# df['total_charges'] = pd.to_numeric(df['total_charges'],errors='coerce')

In [None]:
# Row counts for each value of the new internet_service flag
df.internet_service.value_counts()

In [None]:
'''
Things to Consider:
1. How to handle colums that could be booleans (Yes/No/Null)
   -Keep as object or turn to int ?
   -Columns include '''
   
  

''' contract_type
    
2. Total charges change to float, it is currently an object
3. Delete origional dummy variable columns
'''

In [None]:
# dummy multiple lines

#df