# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

   # used to aquire dataset
import acquire_telco as aq
   # used to prepare and clean the dataset
import prepare_telco as pp

# Data Aquistion

In [2]:
# The function get_telco_data() I created in the file acquire.py pulls the dataset straight from SQL
# UNLESS the data has already been downloaded as a .csv,
# in which case the function will pull the data straight from that file instead.

df = aq.get_telco_data()
df.info()

Reading from csv file...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-nul

# Data Preperation

In [3]:
# The function prep_telco() that I created in the file prepare.py takes in the Telco dataframe and cleans it
# It does this by dropping uneeded columns ['payment_type_id', 'internet_service_type_id', 'contract_type_id']
# I also made sure the function cleared any nulls from the dataset
# Lastly, I make it easier to manipulate the data by creating dummy column for each categorical column.

In [4]:
df = pp.prep_telco(df)
df.head(2)

Unnamed: 0,customer_id,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No phone service,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,0,9,65.6,593.3,0,1,1,1,0,...,0,1,0,1,0,0,0,0,0,1
1,0003-MKNFE,0,9,59.9,542.4,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1


In [5]:
# check for null values
df.isna().sum()

customer_id                              0
senior_citizen                           0
tenure                                   0
monthly_charges                          0
total_charges                            0
gender_Male                              0
partner_Yes                              0
dependents_Yes                           0
phone_service_Yes                        0
multiple_lines_No phone service          0
multiple_lines_Yes                       0
online_security_No internet service      0
online_security_Yes                      0
online_backup_No internet service        0
online_backup_Yes                        0
device_protection_No internet service    0
device_protection_Yes                    0
tech_support_No internet service         0
tech_support_Yes                         0
streaming_tv_No internet service         0
streaming_tv_Yes                         0
streaming_movies_No internet service     0
streaming_movies_Yes                     0
paperless_b

In [6]:
# rename columns for readability

df.rename(columns={'gender_Male': 'is_male',
                   'partner_Yes': 'married',
                   'dependents_Yes': 'children',
                   'phone_service_Yes': 'phone_service',
                   'multiple_lines_No phone service': 'no_phone_multiple_lines',
                   'multiple_lines_Yes': 'multiple_lines',
                   'online_security_No internet service': 'no_internet_online_security',
                   'online_security_Yes': 'online_security',
                   'online_backup_No internet service': 'no_internet_online_backup',
                   'online_backup_Yes': 'online_backup',
                   'device_protection_No internet service': 'no_internet_device_protection',
                   'device_protection_Yes': 'device_protection',
                   'tech_support_No internet service': 'no_internet_tech_support',
                   'tech_support_Yes': 'tech_support',
                   'streaming_tv_No internet service': 'no_internet_streaming_tv',
                   'streaming_tv_Yes': 'streaming_tv',
                   'streaming_movies_No internet service': 'no_internet_streaming_movies',
                   'streaming_movies_Yes': 'streaming_movies',
                   'paperless_billing_Yes': 'paperless_billing',
                   'churn_Yes': 'churn',
                   'contract_type_One year': 'one_year_contract',
                   'contract_type_Two year': 'two_year_contract',
                   'internet_service_type_Fiber optic': 'fiber_optic',
                   'internet_service_type_None': 'no_internet',
                   'payment_type_Credit card (automatic)': 'card_auto_pay',
                   'payment_type_Electronic check': 'electronic_check',
                   'payment_type_Mailed check': 'mailed_check'}, inplace=True)

# also drop customer id for training purposes
df = df.drop(columns=['customer_id'])

In [7]:
# check the sum of each column to see if there is any redundancy
# doing so shows that several columns made for the dummy list can be represented by just 1 column
# the 'no_internet' column
# may want to drop redundant columns later
df.isna().sum()

senior_citizen                   0
tenure                           0
monthly_charges                  0
total_charges                    0
is_male                          0
married                          0
children                         0
phone_service                    0
no_phone_multiple_lines          0
multiple_lines                   0
no_internet_online_security      0
online_security                  0
no_internet_online_backup        0
online_backup                    0
no_internet_device_protection    0
device_protection                0
no_internet_tech_support         0
tech_support                     0
no_internet_streaming_tv         0
streaming_tv                     0
no_internet_streaming_movies     0
streaming_movies                 0
paperless_billing                0
churn                            0
one_year_contract                0
two_year_contract                0
fiber_optic                      0
no_internet                      0
card_auto_pay       

In [11]:
# almost done prepping the data,
# now to split the data into our train, validate, and test samples

train, test = train_test_split(df, 
                               train_size = 0.8,
                               random_state=1313)

train, validate = train_test_split(train,
                                  train_size = 0.7,
                                  random_state=1313)

train.shape, validate.shape, test.shape

((3943, 31), (1691, 31), (1409, 31))

# Exploratory Data Analysis and Statistical Testing

# Modeling to fit the Data

# Model Evaluation

# Finding and Key Takeaways