# Imports

In [1]:
from env import user, password, host
import prepare

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Acquire

In [2]:
# Create a SQL query to get the data from SQL
query = '''
SELECT *
FROM customers
JOIN contract_types USING(`contract_type_id`)
JOIN internet_service_types USING(`internet_service_type_id`)
JOIN payment_types USING(`payment_type_id`)
'''

In [3]:
# Get the url for the SQL database, letting us query the telco_churn database
telco_url = f'mysql+pymysql://{user}:{password}@{host}/telco_churn'

In [4]:
telco = pd.read_sql(query, telco_url)

In [5]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,Yes,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,1,1,1,0015-UOCOJ,Female,1,No,No,7,Yes,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,1,1,1,0023-HGHWL,Male,1,No,No,1,No,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


## Create a Function and Test It

In [6]:
def get_telco_data():
    # Create a SQL query to get the data from SQL
    query = '''
            SELECT *
            FROM customers
            JOIN contract_types USING(`contract_type_id`)
            JOIN internet_service_types USING(`internet_service_type_id`)
            JOIN payment_types USING(`payment_type_id`)
            '''
    
    # Get the url for the SQL database, letting us query the telco_churn database
    telco_url = f'mysql+pymysql://{user}:{password}@{host}/telco_churn'
        
    return pd.read_sql(query, telco_url)

In [7]:
df = get_telco_data()
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,Yes,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,1,1,1,0015-UOCOJ,Female,1,No,No,7,Yes,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,1,1,1,0023-HGHWL,Male,1,No,No,1,No,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


# Prepare

In [8]:
train, test, validate = prepare.prep_telco(telco, train_size=.8, seed=123)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4507 entries, 1249 to 6958
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            4507 non-null   object 
 1   gender                 4507 non-null   object 
 2   senior_citizen         4507 non-null   int64  
 3   partner                4507 non-null   object 
 4   dependents             4507 non-null   object 
 5   tenure                 4507 non-null   int64  
 6   phone_service          4507 non-null   object 
 7   multiple_lines         4507 non-null   object 
 8   online_security        4507 non-null   object 
 9   online_backup          4507 non-null   object 
 10  device_protection      4507 non-null   object 
 11  tech_support           4507 non-null   object 
 12  streaming_tv           4507 non-null   object 
 13  streaming_movies       4507 non-null   object 
 14  paperless_billing      4507 non-null   object 
 15  m

## Split the Data

In [10]:
train, test = train_test_split(telco, train_size=.8, random_state=123)

In [11]:
train, validate = train_test_split(train, train_size=.8, random_state=123)

In [12]:
print(train.shape, test.shape, validate.shape)

(4507, 24) (1409, 24) (1127, 24)


## Create a Function and Test It

In [13]:
def split_data(df, train_size, seed):
    # Create the train and test sets
    train, test = train_test_split(df, train_size=train_size, random_state=seed)
    # Create the validate set by splitting from train set
    train, validate = train_test_split(train, train_size=train_size, random_state=seed)
    
    return train, test, validate

In [14]:
train, test, validate = split_data(telco, train_size=.8, seed=123)

In [15]:
print(train.shape, test.shape, validate.shape)

(4507, 24) (1409, 24) (1127, 24)


## Clean the Data

In [16]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

- Surprisingly, it looks like we don't have any nulls within the data
- We can patch out the type id's, since they were only useful for merging
    * We could've also done this in SQL, but I prefer getting to python as soon as possible
- Fields to look at:
    * gender: Currently an object, likely needs to be encoded
    * senior_citizen: It's an int type, does that mean it's encoded already?
    * partner: I have no clue what this field is for
    * dependents: Currently an object, either it's a bool or should be an int identifying how many dependents
    * phone_service - paperless_billing: Needs to be encoded
    * total_charges: Definitely should not be an object, likely needs to be a float
    * churn: Probably needs to be encoded


## Drop Columns

In [17]:
# Dropping all of the columns used just to merge the data together in the SQL query. These columns will provide nothing good for our models or exploration
telco.drop(columns=['payment_type_id',
                    'internet_service_type_id',
                    'contract_type_id'], inplace=True)

In [18]:
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,0013-MHZWF,Female,0,No,Yes,9,Yes,No,No,No,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,0015-UOCOJ,Female,1,No,No,7,Yes,No,Yes,No,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,0023-HGHWL,Male,1,No,No,1,No,No phone service,No,No,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


A little bit more manageable

## Fixing Datatypes

First, looking to patch up any features which are leading to type errors (looking at you total_charges)

In [19]:
# Check to see if there's a space in total_charges
telco[telco.total_charges.str.contains(' ')]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
1878,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,...,Yes,Yes,No,No,56.05,,No,Two year,DSL,Credit card (automatic)
1949,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,Yes,Yes,...,Yes,No,No,Yes,61.9,,No,Two year,DSL,Bank transfer (automatic)
2029,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,...,Yes,Yes,No,No,73.35,,No,Two year,DSL,Mailed check
2048,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,Yes,No,...,Yes,Yes,No,Yes,52.55,,No,Two year,DSL,Bank transfer (automatic)
2132,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,...,No,Yes,Yes,No,80.85,,No,Two year,DSL,Mailed check
6143,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,Yes,19.7,,No,One year,,Mailed check
6569,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,20.0,,No,Two year,,Mailed check
6605,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,20.25,,No,Two year,,Mailed check
6615,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,25.35,,No,Two year,,Mailed check
6686,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,25.75,,No,Two year,,Mailed check


Looks like we found the culprit: there's customers who haven't paid their first check yet, so total charges doesn't reflect it.
This leads to the first real choice for this project, do we impute or do we drop?
It would be best not to use these individuals for training, since none have churned and likely doesn't provide real value for train. But!
We could take this subset of customers and use them as a perfect example for a sub-test to test our predictions on.

Find the total_charges == ' ', then replace them with zero

In [20]:
telco.total_charges = telco.total_charges.str.replace(' ', '0')

In [21]:
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,0013-MHZWF,Female,0,No,Yes,9,Yes,No,No,No,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,0015-UOCOJ,Female,1,No,No,7,Yes,No,Yes,No,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,0023-HGHWL,Male,1,No,No,1,No,No phone service,No,No,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


In [22]:
# Save the newbies in a new dataframe
new_customers = telco[telco.total_charges.str.contains(' ')]

In [23]:
# Drop the newbies out of the primary dataframe
telco.drop(telco[telco.total_charges.str.contains(' ')].index, inplace=True)

In [24]:
telco.multiple_lines.value_counts()

No                  3390
Yes                 2971
No phone service     682
Name: multiple_lines, dtype: int64

Looks like multiple lines could be split up, possibly for a new field called no_phone_service. We'll make this one a bit later

In [25]:
telco.online_security.value_counts()

No                     3498
Yes                    2019
No internet service    1526
Name: online_security, dtype: int64

Same as with the previous, looks like we can generate no-internet-service into a new feature

In [26]:
telco.online_backup.value_counts()

No                     3088
Yes                    2429
No internet service    1526
Name: online_backup, dtype: int64

In [27]:
telco.device_protection.value_counts()

No                     3095
Yes                    2422
No internet service    1526
Name: device_protection, dtype: int64

In [28]:
telco.tech_support.value_counts()

No                     3473
Yes                    2044
No internet service    1526
Name: tech_support, dtype: int64

In [29]:
telco.streaming_tv.value_counts()

No                     2810
Yes                    2707
No internet service    1526
Name: streaming_tv, dtype: int64

In [30]:
telco.streaming_movies.value_counts()

No                     2785
Yes                    2732
No internet service    1526
Name: streaming_movies, dtype: int64

## Encoding

# Explore

# Model

# Evaluate

# Conclude