# Imports

In [20]:
from env import user, password, host

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Acquire

In [2]:
# Create a SQL query to get the data from SQL
query = '''
SELECT *
FROM customers
JOIN contract_types USING(`contract_type_id`)
JOIN internet_service_types USING(`internet_service_type_id`)
JOIN payment_types USING(`payment_type_id`)
'''

In [3]:
# Get the url for the SQL database, letting us query the telco_churn database
telco_url = f'mysql+pymysql://{user}:{password}@{host}/telco_churn'

In [4]:
telco = pd.read_sql(query, telco_url)

In [5]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,Yes,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,1,1,1,0015-UOCOJ,Female,1,No,No,7,Yes,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,1,1,1,0023-HGHWL,Male,1,No,No,1,No,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


# Prepare

## Split the Data

In [21]:
train, test = train_test_split(telco, train_size = .8, random_state=123)

In [22]:
train, validate = train_test_split(train, train_size=.8, random_state=123)

In [23]:
print(train.shape, test.shape, validate.shape)

(4500, 18) (1407, 18) (1125, 18)


## Clean the Data

In [6]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

- Surprisingly, it looks like we don't have any nulls within the data
- We can patch out the type id's, since they were only useful for merging
    * We could've also done this in SQL, but I prefer getting to python as soon as possible
- Fields to look at:
    * gender: Currently an object, likely needs to be encoded
    * senior_citizen: It's an int type, does that mean it's encoded already?
    * partner: I have no clue what this field is for
    * dependents: Currently an object, either it's a bool or should be an int identifying how many dependents
    * phone_service - paperless_billing: Needs to be encoded
    * total_charges: Definitely should not be an object, likely needs to be a float
    * churn: Probably needs to be encoded


## Drop Columns

In [7]:
# Dropping all of the columns used just to merge the data together in the SQL query. These columns will provide nothing good for our models or exploration
telco.drop(columns=['payment_type_id',
                    'internet_service_type_id',
                    'contract_type_id',          
                    'contract_type',
                    'internet_service_type',
                    'payment_type'], inplace=True)

In [8]:
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn
0,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No
1,0013-MHZWF,Female,0,No,Yes,9,Yes,No,No,No,No,Yes,Yes,Yes,Yes,69.4,571.45,No
2,0015-UOCOJ,Female,1,No,No,7,Yes,No,Yes,No,No,No,No,No,Yes,48.2,340.35,No
3,0023-HGHWL,Male,1,No,No,1,No,No phone service,No,No,No,No,No,No,Yes,25.1,25.1,Yes
4,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,No,No,No,No,No,30.5,30.5,Yes


A little bit more manageable

## Fixing Datatypes

First, looking to patch up any features which are leading to type errors (looking at you total_charges)

In [9]:
# Check to see if there's a space in total_charges
telco[telco.total_charges.str.contains(' ')]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn
1878,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,Yes,Yes,Yes,No,No,56.05,,No
1949,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,61.9,,No
2029,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,73.35,,No
2048,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,Yes,No,Yes,Yes,Yes,No,Yes,52.55,,No
2132,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,80.85,,No
6143,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,19.7,,No
6569,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.0,,No
6605,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.25,,No
6615,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.35,,No
6686,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.75,,No


Looks like we found the culprit: there's customers who haven't paid their first check yet, so total charges doesn't reflect it.
This leads to the first real choice for this project, do we impute or do we drop?
It would be best not to use these individuals for training, since none have churned and likely doesn't provide real value for train. But!
We could take this subset of customers and use them as a perfect example for a sub-test to test our predictions on.

In [10]:
# Save the newbies in a new dataframe
new_customers = telco[telco.total_charges.str.contains(' ')]

In [11]:
# Drop the newbies out of the primary dataframe
telco.drop(telco[telco.total_charges.str.contains(' ')].index, inplace=True)

In [12]:
telco.multiple_lines.value_counts()

No                  3385
Yes                 2967
No phone service     680
Name: multiple_lines, dtype: int64

Looks like multiple lines could be split up, possibly for a new field called no_phone_service. We'll make this one a bit later

In [13]:
telco.online_security.value_counts()

No                     3497
Yes                    2015
No internet service    1520
Name: online_security, dtype: int64

Same as with the previous, looks like we can generate no-internet-service into a new feature

In [14]:
telco.online_backup.value_counts()

No                     3087
Yes                    2425
No internet service    1520
Name: online_backup, dtype: int64

In [15]:
telco.device_protection.value_counts()

No                     3094
Yes                    2418
No internet service    1520
Name: device_protection, dtype: int64

In [16]:
telco.tech_support.value_counts()

No                     3472
Yes                    2040
No internet service    1520
Name: tech_support, dtype: int64

In [17]:
telco.streaming_tv.value_counts()

No                     2809
Yes                    2703
No internet service    1520
Name: streaming_tv, dtype: int64

In [18]:
telco.streaming_movies.value_counts()

No                     2781
Yes                    2731
No internet service    1520
Name: streaming_movies, dtype: int64

## Encoding

# Explore

# Model

# Evaluate

# Conclude