In [17]:
import pandas as pd
import numpy as np
from env import user, password, host

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings('ignore')

### Why are our customers churning?

- Pull in data

In [18]:
def get_db_url(dbname) -> str:
    """Return the SQLAlchemy connection URL for the MySQL database ``dbname``.

    The user, password and host are the names imported from ``env``; the URL
    uses the ``mysql+pymysql`` scheme.

    NOTE(review): the credentials are interpolated verbatim, so a password
    containing characters such as ``@`` or ``/`` would presumably need to be
    URL-escaped before it reaches this function -- confirm.
    """
    credentials = (user, password, host)
    return 'mysql+pymysql://{}:{}@{}/{}'.format(*credentials, dbname)

In [19]:
def get_telco_data():
    """Load the Telco customer data from the ``telco_churn`` database.

    Runs one query that LEFT JOINs ``customers`` to ``contract_types``,
    ``internet_service_types`` and ``payment_types`` so the result carries
    the type names alongside the customer columns.

    Returns:
        The query result as a pandas DataFrame.
    """
    telco_sql = '''
    SELECT customers.customer_id, gender, senior_citizen, partner, dependents, tenure, monthly_charges, total_charges, phone_service, multiple_lines, online_security, online_backup, device_protection, tech_support, streaming_tv, streaming_movies, paperless_billing, contract_types.contract_type, payment_types.payment_type,internet_service_types.internet_service_type, churn
FROM customers
LEFT JOIN contract_types ON customers.contract_type_id=contract_types.contract_type_id
LEFT JOIN internet_service_types ON customers.internet_service_type_id=internet_service_types.internet_service_type_id
LEFT JOIN payment_types ON customers.payment_type_id=payment_types.payment_type_id
    '''
    return pd.read_sql(telco_sql, get_db_url('telco_churn'))

In [20]:
# Pull the joined customer table into memory once; the cells below work on `df`.
df = get_telco_data()

In [21]:
# Preview the first rows to sanity-check the joined columns.
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
0,0003-MKNFE,Male,0,No,No,9,59.9,542.4,Yes,Yes,...,No,No,No,No,Yes,No,Month-to-month,Mailed check,DSL,No
1,0004-TLHLJ,Male,0,No,No,4,73.9,280.85,Yes,No,...,No,Yes,No,No,No,Yes,Month-to-month,Electronic check,Fiber optic,Yes
2,0011-IGKFF,Male,1,Yes,No,13,98.0,1237.85,Yes,No,...,Yes,Yes,No,Yes,Yes,Yes,Month-to-month,Electronic check,Fiber optic,Yes
3,0013-EXCHZ,Female,1,Yes,No,3,83.9,267.4,Yes,No,...,No,No,Yes,Yes,No,Yes,Month-to-month,Mailed check,Fiber optic,Yes
4,0013-MHZWF,Female,0,No,Yes,9,69.4,571.45,Yes,No,...,No,No,Yes,Yes,Yes,Yes,Month-to-month,Credit card (automatic),DSL,No


In [22]:
# Guard against NaN/None in the numeric columns of interest.
# Note: isna() only flags NaN/None -- blank strings are not counted here
# (total_charges is inspected for those in a later cell).
for column in ('total_charges', 'monthly_charges', 'tenure'):
    assert df[column].isna().sum() == 0, "There are missing values in this column. Handle them before proceeding"

In [23]:
# Null count per column. Only NaN/None are counted -- blank or
# whitespace-only strings are not nulls and will not show up here.
df.isnull().sum()

customer_id              0
gender                   0
senior_citizen           0
partner                  0
dependents               0
tenure                   0
monthly_charges          0
total_charges            0
phone_service            0
multiple_lines           0
online_security          0
online_backup            0
device_protection        0
tech_support             0
streaming_tv             0
streaming_movies         0
paperless_billing        0
contract_type            0
payment_type             0
internet_service_type    0
churn                    0
dtype: int64

In [24]:
# Trim surrounding whitespace so whitespace-only entries collapse to "".
df['total_charges'] = df['total_charges'].str.strip()

# Show the rows whose total_charges is now an empty string.
df.loc[df['total_charges'] == ""]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
1475,1371-DWPAZ,Female,0,Yes,Yes,0,56.05,,No,No phone service,...,Yes,Yes,Yes,Yes,No,No,Two year,Credit card (automatic),DSL,No
2670,2923-ARZLG,Male,0,Yes,Yes,0,19.7,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,One year,Mailed check,,No
2942,2520-SGTTA,Female,0,Yes,Yes,0,20.0,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
2979,2775-SEFEE,Male,0,No,Yes,0,61.9,,Yes,Yes,...,Yes,No,Yes,No,No,Yes,Two year,Bank transfer (automatic),DSL,No
3031,3115-CZMZD,Male,0,No,Yes,0,20.25,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3050,3213-VVOLG,Male,0,Yes,Yes,0,25.35,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3204,4075-WKNIU,Female,0,Yes,Yes,0,73.35,,Yes,Yes,...,Yes,Yes,Yes,Yes,No,No,Two year,Mailed check,DSL,No
3242,4367-NUYAO,Male,0,Yes,Yes,0,25.75,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3252,4472-LVYGI,Female,0,Yes,Yes,0,52.55,,No,No phone service,...,No,Yes,Yes,Yes,No,Yes,Two year,Bank transfer (automatic),DSL,No
4710,5709-LVOEQ,Female,0,Yes,Yes,0,80.85,,Yes,No,...,Yes,Yes,No,Yes,Yes,No,Two year,Mailed check,DSL,No


In [25]:
# TODO: enable once the blank total_charges entries are handled -- astype(float) raises on "".
# df.total_charges = df.total_charges.astype(float)

In [26]:
# Check dtypes and non-null counts; total_charges is still object (string) at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customer_id              7043 non-null object
gender                   7043 non-null object
senior_citizen           7043 non-null int64
partner                  7043 non-null object
dependents               7043 non-null object
tenure                   7043 non-null int64
monthly_charges          7043 non-null float64
total_charges            7043 non-null object
phone_service            7043 non-null object
multiple_lines           7043 non-null object
online_security          7043 non-null object
online_backup            7043 non-null object
device_protection        7043 non-null object
tech_support             7043 non-null object
streaming_tv             7043 non-null object
streaming_movies         7043 non-null object
paperless_billing        7043 non-null object
contract_type            7043 non-null object
payment_type             7043 non-null object
internet_service_typ

### Prep still needed:
> - impute total charges that have empty strings
> - change total charges to a float
> - make the data tidy
> - encode churn column

### encoding

In [27]:
# step 1 split
# Pin random_state so the same rows land in train/test on every
# Restart & Run All; without it each run produces a different split.
train, test = train_test_split(df, random_state=123)

In [28]:
# step 2 for loop w/ list of columns to encode
encode_list = [
    'gender', 'partner', 'dependents', 'phone_service',
    'multiple_lines', 'online_security', 'online_backup',
    'device_protection', 'tech_support',
    'streaming_movies', 'streaming_tv', 'paperless_billing', 'churn',
]

# Fit one LabelEncoder per column on train only, then apply it to test.
# Each fitted encoder is kept in `encoders` so the integer codes can be mapped
# back to labels later (e.g. encoders['churn'].inverse_transform(...)); a
# single shared encoder only remembers the last column it was fitted on.
# After the loop `encoder` still refers to the encoder fitted on the last
# column in encode_list, as before.
encoders = {}
for col in encode_list:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])
    # transform() raises ValueError if test holds a label not seen in train.
    test[col] = encoder.transform(test[col])
    encoders[col] = encoder

In [29]:
# Confirm the columns in encode_list now hold integer codes.
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
3460,5092-STPKP,0,0,0,0,24,56.35,1381.2,1,0,...,0,0,2,0,0,0,Month-to-month,Credit card (automatic),DSL,0
6129,9067-SQTNS,1,0,1,1,44,20.6,926.0,1,0,...,1,1,1,1,1,0,One year,Bank transfer (automatic),,0
2189,3721-CNEYS,0,0,0,0,2,70.95,137.95,1,0,...,0,0,0,0,0,1,Month-to-month,Electronic check,Fiber optic,1
3068,3280-MRDOF,1,1,0,0,30,69.1,2093.9,1,2,...,0,2,2,2,0,1,Two year,Credit card (automatic),DSL,0
3149,3764-MNMOI,1,0,0,0,46,19.2,908.15,1,0,...,1,1,1,1,1,0,Two year,Credit card (automatic),,0
