In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from acquire import get_telco_data

In [2]:
df = get_telco_data()

In [3]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
0,0002-ORFBO,Female,0,Yes,Yes,9,65.6,593.3,Yes,No,...,Yes,No,Yes,Yes,No,Yes,2,2,1,No
1,0003-MKNFE,Male,0,No,No,9,59.9,542.4,Yes,Yes,...,No,No,No,No,Yes,No,1,2,1,No
2,0004-TLHLJ,Male,0,No,No,4,73.9,280.85,Yes,No,...,No,Yes,No,No,No,Yes,1,1,2,Yes
3,0011-IGKFF,Male,1,Yes,No,13,98.0,1237.85,Yes,No,...,Yes,Yes,No,Yes,Yes,Yes,1,1,2,Yes
4,0013-EXCHZ,Female,1,Yes,No,3,83.9,267.4,Yes,No,...,No,No,Yes,Yes,No,Yes,1,2,2,Yes


In [4]:
df.shape

(7043, 21)

In [5]:
# Check for nulls in the columns of interest
assert df.total_charges.isna().sum() == 0, "There are missing values in this column. Handle them before proceeding"
assert df.monthly_charges.isna().sum() == 0, "There are missing values in this column. Handle them before proceeding"
assert df.tenure.isna().sum() == 0, "There are missing values in this column. Handle them before proceeding"

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
monthly_charges             7043 non-null float64
total_charges               7043 non-null object
phone_service               7043 non-null object
multiple_lines              7043 non-null object
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
paperless_billing           7043 non-null object
contract_type_id            7043 non-null int64
payment_type_

In [7]:
#why is total_charges an object?
df.total_charges.value_counts(dropna=False)

           11
20.2       11
19.75       9
19.9        8
19.65       8
           ..
4287.2      1
68.75       1
1423.05     1
1818.9      1
1345.55     1
Name: total_charges, Length: 6531, dtype: int64

- It looks like there are 11 observations where the total_charges is an empty string
- Need to change total_charges to a float

In [8]:
# Make any string with multiple spaces into an empty space value
df.total_charges = df.total_charges.str.strip()

# Count the number of empty string entries
df[df.total_charges == ""]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
945,1371-DWPAZ,Female,0,Yes,Yes,0,56.05,,No,No phone service,...,Yes,Yes,Yes,Yes,No,No,3,4,1,No
1731,2520-SGTTA,Female,0,Yes,Yes,0,20.0,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,3,2,3,No
1906,2775-SEFEE,Male,0,No,Yes,0,61.9,,Yes,Yes,...,Yes,No,Yes,No,No,Yes,3,3,1,No
2025,2923-ARZLG,Male,0,Yes,Yes,0,19.7,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,2,2,3,No
2176,3115-CZMZD,Male,0,No,Yes,0,20.25,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,3,2,3,No
2250,3213-VVOLG,Male,0,Yes,Yes,0,25.35,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,3,2,3,No
2855,4075-WKNIU,Female,0,Yes,Yes,0,73.35,,Yes,Yes,...,Yes,Yes,Yes,Yes,No,No,3,2,1,No
3052,4367-NUYAO,Male,0,Yes,Yes,0,25.75,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,3,2,3,No
3118,4472-LVYGI,Female,0,Yes,Yes,0,52.55,,No,No phone service,...,No,Yes,Yes,Yes,No,Yes,3,3,1,No
4054,5709-LVOEQ,Female,0,Yes,Yes,0,80.85,,Yes,No,...,Yes,Yes,No,Yes,Yes,No,3,2,1,No


It looks like those values are blank bc the tenure is 0. I will change it to 1 since they were probably customers for about a month

In [9]:
df.tenure.value_counts().sort_index()

0      11
1     613
2     238
3     200
4     176
     ... 
68    100
69     95
70    119
71    170
72    362
Name: tenure, Length: 73, dtype: int64

In [10]:
# Replace any tenures of 0 with 1

df.tenure.replace(0, 1, inplace=True)
# # Validate my tenure count for value 1

df.tenure.value_counts().sort_index()

1     624
2     238
3     200
4     176
5     133
     ... 
68    100
69     95
70    119
71    170
72    362
Name: tenure, Length: 72, dtype: int64

In [11]:
df[df.tenure == 1]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
17,0021-IKXGC,Female,1,No,No,1,72.10,72.1,Yes,Yes,...,No,No,No,No,No,Yes,1,1,2,No
19,0023-HGHWL,Male,1,No,No,1,25.10,25.1,No,No phone service,...,No,No,No,No,No,Yes,1,1,1,Yes
25,0032-PGELS,Female,0,Yes,Yes,1,30.50,30.5,No,No phone service,...,No,No,No,No,No,No,1,3,1,Yes
48,0082-LDZUE,Male,0,No,No,1,44.30,44.3,Yes,No,...,No,No,No,No,No,Yes,1,2,1,No
63,0107-WESLM,Male,0,No,No,1,19.85,19.85,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,1,1,3,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6980,9907-SWKKF,Female,1,No,No,1,25.05,25.05,No,No phone service,...,No,No,No,No,No,No,1,2,1,Yes
7007,9940-RHLFB,Female,0,No,No,1,75.30,75.3,Yes,No,...,No,Yes,No,No,No,No,1,1,2,Yes
7021,9962-BFPDU,Female,0,Yes,Yes,1,20.05,20.05,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,1,2,3,No
7033,9975-SKRNR,Male,0,No,No,1,18.90,18.9,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,1,2,3,No


In [12]:
# Replace the blank total_charges with the monthly_charge for tenure == 1
df.total_charges.replace('', df.monthly_charges, inplace=True)

In [13]:
# Validate my changes
df[df.tenure == 1]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
17,0021-IKXGC,Female,1,No,No,1,72.10,72.1,Yes,Yes,...,No,No,No,No,No,Yes,1,1,2,No
19,0023-HGHWL,Male,1,No,No,1,25.10,25.1,No,No phone service,...,No,No,No,No,No,Yes,1,1,1,Yes
25,0032-PGELS,Female,0,Yes,Yes,1,30.50,30.5,No,No phone service,...,No,No,No,No,No,No,1,3,1,Yes
48,0082-LDZUE,Male,0,No,No,1,44.30,44.3,Yes,No,...,No,No,No,No,No,Yes,1,2,1,No
63,0107-WESLM,Male,0,No,No,1,19.85,19.85,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,1,1,3,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6980,9907-SWKKF,Female,1,No,No,1,25.05,25.05,No,No phone service,...,No,No,No,No,No,No,1,2,1,Yes
7007,9940-RHLFB,Female,0,No,No,1,75.30,75.3,Yes,No,...,No,Yes,No,No,No,No,1,1,2,Yes
7021,9962-BFPDU,Female,0,Yes,Yes,1,20.05,20.05,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,1,2,3,No
7033,9975-SKRNR,Male,0,No,No,1,18.90,18.9,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,1,2,3,No


In [14]:
df.total_charges = df.total_charges.astype(float)

In [15]:
def wrangle_telco():
    df = acquire.get_telco_data()
    df.tenure.replace(0, 1, inplace=True)
    df.total_charges = df.total_charges.str.strip()
    df.total_charges.replace('', df.monthly_charges, inplace=True)
    df.total_charges = df.total_charges.astype(float)
    return df

In [16]:
# step 1 split
train, test = train_test_split(df, random_state=123, train_size=.86)
train, validate = train_test_split(train, random_state=123, train_size=.83)

In [17]:
# step 2 for loop w/ list of columns to encode
encoder = LabelEncoder()

encode_list = [
    'gender', 'partner', 'dependents', 'phone_service'
    , 'multiple_lines', 'online_security', 'online_backup'
    , 'device_protection', 'tech_support'
    , 'streaming_movies', 'streaming_tv', 'paperless_billing', 'churn'
    ]
               
               
               
for e in encode_list:
    train[e] = encoder.fit_transform(train[e])
    test[e] = encoder.transform(test[e])

In [18]:
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
4782,6728-DKUCO,0,0,1,1,72,104.15,7303.05,1,2,...,2,0,0,2,2,1,2,1,2,0
3552,5032-USPKF,0,0,0,0,38,84.1,3187.65,1,2,...,2,2,0,2,2,0,2,3,1,0
1688,2460-NGXBJ,1,1,1,1,11,75.2,775.3,1,2,...,0,0,0,0,0,1,1,4,2,0
3210,4603-JANFB,1,0,0,0,1,69.85,69.85,1,0,...,0,0,0,0,0,1,1,1,2,1
2647,3761-FLYZI,0,1,1,0,65,108.8,7082.45,1,2,...,2,2,0,2,2,1,1,3,2,0


In [19]:
train.shape

(5026, 21)

In [20]:
test.shape

(987, 21)

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5026 entries, 4782 to 4315
Data columns (total 21 columns):
customer_id                 5026 non-null object
gender                      5026 non-null int64
senior_citizen              5026 non-null int64
partner                     5026 non-null int64
dependents                  5026 non-null int64
tenure                      5026 non-null int64
monthly_charges             5026 non-null float64
total_charges               5026 non-null float64
phone_service               5026 non-null int64
multiple_lines              5026 non-null int64
online_security             5026 non-null int64
online_backup               5026 non-null int64
device_protection           5026 non-null int64
tech_support                5026 non-null int64
streaming_tv                5026 non-null int64
streaming_movies            5026 non-null int64
paperless_billing           5026 non-null int64
contract_type_id            5026 non-null int64
payment_type_id      

In [22]:
# How many people have churned?
train.churn.value_counts()

0    3706
1    1320
Name: churn, dtype: int64

Let's try to narrow down which features have an effect on churn
try a .corr 

In [23]:
train.corr()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type_id,payment_type_id,internet_service_type_id,churn
gender,1.0,-0.004663,-0.007578,0.016105,-0.003627,-0.020876,-0.010098,0.000425,-0.017346,-0.012186,-0.022398,0.004285,-0.009498,-0.004764,-0.004436,-0.019429,0.003868,-0.005449,0.00104,-0.004412
senior_citizen,-0.004663,1.0,0.017866,-0.20891,0.009552,0.220709,0.096819,0.011648,0.139979,-0.125236,-0.004461,-0.031011,-0.14998,0.024609,0.038489,0.152232,-0.144449,-0.089539,-0.027149,0.150677
partner,-0.007578,0.017866,1.0,0.442386,0.384309,0.125799,0.330268,0.029932,0.160946,0.129755,0.156249,0.149445,0.125166,0.143017,0.134767,0.00315,0.287108,0.13837,-0.007884,-0.145413
dependents,0.016105,-0.20891,0.442386,1.0,0.162196,-0.103421,0.066173,0.001268,-0.017779,0.136732,0.078686,0.067737,0.126025,0.044739,0.026201,-0.108924,0.237651,0.136012,0.032273,-0.164659
tenure,-0.003627,0.009552,0.384309,0.162196,1.0,0.25296,0.827702,0.015866,0.345716,0.329188,0.379456,0.358512,0.325898,0.285212,0.294898,0.020344,0.672822,0.333982,-0.031001,-0.356516
monthly_charges,-0.020876,0.220709,0.125799,-0.103421,0.25296,1.0,0.654077,0.24436,0.432819,-0.054794,0.123092,0.165367,0.002501,0.327965,0.33923,0.354714,-0.070806,-0.077827,-0.322427,0.19222
total_charges,-0.010098,0.096819,0.330268,0.066173,0.827702,0.654077,1.0,0.115594,0.455608,0.254908,0.382655,0.37714,0.283258,0.384166,0.39782,0.168335,0.449305,0.217945,-0.176332,-0.203565
phone_service,0.000425,0.011648,0.029932,0.001268,0.015866,0.24436,0.115594,1.0,-0.023924,-0.021123,0.016128,0.001735,-0.018417,0.051544,0.051769,0.010544,-0.002758,-0.013222,0.378922,0.01505
multiple_lines,-0.017346,0.139979,0.160946,-0.017779,0.345716,0.432819,0.455608,-0.023924,1.0,0.008721,0.12643,0.115874,0.011261,0.164579,0.179023,0.164503,0.112203,0.036874,-0.114085,0.029163
online_security,-0.012186,-0.125236,0.129755,0.136732,0.329188,-0.054794,0.254908,-0.021123,0.008721,1.0,0.184695,0.172348,0.283303,0.045946,0.047845,-0.15339,0.373992,0.239419,-0.037749,-0.296673


Which contract type has the highest number of people churning?

In [24]:
train.groupby(['churn']).contract_type.value_counts()

AttributeError: 'DataFrameGroupBy' object has no attribute 'contract_type'

In [None]:
train.groupby(['contract_type', 'churn']).tenure.count().nlargest()

- It looks like the highest number of people who have churned have a month to month contract and they tend to churn just after the first month.
- why?

In [None]:
#Does the churn group buy more of one specific type of internet?
train.groupby(['churn']).internet_service_type.value_counts()

In [None]:
train.groupby(['churn']).payment_type.value_counts()

In [None]:
train.groupby(['churn','contract_type']).payment_type.value_counts()

- It looks like the non-automatic payment types account for the highest number of people who have churned.
- I would like to do some feature engineering that groups payment type into automatic and manual

In [None]:
df1 = train[['churn','contract_type', 'payment_type']]

In [None]:
df1.head()

In [None]:
melt = df1.melt(id_vars=['churn','contract_type'], value_name='payment_mode')
melt.index=train.index

In [None]:
melt.head()

In [None]:
melt = melt['payment_mode'].str.split('(', expand = True)
melt.head()

In [None]:
melt.columns = ['payment_method', 'mode_of_payment']

In [None]:
df1 = pd.concat([df1, melt], axis=1)
df1.head()

In [None]:
df1 = df1.drop(columns='payment_method')
df1.head()

In [None]:
df1['mode_of_payment'] = df1.mode_of_payment.fillna('manual')
df1['mode_of_payment'] = df1.mode_of_payment.str.replace(')','')
df1.head()

In [None]:
df1.groupby('churn').payment_type.value_counts()

In [None]:
df1.mode_of_payment.value_counts()

In [None]:
df1.groupby(['churn','contract_type']).mode_of_payment.value_counts()

In [None]:
df1.groupby(['churn'])['contract_type'].count().reset_index()

In [None]:
train.columns

In [None]:
X = df[['senior_citizen','multiple_lines','streaming_movies','paperless_billing','contract_type','payment_type']]
y = df[['churn']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

In [None]:
encoder = LabelEncoder()

encode_list = [
    'gender', 'partner', 'dependents', 'phone_service'
    , 'multiple_lines', 'online_security', 'online_backup'
    , 'device_protection', 'tech_support'
    , 'streaming_movies', 'streaming_tv', 'paperless_billing', 'churn'
    ]
               
               
               
for e in encode_list:
    train[e] = encoder.fit_transform(train[e])
    test[e] = encoder.transform(test[e])

In [None]:
train.head()

In [None]:
logit = LogisticRegression()

In [None]:
logit.fit(X_train, y_train)