In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

from acquire import get_telco_data

In [2]:
# Load the telco customer data using the helper imported from the acquire module.
df = get_telco_data()

In [3]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
0,0003-MKNFE,Male,0,No,No,9,59.9,542.4,Yes,Yes,...,No,No,No,No,Yes,No,Month-to-month,Mailed check,DSL,No
1,0004-TLHLJ,Male,0,No,No,4,73.9,280.85,Yes,No,...,No,Yes,No,No,No,Yes,Month-to-month,Electronic check,Fiber optic,Yes
2,0011-IGKFF,Male,1,Yes,No,13,98.0,1237.85,Yes,No,...,Yes,Yes,No,Yes,Yes,Yes,Month-to-month,Electronic check,Fiber optic,Yes
3,0013-EXCHZ,Female,1,Yes,No,3,83.9,267.4,Yes,No,...,No,No,Yes,Yes,No,Yes,Month-to-month,Mailed check,Fiber optic,Yes
4,0013-MHZWF,Female,0,No,Yes,9,69.4,571.45,Yes,No,...,No,No,Yes,Yes,Yes,Yes,Month-to-month,Credit card (automatic),DSL,No


In [4]:
# (rows, columns) of the raw frame.
df.shape

(7043, 21)

In [5]:
# Guard: stop here if any column of interest contains NaN/None values.
# Note: isna() only detects true missing values; blank strings are not flagged.
for col in ("total_charges", "monthly_charges", "tenure"):
    assert df[col].isna().sum() == 0, "There are missing values in this column. Handle them before proceeding"

In [6]:
# Inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customer_id              7043 non-null object
gender                   7043 non-null object
senior_citizen           7043 non-null int64
partner                  7043 non-null object
dependents               7043 non-null object
tenure                   7043 non-null int64
monthly_charges          7043 non-null float64
total_charges            7043 non-null object
phone_service            7043 non-null object
multiple_lines           7043 non-null object
online_security          7043 non-null object
online_backup            7043 non-null object
device_protection        7043 non-null object
tech_support             7043 non-null object
streaming_tv             7043 non-null object
streaming_movies         7043 non-null object
paperless_billing        7043 non-null object
contract_type            7043 non-null object
payment_type             7043 non-null object
internet_service_typ

In [7]:
# total_charges came back as dtype object instead of float; tally its raw values
# (including any NaN) to find out what is preventing a numeric dtype.
df.total_charges.value_counts(dropna=False)

          11
20.2      11
19.75      9
19.65      8
19.9       8
          ..
3046.4     1
1412.4     1
145        1
161.95     1
1035.7     1
Name: total_charges, Length: 6531, dtype: int64

- It looks like there are 11 observations where `total_charges` is a blank string rather than a number, which is why the column was read as `object`
- `total_charges` needs to be converted to a float

In [8]:
# Strip leading/trailing whitespace so whitespace-only entries become empty strings ("")
df.total_charges = df.total_charges.str.strip()

# Display the rows whose total_charges is now an empty string
df[df.total_charges == ""]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
1475,1371-DWPAZ,Female,0,Yes,Yes,0,56.05,,No,No phone service,...,Yes,Yes,Yes,Yes,No,No,Two year,Credit card (automatic),DSL,No
2670,2923-ARZLG,Male,0,Yes,Yes,0,19.7,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,One year,Mailed check,,No
2942,2520-SGTTA,Female,0,Yes,Yes,0,20.0,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
2979,2775-SEFEE,Male,0,No,Yes,0,61.9,,Yes,Yes,...,Yes,No,Yes,No,No,Yes,Two year,Bank transfer (automatic),DSL,No
3031,3115-CZMZD,Male,0,No,Yes,0,20.25,,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3050,3213-VVOLG,Male,0,Yes,Yes,0,25.35,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3204,4075-WKNIU,Female,0,Yes,Yes,0,73.35,,Yes,Yes,...,Yes,Yes,Yes,Yes,No,No,Two year,Mailed check,DSL,No
3242,4367-NUYAO,Male,0,Yes,Yes,0,25.75,,Yes,Yes,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Two year,Mailed check,,No
3252,4472-LVYGI,Female,0,Yes,Yes,0,52.55,,No,No phone service,...,No,Yes,Yes,Yes,No,Yes,Two year,Bank transfer (automatic),DSL,No
4710,5709-LVOEQ,Female,0,Yes,Yes,0,80.85,,Yes,No,...,Yes,Yes,No,Yes,Yes,No,Two year,Mailed check,DSL,No


It looks like those values are blank because the tenure is 0. I will change the tenure to 1, since these customers have probably been with the company for about a month.

In [9]:
# Number of customers at each tenure value, ordered by tenure.
df.tenure.value_counts().sort_index()

0      11
1     613
2     238
3     200
4     176
     ... 
68    100
69     95
70    119
71    170
72    362
Name: tenure, Length: 73, dtype: int64

In [10]:
# Replace any tenure of 0 with 1.
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on `df.tenure`: that chained in-place call edits a
# Series that may be a temporary copy, so under pandas Copy-on-Write the change
# never reaches `df`.
df["tenure"] = df["tenure"].replace(0, 1)

# Validate: the 0 bucket should be gone and its rows folded into the count for 1.
df.tenure.value_counts().sort_index()

1     624
2     238
3     200
4     176
5     133
     ... 
68    100
69     95
70    119
71    170
72    362
Name: tenure, Length: 72, dtype: int64

In [11]:
# Inspect the customers whose tenure is 1.
df[df.tenure == 1]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
7,0021-IKXGC,Female,1,No,No,1,72.10,72.1,Yes,Yes,...,No,No,No,No,No,Yes,Month-to-month,Electronic check,Fiber optic,No
8,0023-HGHWL,Male,1,No,No,1,25.10,25.1,No,No phone service,...,No,No,No,No,No,Yes,Month-to-month,Electronic check,DSL,Yes
13,0032-PGELS,Female,0,Yes,Yes,1,30.50,30.5,No,No phone service,...,No,No,No,No,No,No,Month-to-month,Bank transfer (automatic),DSL,Yes
19,0082-LDZUE,Male,0,No,No,1,44.30,44.3,Yes,No,...,No,No,No,No,No,Yes,Month-to-month,Mailed check,DSL,No
29,0107-WESLM,Male,0,No,No,1,19.85,19.85,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Month-to-month,Electronic check,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6794,9907-SWKKF,Female,1,No,No,1,25.05,25.05,No,No phone service,...,No,No,No,No,No,No,Month-to-month,Mailed check,DSL,Yes
6808,9940-RHLFB,Female,0,No,No,1,75.30,75.3,Yes,No,...,No,Yes,No,No,No,No,Month-to-month,Electronic check,Fiber optic,Yes
6816,9962-BFPDU,Female,0,Yes,Yes,1,20.05,20.05,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Month-to-month,Mailed check,,No
6823,9975-SKRNR,Male,0,No,No,1,18.90,18.9,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Month-to-month,Mailed check,,No


In [12]:
# Fill each blank total_charges with that same row's monthly_charges
# (the blank rows are the customers whose tenure was just set to 1).
# mask() aligns on the index, so every blank gets its own row's value, and the
# result is assigned back to the column rather than mutated through a chained
# `inplace=True` call, which may operate on a temporary copy.
df["total_charges"] = df["total_charges"].mask(df["total_charges"] == "", df["monthly_charges"])

In [13]:
# Validate my changes: re-display the tenure == 1 rows to eyeball total_charges
# against monthly_charges.
df[df.tenure == 1]

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
7,0021-IKXGC,Female,1,No,No,1,72.10,72.1,Yes,Yes,...,No,No,No,No,No,Yes,Month-to-month,Electronic check,Fiber optic,No
8,0023-HGHWL,Male,1,No,No,1,25.10,25.1,No,No phone service,...,No,No,No,No,No,Yes,Month-to-month,Electronic check,DSL,Yes
13,0032-PGELS,Female,0,Yes,Yes,1,30.50,30.5,No,No phone service,...,No,No,No,No,No,No,Month-to-month,Bank transfer (automatic),DSL,Yes
19,0082-LDZUE,Male,0,No,No,1,44.30,44.3,Yes,No,...,No,No,No,No,No,Yes,Month-to-month,Mailed check,DSL,No
29,0107-WESLM,Male,0,No,No,1,19.85,19.85,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,Month-to-month,Electronic check,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6794,9907-SWKKF,Female,1,No,No,1,25.05,25.05,No,No phone service,...,No,No,No,No,No,No,Month-to-month,Mailed check,DSL,Yes
6808,9940-RHLFB,Female,0,No,No,1,75.30,75.3,Yes,No,...,No,Yes,No,No,No,No,Month-to-month,Electronic check,Fiber optic,Yes
6816,9962-BFPDU,Female,0,Yes,Yes,1,20.05,20.05,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Month-to-month,Mailed check,,No
6823,9975-SKRNR,Male,0,No,No,1,18.90,18.9,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,No,Month-to-month,Mailed check,,No


In [14]:
# Cast total_charges to float; this raises if any value is still non-numeric.
df.total_charges = df.total_charges.astype(float)

In [None]:
def wrangle_telco(df=None):
    """Return a cleaned copy of the telco customer data.

    Cleaning steps:
      1. A tenure of 0 is replaced with 1.
      2. total_charges is stripped of surrounding whitespace; entries that are
         blank after stripping are filled with the same row's monthly_charges.
      3. total_charges is converted to float.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw frame containing ``tenure``, ``monthly_charges`` and a string-valued
        ``total_charges`` column. When omitted, the data is loaded with
        ``get_telco_data()``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    ValueError
        If a non-blank total_charges entry cannot be parsed as a number.
    """
    # `get_telco_data` is the name this notebook imports from acquire; the
    # module itself is never bound to the name `acquire`.
    raw = get_telco_data() if df is None else df

    stripped = raw["total_charges"].str.strip()
    is_blank = stripped == ""

    # Parse only the non-blank entries (blanks are masked to NaN first), then
    # put monthly_charges into exactly the positions that were blank.
    parsed = pd.to_numeric(stripped.mask(is_blank))
    total_charges = parsed.where(~is_blank, raw["monthly_charges"]).astype(float)

    # assign() builds a new frame, so nothing is mutated in place.
    return raw.assign(
        tenure=raw["tenure"].replace(0, 1),
        total_charges=total_charges,
    )

In [15]:
# step 1 split
# Pin random_state so the same rows land in train/test on every run; without it
# each Restart & Run All produces a different partition.
train, test = train_test_split(df, random_state=123)

In [16]:
# step 2: label-encode the categorical columns listed below
encode_list = [
    'gender', 'partner', 'dependents', 'phone_service',
    'multiple_lines', 'online_security', 'online_backup',
    'device_protection', 'tech_support',
    'streaming_movies', 'streaming_tv', 'paperless_billing', 'churn',
]

# Fit one LabelEncoder per column on train only and reuse that fit on test, so
# both splits share the same label -> integer mapping. Each fitted encoder is
# kept in `encoders` (keyed by column name) so any column can be decoded later;
# re-fitting a single shared encoder would discard every mapping but the last.
# transform() raises ValueError if test contains a label not seen in train.
encoders = {}
for e in encode_list:
    encoder = LabelEncoder()
    train[e] = encoder.fit_transform(train[e])
    test[e] = encoder.transform(test[e])
    encoders[e] = encoder

In [17]:
# Preview the training split after encoding.
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,phone_service,multiple_lines,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,contract_type,payment_type,internet_service_type,churn
1503,1568-BEKZM,1,1,1,0,70,90.05,6333.4,1,2,...,2,2,2,2,2,0,Two year,Credit card (automatic),DSL,0
2009,3272-VUHPV,0,0,1,1,8,56.3,401.5,1,0,...,2,0,2,0,0,0,Month-to-month,Bank transfer (automatic),DSL,0
1115,1442-OKRJE,1,0,1,1,66,103.15,7031.3,1,0,...,2,2,0,2,2,1,One year,Bank transfer (automatic),Fiber optic,0
5293,7872-RDDLZ,0,1,0,0,67,54.9,3725.5,1,2,...,2,0,0,0,0,0,Month-to-month,Electronic check,DSL,0
40,0128-MKWSG,0,0,0,1,26,45.8,1147.0,0,1,...,0,0,2,0,2,0,Month-to-month,Mailed check,DSL,0


In [20]:
# (rows, columns) of the training split.
train.shape

(5282, 21)