In [1]:
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import env
from pydataset import data
import scipy
import os
from sklearn.model_selection import train_test_split
# scikit-learn modelling and evaluation tools ("pink box" warnings are turned off for the demo by warnings.filterwarnings below)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import acquire


In [2]:
# Load the telco data through the project-local acquire module.
df = acquire.get_telco_data()

In [3]:
# Names of every column in df whose dtype is object ("O").
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == "O"]
categorical_columns

['customer_id',
 'gender',
 'partner',
 'dependents',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'total_charges',
 'churn',
 'contract_type',
 'payment_type',
 'internet_service_type']

In [4]:
# Display the frame as loaded.
df

Unnamed: 0,internet_service_type_id,payment_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,payment_type,internet_service_type
0,1,2,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.60,593.3,No,One year,Mailed check,DSL
1,1,2,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.90,542.4,No,Month-to-month,Mailed check,DSL
2,2,1,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.90,280.85,Yes,Month-to-month,Electronic check,Fiber optic
3,2,1,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.00,1237.85,Yes,Month-to-month,Electronic check,Fiber optic
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.90,267.4,Yes,Month-to-month,Mailed check,Fiber optic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,2,2,9987-LUTYD,Female,0,No,No,13,Yes,...,Yes,No,No,No,55.15,742.9,No,One year,Mailed check,DSL
7039,2,1,1,9992-RRAMN,Male,0,Yes,No,22,Yes,...,No,No,Yes,Yes,85.10,1873.7,Yes,Month-to-month,Electronic check,Fiber optic
7040,1,2,1,9992-UJOEL,Male,0,No,No,2,Yes,...,No,No,No,Yes,50.30,92.75,No,Month-to-month,Mailed check,DSL
7041,1,2,3,9993-LHIEB,Male,0,Yes,Yes,67,Yes,...,Yes,No,Yes,No,67.85,4627.65,No,Two year,Mailed check,DSL


In [5]:

def prep_telco_data(df):
    '''
    Clean a telco DataFrame and encode its categorical columns.

    Steps, in order:
      1. Drop the columns listed in ``redundant`` (any that are already
         absent are skipped).
      2. Remove rows whose total_charges is empty / whitespace-only and cast
         the column to float.
      3. Map the labels Yes/No, Female/Male and "No phone service" to integers.
      4. One-hot encode (drop_first=True) whatever non-numeric columns remain
         and append the dummy columns to the frame.

    The frame passed in is left untouched; a new DataFrame is returned.
    Raises KeyError if df has no total_charges column.
    '''
    # Drop the text-label / identifier columns without mutating the caller's frame
    redundant = ['dependents', 'partner', 'customer_id',
                 'payment_type', 'internet_service_type', 'contract_type']
    df = df.drop(columns=redundant, errors='ignore')

    # Drop null values stored as whitespace, then convert to the correct datatype.
    # astype(str) keeps this step working when the column is already numeric.
    charges = df['total_charges'].astype(str).str.strip()
    has_charges = charges != ''
    df = df.loc[has_charges].copy()  # explicit copy: safe to assign into
    df['total_charges'] = charges.loc[has_charges].astype(float)

    # Convert binary categorical variables to numeric.
    # A single mapping is required: extra positional dicts would be taken as
    # the value / inplace / limit / regex arguments of DataFrame.replace.
    df = df.replace({'Yes': 1, 'No': 0, 'Female': 0, 'Male': 1,
                     'No phone service': 2})
    # Columns that are now fully numeric get a numeric dtype on every pandas version
    df = df.infer_objects()

    # Get dummies for the columns that are still non-numeric after the mapping.
    # Computed from this frame rather than from a notebook-level list, which
    # could name columns that were dropped above.
    remaining = df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
    if not remaining:
        return df
    dummy_df = pd.get_dummies(df[remaining], columns=remaining,
                              dummy_na=False, drop_first=True)

    # Concatenate dummy dataframe to original
    return pd.concat([df, dummy_df], axis=1)


In [6]:
def train_validate_test_split(df, target, seed=123):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    both passes on the column named by target.

    df:     dataframe to split
    target: name of the column used for stratification
    seed:   integer passed as random_state to both splits

    The first pass holds out 20% of df as test. The second pass holds out
    30% of the remaining 80% as validate (24% of df), which leaves 56% of
    df as train.

    Returns train, validate, test (in that order).
    '''
    # First pass: carve the test set off the full frame
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )

    # Second pass: divide what is left into train and validate
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder[target],
    )

    return train, validate, test

In [7]:
def split_data(df):
    '''
    Take in a DataFrame and return train, validate, and test DataFrames,
    stratified on the churn column.

    Delegates to train_validate_test_split with target='churn' and seed=123:
    test is 20% of df, validate is 30% of the remainder, train is the rest.

    Returns train, validate, test DataFrames (in that order).
    '''
    return train_validate_test_split(df, target='churn', seed=123)


train, validate, test = split_data(df)

In [8]:
# Split df with split_data (stratified on churn) and display the train portion.
train, validate, test = split_data(df)
train

Unnamed: 0,internet_service_type_id,payment_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,payment_type,internet_service_type
5310,1,3,3,7503-MIOGA,Female,1,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,89.85,6697.35,No,Two year,Bank transfer (automatic),DSL
3790,1,4,3,5329-KRDTM,Male,1,Yes,No,72,Yes,...,Yes,Yes,No,No,77.35,5396.25,No,Two year,Credit card (automatic),DSL
4398,2,1,2,6199-IWKGC,Female,1,Yes,No,46,Yes,...,Yes,Yes,Yes,No,100.25,4753.85,No,One year,Electronic check,Fiber optic
2635,1,1,1,3748-FVMZZ,Male,0,No,No,4,No,...,Yes,No,Yes,Yes,40.05,162.45,No,Month-to-month,Electronic check,DSL
2986,1,2,1,4280-DLSHD,Male,0,Yes,No,8,Yes,...,No,No,No,Yes,54.75,445.85,No,Month-to-month,Mailed check,DSL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6230,2,1,3,8809-RIHDD,Male,0,Yes,Yes,72,Yes,...,No,Yes,Yes,Yes,103.40,7372.65,Yes,Two year,Electronic check,Fiber optic
356,1,1,1,0523-VNGTF,Female,1,No,No,52,No,...,Yes,Yes,No,Yes,50.50,2566.3,No,Month-to-month,Electronic check,DSL
2128,2,1,2,3058-HJCUY,Male,0,Yes,Yes,41,Yes,...,Yes,Yes,Yes,Yes,102.60,4213.35,Yes,One year,Electronic check,Fiber optic
3586,1,2,2,5081-NWSUP,Female,0,No,No,10,Yes,...,Yes,No,Yes,No,64.90,685.55,No,One year,Mailed check,DSL


In [9]:
# Summarize df: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   internet_service_type_id  7043 non-null   int64  
 1   payment_type_id           7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [10]:
# Remove the gender column. Rebinding df (instead of inplace=True) together
# with errors='ignore' makes this cell safe to run more than once: the
# in-place version raises KeyError on a second run because the column is gone.
df = df.drop(columns=['gender'], errors='ignore')

In [None]:
# Normalise the column labels: lowercase each one and swap every '.' for '_'
df.columns = [label.lower().replace('.', '_') for label in df.columns]
df

In [None]:
# Summarize df again (column names, non-null counts, dtypes) after the column changes.
df.info()

In [None]:
# split into train, validate, test
train, validate, test = split_data(df)

# Create X & y versions of each split, where y is a series holding just the
# target variable and X holds the features.
# Both the raw label ('churn') and its encoded form ('churn_yes') are removed
# from X: dropping only 'churn' would leave the target itself among the
# features (target leakage).
# NOTE(review): 'churn_yes' is not created by any cell visible in this
# notebook; presumably it comes from an encoding step applied to df before
# this cell runs. Confirm, otherwise the lines below raise KeyError.
label_columns = ['churn', 'churn_yes']

X_train = train.drop(columns=label_columns)
y_train = train['churn_yes']

X_validate = validate.drop(columns=label_columns)
y_validate = validate['churn_yes']

X_test = test.drop(columns=label_columns)
y_test = test['churn_yes']

In [None]:
# Decision tree capped at depth 3; random_state is fixed at 123.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)


In [None]:
# Baseline: the share of training rows whose churn_yes equals 0, i.e. the
# accuracy obtained by always predicting 0.
baseline_accuracy = train['churn_yes'].eq(0).mean()
baseline_accuracy

In [None]:
# Display the train split.
train


In [None]:
# Fit the tree on the training features and target.
clf = clf.fit(X_train, y_train)


In [None]:
# Draw the fitted decision tree.
plt.figure(figsize=(13, 7))
plot_tree(
    clf,
    # plain list of names rather than a pandas Index
    feature_names=list(X_train.columns),
    # plot_tree builds node text from these, so each label must be a str;
    # clf.classes_ holds the raw label values, which may be numeric
    class_names=[str(label) for label in clf.classes_],
    rounded=True,
)
# Render the figure and keep the list of Annotation objects out of the cell output
plt.show()
