In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,recall_score,precision_score

import warnings
warnings.filterwarnings("ignore")

from acquire import get_telco_data
from prepare import prep_telco_data

In [115]:
# Load the Telco customer frame through the project's acquire helper.
df = get_telco_data(cached=True)

In [126]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               7043 non-null   object 
 1   gender                    7043 non-null   object 
 2   senior_citizen            7043 non-null   int64  
 3   partner                   7043 non-null   object 
 4   dependents                7043 non-null   object 
 5   tenure                    7043 non-null   int64  
 6   phone_service             7043 non-null   object 
 7   multiple_lines            7043 non-null   object 
 8   internet_service_type_id  7043 non-null   int64  
 9   online_security           7043 non-null   object 
 10  online_backup             7043 non-null   object 
 11  device_protection         7043 non-null   object 
 12  tech_support              7043 non-null   object 
 13  streaming_tv              7043 non-null   object 
 14  streamin

In [120]:
# Largest tenure value present in the data.
df["tenure"].max()

72

In [123]:
# Churn rate among customers with a tenure of 8 months or less
# (slightly more than 50 % of this group churned).
df = get_telco_data(cached=True)
# NOTE(review): despite its name, `month_one` selects the first eight months.
month_one = df["tenure"] <= 8
test = df.loc[month_one]
test["churn"].value_counts(normalize=True)

Yes    0.505476
No     0.494524
Name: churn, dtype: float64

In [122]:
# Churn rate for customers whose tenure is 9 months or more, i.e. everyone
# past the first 8 months of service; churn in this group is far below 50 %.
df = get_telco_data(cached=True)
# NOTE(review): the name `month_one` is kept because other cells may read it,
# but in this cell it flags tenure >= 9, not the first month.
month_one = df["tenure"] >= 9
test = df.loc[month_one]
test["churn"].value_counts(normalize=True)

No     0.813112
Yes    0.186888
Name: churn, dtype: float64

In [58]:
# Count of churned vs. retained customers across the full 72-month range
df["churn"].value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [59]:
# Share of churned vs. retained customers across the full 72-month range
df["churn"].value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: churn, dtype: float64

In [125]:
# Customers with a tenure of 8 months or less, whether or not they churned.
# NOTE(review): nothing here filters on `churn`, so the entry count reported
# by .info() is the size of the whole early-tenure group, not the number of
# customers who churned (and the window is eight months, not one).
month = df["tenure"] <= 8
num_cust = df.loc[month]
num_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1735 entries, 2 to 7001
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               1735 non-null   object 
 1   gender                    1735 non-null   object 
 2   senior_citizen            1735 non-null   int64  
 3   partner                   1735 non-null   object 
 4   dependents                1735 non-null   object 
 5   tenure                    1735 non-null   int64  
 6   phone_service             1735 non-null   object 
 7   multiple_lines            1735 non-null   object 
 8   internet_service_type_id  1735 non-null   int64  
 9   online_security           1735 non-null   object 
 10  online_backup             1735 non-null   object 
 11  device_protection         1735 non-null   object 
 12  tech_support              1735 non-null   object 
 13  streaming_tv              1735 non-null   object 
 14  streamin

In [45]:
# NOTE(review): depends on whichever `test` frame the kernel last held; the
# recorded output is an empty Series, so confirm which filter produced it.
test["churn"].value_counts(normalize=True)

Series([], Name: churn, dtype: float64)

In [2]:
# Build the cleaned train / validate / test splits used for exploration.
# NOTE(review): this rebinds `test`, which earlier cells used for tenure subsets.
train, validate, test = prep_telco_data(
    get_telco_data(cached=True)
)

In [6]:
# List the columns of the prepared training split.
train.columns

Index(['gender', 'senior_citizen', 'partner', 'dependents', 'tenure',
       'phone_service', 'multiple_lines', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'paperless_billing', 'monthly_charges', 'total_charges', 'churn',
       'contract_type', 'internet_service_type', 'payment_type', 'gender_cc',
       'multiple_lines_cc', 'online_security_cc', 'online_backup_cc',
       'device_protection_cc', 'tech_support_cc', 'streaming_tv_cc',
       'streaming_movies_cc', 'month_to_month_contract', 'one_year_contract',
       'two_year_contract', 'internet_service'],
      dtype='object')

In [18]:
# Column names, non-null counts and dtypes of the prepared training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3943 entries, 5670 to 6867
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gender                   3943 non-null   object 
 1   senior_citizen           3943 non-null   int64  
 2   partner                  3943 non-null   int64  
 3   dependents               3943 non-null   int64  
 4   tenure                   3943 non-null   float64
 5   phone_service            3943 non-null   int64  
 6   multiple_lines           3943 non-null   object 
 7   online_security          3943 non-null   object 
 8   online_backup            3943 non-null   object 
 9   device_protection        3943 non-null   object 
 10  tech_support             3943 non-null   object 
 11  streaming_tv             3943 non-null   object 
 12  streaming_movies         3943 non-null   object 
 13  paperless_billing        3943 non-null   int64  
 14  monthly_charges      

In [21]:
# Churn proportion in the training split (churn is encoded 0/1 here).
train["churn"].value_counts(normalize=True)

0    0.73472
1    0.26528
Name: churn, dtype: float64