# Telco Customer Churn

#### LogBook

In [None]:
#  21 Oct 2022   >>>   Project Start
#  23 Oct 2022   >>>   Project End

# Author: Andres Montes de Oca
# GitHub: https://github.com/AndresMontesDeOca
# Competition #02 (https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

#### Load Data and Info

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Raw dataset; the path is relative to the notebook's working directory
data = pd.read_csv('Data/data.csv')

# Ignore Warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style='white', context='notebook', palette='pastel')


# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()
display(data.head())

# Categorical Features

In [None]:
# Columns handled as categorical features in this notebook ('Churn', the target, included)
categorical_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]

#### SeniorCitizen

In [None]:
# Recast the 0/1 indicator as a boolean flag (0 -> False, 1 -> True)
senior_flag = data['SeniorCitizen'].astype(bool)
data['SeniorCitizen'] = senior_flag

#### Category Cast

In [None]:
# Store every categorical column with the pandas 'category' dtype
category_dtypes = {name: 'category' for name in categorical_cols}
data = data.astype(category_dtypes)

#### Plots

In [None]:
# Grid geometry: three pies per row and as many rows as the column list needs
columns = 3
rows = int(np.ceil(len(categorical_cols) / columns))
fig = plt.figure(figsize=(10, 20))

# One pie of relative frequencies per categorical column
for i, column_name in enumerate(categorical_cols):
    ax = fig.add_subplot(rows, columns, i + 1)
    shares = data[column_name].value_counts(normalize=True)
    # (the author notes that `explode` did not work here)
    shares.plot(kind='pie', ax=ax, autopct='%.2f%%', textprops={'fontsize': 8},
                startangle=90, cmap='Set2')
    ax.set_title(column_name)
    # The subplot title already names the column, so blank the y-axis label
    ax.set_ylabel('')

# Aesthetics
fig.tight_layout()
plt.show()


# Discrete Features

In [None]:
# Columns handled as discrete features; 'tenure' is the only one
discrete_cols = ['tenure']

#### Tenure (CountPlot)

In [None]:
# Tick positions: start, one year, the median tenure and the longest tenure
tenure = data['tenure']
x_ticks = [0, 12, tenure.median(), tenure.max()]

# NOTE(review): countplot draws on a categorical axis, so tick positions are
# category indices; they match tenure values only while every month from 0 to
# the maximum occurs in the data - confirm.
fig = plt.figure(figsize=(10, 3))
count_ax = sns.countplot(x=tenure)
count_ax.set_title('Tenure (in months)')
plt.xlabel('')
plt.xticks(x_ticks)
plt.show()

# Problem on the first six months
# Even though it is an ordinal feature, a histogram is the best way to show it.

# Continuous Features

In [None]:
# Columns handled as continuous features (TotalCharges is cast to numeric in the next cell)
continuous_cols = ['MonthlyCharges', 'TotalCharges']

#### TotalCharges Cast to Numeric

In [None]:
# Entries that cannot be parsed as numbers become NaN (errors='coerce')
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Number of rows that are about to be removed
print('Nulls deleted:', data.TotalCharges.isnull().sum())

# Drop the rows without a usable TotalCharges and rebuild a gap-free 0..n-1 index.
# The encoded frames built further down are created from plain arrays and therefore
# get a fresh 0..n-1 index; without the reset, the column selections taken directly
# from `data` would keep the old labels and pd.concat(axis=1) would pair up the
# wrong rows and introduce NaNs.
data.dropna(subset=['TotalCharges'], inplace=True)
data.reset_index(drop=True, inplace=True)

#### Plots

In [None]:
# One stacked panel per continuous column
rows = len(continuous_cols)

fig = plt.figure(figsize=(10, 4)) # Explore about this

for position, col_name in enumerate(continuous_cols, start=1):
    ax = fig.add_subplot(rows, 1, position)
    sns.histplot(data=data[col_name], bins=72, ax=ax)
    ax.set_title(col_name)
    ax.set_xlabel('')

# Summary statistics of the untransformed columns
raw_continuous = data[continuous_cols]
print('Skew:\n', raw_continuous.skew(), '\n')
print('Mean:\n', raw_continuous.mean())
print(raw_continuous.shape)

fig.tight_layout()
plt.show()

# Feature Engineering

## Churn (Target)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the target; the resulting Series is built from an array and has a fresh 0..n-1 index
churn_codes = LabelEncoder().fit_transform(data['Churn'].astype('category'))
y = pd.Series(churn_codes, name='Churn')

In [None]:
# Columns that are one-hot encoded further down
nominal_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'OnlineSecurity', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'OnlineBackup',
]

In [None]:
# Generic Functions
from sklearn.preprocessing import OneHotEncoder

def onehot_encoding(data_):
    """One-hot encode every column of ``data_``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame whose columns are all to be encoded.

    Returns
    -------
    pandas.DataFrame
        Dense indicator columns named by the encoder's ``get_feature_names_out()``.
        Two-category columns yield a single indicator (``drop='if_binary'``).
        The result is built from an array, so it carries a fresh 0..n-1 index
        rather than the index of ``data_``.
    """
    # The `sparse=False` keyword was deprecated in scikit-learn 1.2 and later
    # removed; densifying the default sparse output works on old and new
    # versions alike.
    oh_encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary').fit(data_)
    dense = oh_encoder.transform(data_).toarray()
    return pd.DataFrame(dense, columns=oh_encoder.get_feature_names_out())

#### 'No internet service' Problem

In [None]:
# Collapse 'No internet service' into 'No' for every add-on service column
no_internet_service_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

for col_name in no_internet_service_cols:
    lacks_internet = data[col_name] == 'No internet service'
    data.loc[lacks_internet, col_name] = 'No'

#### PaymentMethod Cardinality

In [None]:
# How many customers use each payment method
payment_counts = data['PaymentMethod'].value_counts()
print(payment_counts)

#### PaymentMethod (OneHot Encoder)

In [None]:
# PaymentMethod has more than two levels, so it is one-hot encoded on its own
# and the width of the result is checked.
payment_frame = data[['PaymentMethod']]
data_PaymentMethod_oh = onehot_encoding(payment_frame)
print(data_PaymentMethod_oh.shape)

#### PaymentMethod (LabelEncoder)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Alternative encoding: one integer code per payment method
payment_codes = LabelEncoder().fit_transform(data['PaymentMethod'])
data_PaymentMethod_label = pd.Series(payment_codes, name='PaymentMethod')
print(data_PaymentMethod_label.shape)

## OneHot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns; two-category columns collapse to a single
# indicator (drop='if_binary').
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and later removed,
# so the default sparse output is densified explicitly instead - this runs on old
# and new versions alike.
oh_encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary').fit(data[nominal_cols])

nominal_dense = oh_encoder.transform(data[nominal_cols]).toarray()
data_nominals = pd.DataFrame(nominal_dense, columns=oh_encoder.get_feature_names_out())

print(data_nominals.shape)

## Ordinal Encoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_cols = ['MultipleLines', 'InternetService', 'Contract']

# NOTE(review): no explicit `categories` are given, so the integer codes follow
# the encoder's default category ordering - confirm that ordering is the intended rank.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

# Fit once on all ordinal columns: each column is still encoded independently,
# the frame is built in a single step instead of being grown with pd.concat
# inside a loop, and the fitted encoder keeps the categories of every column
# (refitting per column left it fitted on the last one only).
ordinal_codes = ordinal_encoder.fit_transform(data[ordinal_cols])
data_ordinals = pd.DataFrame(ordinal_codes, columns=ordinal_cols)

print(data_ordinals.shape)

## Continuous Features

In [None]:
continuous_cols = ['MonthlyCharges', 'TotalCharges']

total_charges = data['TotalCharges']
monthly_charges = data['MonthlyCharges']

# Sample skewness before any transformation
print('Series.skew(): Around 0 should be Normal')
print('TotalCharges:', total_charges.skew())
print('MonthlyCharges:', monthly_charges.skew(), '\n')

# Second element of the Shapiro-Wilk result (the p-value)
shapiro_total = st.shapiro(total_charges)
shapiro_monthly = st.shapiro(monthly_charges)
print('Stats.shapiro(): real p-value ')
print('Shapiro TotalCharges:', shapiro_total[1])
print('Shapiro MonthlyCharges:', shapiro_monthly[1], '\n')

# Second element of the skewtest result (also a p-value)
skewtest_total = st.skewtest(total_charges)
skewtest_monthly = st.skewtest(monthly_charges)
print('Stats.skewtest(): Same as Shapiro?')
print('TotalCharges:', skewtest_total[1])
print('MonthlyCharges:', skewtest_monthly[1])

### BoxCox

In [None]:
# Box-Cox transform each continuous column; st.boxcox returns the transformed
# values first, followed by the fitted lambda. The frame is built in one step
# rather than grown with pd.concat inside a loop.
data_continuous_boxcox = pd.DataFrame(
    {name: st.boxcox(data[name])[0] for name in continuous_cols}
)

# Skew value of transformed data
print('Skew:\n', data_continuous_boxcox.skew(), '\n')
print('Mean:\n', data_continuous_boxcox.mean())
print(data_continuous_boxcox.shape)


fig = plt.figure(figsize=(10, 4)) # Explore about this

# Panel count comes from the column list itself instead of the global `rows`
# set by an earlier plotting cell.
n_panels = len(continuous_cols)

for i, col_name in enumerate(continuous_cols):
    ax = fig.add_subplot(n_panels, 1, i+1)
    # Title with the column being drawn; the previous code used `column_name`,
    # a leftover from the pie-chart loop, so every panel was titled 'Churn'.
    sns.histplot(data=data_continuous_boxcox[col_name], bins=72, ax=ax)
    ax.set_title(col_name)
    ax.set_xlabel('')

fig.tight_layout()
plt.show()

### Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise all continuous columns in a single call (StandardScaler works
# column by column) and build the frame once instead of growing it with
# pd.concat inside a loop.
scaled_values = StandardScaler().fit_transform(data[continuous_cols])
data_continuous_stdscaler = pd.DataFrame(scaled_values, columns=continuous_cols)

print('Skew:\n', data_continuous_stdscaler.skew(), '\n')
print('Mean:\n', data_continuous_stdscaler.mean())
print(data_continuous_stdscaler.shape)


fig = plt.figure(figsize=(10, 4)) # Explore about this

# Panel count comes from the column list itself; the global `rows` is also
# reassigned by other plotting cells, so relying on it depended on which cell
# happened to run last.
n_panels = len(continuous_cols)

for i, col_name in enumerate(continuous_cols):
    ax = fig.add_subplot(n_panels, 1, i+1)
    sns.histplot(data=data_continuous_stdscaler[col_name], bins=72, ax=ax)
    ax.set_title(col_name)
    ax.set_xlabel('')

fig.tight_layout()
plt.show()

# Shape doesn't change, just the scale

# Discrete Features

In [None]:
# Columns passed through to the model without any transformation
discrete_cols = ['tenure']

# Plain column selection: data_discrete keeps the index labels of `data`.
# NOTE(review): the encoded frames in this notebook are rebuilt from arrays and
# carry a fresh 0..n-1 index - confirm the indexes agree before concatenating.
data_discrete = data[discrete_cols]

# Nothing to do here

# Correlation Analysis

In [None]:
data_cols = [y, data_discrete, data_nominals, data_ordinals, data_PaymentMethod_label, data_continuous_boxcox]

# data_continuous_stdscaler reduce the performance
# data_PaymentMethod_oh reduce the performance too

# Lets create the final DataFrame
# Every part describes the same rows in the same order, but not necessarily with
# the same index labels: data_discrete is a direct selection from `data`, while
# the encoded parts are rebuilt from arrays with a fresh 0..n-1 index.
# pd.concat(axis=1) aligns on labels, so differing labels pair up the wrong rows
# and create NaNs (the unexplained nulls seen here before). Align by position.
row_counts = {len(part) for part in data_cols}
assert len(row_counts) == 1, f'feature blocks differ in length: {sorted(row_counts)}'

data_FINAL = pd.concat([part.reset_index(drop=True) for part in data_cols], axis=1)

# With positional alignment no nulls are expected; the report and dropna stay as a safety net
print(data_FINAL.isnull().sum())
data_FINAL.dropna(inplace=True)
print(data_FINAL.shape)

#### Correlation Matrix

In [None]:
# Pairwise correlations of the assembled features, on a fixed -1..1 colour scale
fig, heat_ax = plt.subplots(figsize=(6, 4))
correlations = data_FINAL.corr()
sns.heatmap(correlations, vmin=-1, vmax=1, cmap='RdYlGn', ax=heat_ax)
heat_ax.set_title('Correlation Matrix')
plt.show()

#### Feature Selection

In [None]:
# Drop Features from data_FINAL to test
# ('StreamingMovies_Yes' was listed twice; each label now appears once)
cols_to_drop = ['StreamingMovies_Yes', 'DeviceProtection_Yes', 'StreamingTV_Yes',
                'OnlineBackup_Yes', 'TotalCharges']

# Only drop what is still present, so re-running this cell does not raise a
# KeyError once the columns are gone.
still_present = [name for name in cols_to_drop if name in data_FINAL.columns]
data_FINAL = data_FINAL.drop(columns=still_present)

# Basic Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target
features = data_FINAL.drop(columns='Churn')
target = data_FINAL['Churn']

# A fixed random_state makes the split (and every score below) reproducible
# across runs; stratify keeps the churn / no-churn proportions equal in the
# train and test parts.
Xtrain, Xtest, ytrain, ytest = train_test_split(features, target, stratify=target, random_state=42)

# !pip install lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score

# Fit LazyClassifier's battery of models with default settings and list them, best accuracy first
model_lazy = LazyClassifier()
models, predictions = model_lazy.fit(Xtrain, Xtest, ytrain, ytest)
ranked_models = models.sort_values(by='Accuracy', ascending=False)
display(ranked_models)
