# Machine Learning for Marketing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

In [35]:
# Load the telco dataset from the local CSV file
telco = pd.read_csv('data/telco.csv')
# Preview the first five rows
telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Preparation for modeling

In [36]:
# Inspect the dtype pandas inferred for each column.
# Note: TotalCharges is reported as object rather than a numeric dtype (see output below).
telco.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [37]:
# Keep the ID column and the target variable (Churn flag) apart from the features
cust_id, target = ['customerID'], ['Churn']

In [38]:
# Count distinct values per column, as a basis for separating categorical and numerical columns
telco.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

In [39]:
# Hypothesis: a column with fewer than 10 distinct values is categorical
unique_counts = telco.nunique()
categorical = unique_counts[unique_counts < 10].index.tolist()
# The target also has few distinct values, but it is not a feature
categorical.remove(target[0])
categorical

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [40]:
# Numerical columns: whatever is neither categorical, the target, nor the ID
is_excluded = telco.columns.isin(categorical + target + cust_id)
numerical = telco.columns[~is_excluded].tolist()
numerical

['tenure', 'MonthlyCharges', 'TotalCharges']

In [41]:
# One-hot encode the categorical variables.
# drop_first=True drops the first level of each variable, leaving k-1 indicator columns.
# dtype is pinned because the default indicator dtype depends on the pandas version
# (bool from pandas 2.0 on); uint8 keeps the indicators as 0/1 integers everywhere.
telco = pd.get_dummies(data=telco, columns=categorical, drop_first=True, dtype=np.uint8)

# Other solution : from sklearn.preprocessing import OneHotEncoder

In [48]:
# Blank strings (' ') keep TotalCharges as object dtype, which breaks scaling.
# Convert the numerical columns to real numeric dtypes: anything that cannot be
# parsed as a number (such as ' ') becomes NaN, while valid values are kept.
# A regex replace of ' ' would instead null every string containing a space and
# leave the column as object dtype.
telco[numerical] = telco[numerical].apply(pd.to_numeric, errors='coerce')

In [49]:
# Scaling numerical features (standardization to zero mean and unit variance per column)

# Initialize scaler
scaler = StandardScaler()

# Fit and transform in one step; the result is a bare NumPy array,
# so column names and row labels are lost at this point
scaled_values = scaler.fit_transform(telco[numerical])

# Build a DataFrame that reuses telco's index, so the later index-based merge
# lines rows up even if telco no longer has a default RangeIndex
numerical_scaled = pd.DataFrame(scaled_values, columns=numerical, index=telco.index)

In [51]:
# Preview the first rows of the scaled numerical features
numerical_scaled.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.994194
1,0.066327,-0.259629,-0.17374
2,-1.236724,-0.36266,-0.959649
3,0.514251,-0.746535,-0.195248
4,-1.236724,0.197365,-0.940457


In [50]:
# Preview telco after one-hot encoding (the numerical columns here are still unscaled)
telco.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,1,29.85,29.85,No,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,34,56.95,1889.5,No,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,2,53.85,108.15,Yes,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,45,42.3,1840.75,No,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,2,70.7,151.65,Yes,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


In [52]:
# Remove the unscaled numerical columns from the encoded frame
telco = telco.drop(columns=numerical)

In [54]:
# Attach the scaled numerical features to the encoded frame, matching rows on the index
telco_scaled = pd.merge(
    telco,
    numerical_scaled,
    how='left',
    left_index=True,
    right_index=True,
)
telco_scaled.head()

Unnamed: 0,customerID,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,7590-VHVEG,No,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,-1.277445,-1.160323,-0.994194
1,5575-GNVDE,No,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0.066327,-0.259629,-0.17374
2,3668-QPYBK,Yes,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,-1.236724,-0.36266,-0.959649
3,7795-CFOCW,No,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0.514251,-0.746535,-0.195248
4,9237-HQITU,Yes,0,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,-1.236724,0.197365,-0.940457


In [55]:
# Check the (rows, columns) dimensions of the merged frame
telco_scaled.shape

(7043, 32)

# ML modeling

**Supervised learning steps :**
1. Split data to training and testing
2. Initialize the model
3. Fit the model on the training data
4. Predict values on the testing data
5. Measure model performance on the testing data

In [None]:
# # 1. Train test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# # 2. Initialize the model
# init_tree = tree.DecisionTreeClassifier(max_depth=7, criterion='gini', splitter='best')

# # 3. Fit the model on the training data
# model_tree = init_tree.fit(X_train, y_train)

# # 4. Predict values on the testing data
# y_pred = model_tree.predict(X_test)

# # 5. Measure model perf on testing data
# accuracy_score(y_test, y_pred)

**Unsupervised learning steps :**
1. Initialize the model
2. Fit the model
3. Assign cluster values
4. Explore results

In [None]:
# # 1. Initialize the model
# kmeans = KMeans(n_clusters=3)

# # 2. Fit the model
# kmeans.fit(data)

# # 3. Assign cluster values
# data = data.assign(Cluster=kmeans.labels_)  # assign() returns a new frame, so keep the result

# # 4. Explore results
# data.groupby('Cluster').mean()

Next steps: 
https://medium.com/@lucapetriconi/churn-modeling-a-detailed-step-by-step-guide-in-python-1e96d51c7523 => a better, more detailed guide