**Telecom Customer Churn**

---

Challenge Link: https://www.kaggle.com/c/dsrp-kaggle-semillero-01

Autor: **Keven Fernandez Carrillo** 
con el Apoyo de la comunidad **Data Science Research Perú**.

Versión: 1.0

GitHub: 
- https://github.com/KevenRFC
- https://github.com/DataScienceResearchPeru

# 1) IMPORT & INSTALL PACKAGES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as pylab 
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats 
from sklearn import metrics as mt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) for every plot.
pylab.rcParams['figure.figsize'] = 6,4

# Silence all warnings.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and scikit-learn.
import warnings
warnings.filterwarnings("ignore")

# 2) DATA UNDERSTANDING

## 2.1) Load Data

In [None]:
# Columns read from the raw CSV files for this BASELINE.
features_iniciales = [
    # row identifier
    'ID',
    # customer profile
    'Sexo',
    'AdultoMayor',
    'MesesCliente',
    # contracted services
    'ServicioTelefonico',
    'LineasMultiples',
    'ProteccionDispositivo',
    'SoporteTecnico',
    # billing
    'FacturacionElectronica',
    'MontoCargadoMes',
]

In [None]:
import os

# Show which files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Load the train and test CSV files, keeping only the selected columns.
# The target column 'Churn' is read from the train file only.
path = "../input/"
train_columns = features_iniciales+['Churn']
test_columns = features_iniciales

df_train = pd.read_csv(path+"churn_data_train.csv", usecols=train_columns, encoding='latin-1')
df_test = pd.read_csv(path+"churn_data_test.csv", usecols=test_columns, encoding='latin-1')

## 2.2) Data Exploration

### 2.2.1) Basic Statistics

In [None]:
# (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

In [None]:
# First rows of the training data.
df_train.head()

In [None]:
# First rows of the test data.
df_test.head()

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Column dtypes and non-null counts of the test data.
df_test.info()

In [0]:
# Names of the row-identifier column and of the prediction-target column.
ID, TARGET = 'ID', 'Churn'

In [None]:
# Distribution of the target: absolute counts per class (missing values included).
df_train[TARGET].value_counts(dropna=False)

In [None]:
# Distribution of the target as percentages per class (missing values included).
df_train[TARGET].value_counts(dropna=False, normalize = True)*100

In [None]:
# Basic statistics for every column (one row per column after the transpose):
### count: Number of non-NA/null observations.
### unique: Number of distinct values (categorical columns only).
### top: Most frequent value (categorical columns only).
### freq: Number of occurrences of `top` (categorical columns only).

### mean: Mean of the values.
### std: Standard deviation of the observations.

### min: Minimum of the values in the object.
### X%: The value of the quartile: 25% - Q1 , 50% - Q2, 75% - Q3
### max: Maximum of the values in the object.

# Summarise AdultoMayor as a categorical column. The cast is applied to a
# temporary frame so df_train itself is not modified, and `object` is used
# instead of `str` so missing values stay missing rather than becoming the
# string 'nan' (which would be counted as a regular category).
df_train.assign(AdultoMayor=df_train['AdultoMayor'].astype(object)).describe(include = 'all').T

In [0]:
df_train['AdultoMayor'] = df_train['AdultoMayor'].astype(float) # Store AdultoMayor as a numeric (float) column

### 2.2.2) EDA

#### 2.2.2.a) Evaluate missings

In [None]:
# Visual overview of missing values per column in the training data.
import missingno as msno
msno.matrix(df_train)

In [None]:
# Visual overview of missing values per column in the test data.
msno.matrix(df_test)

#### 2.2.2.b) Identify outliers

In [0]:
# Placeholder: no outlier identification is implemented in this baseline.
None

#### 2.2.2.c) Adicionales

In [0]:
# Placeholder: no additional exploration is implemented in this baseline.
None

# 3) DATA PREPARATION

In [0]:
# Work on a copy of the training data so the raw frame (df_train) is left untouched
# by the transformations applied afterwards.
ds_train = df_train.copy()

In [0]:
# Work on a copy of the test data so the raw frame (df_test) is left untouched.
ds_test = df_test.copy()

## 3.1) Data Cleaning

### 3.1.1) Impute missings

In [0]:
# Constant used to fill missing values in each column. The same constants are
# applied to train and test so both splits are imputed identically.
imputation_values = {
    "AdultoMayor": 0,                 # mode imputation
    "MesesCliente": 32,               # mean imputation
    "ProteccionDispositivo": 'No',    # mode imputation
    "SoporteTecnico": 'No',           # mode imputation
    "FacturacionElectronica": 'Si',   # mode imputation
    "MontoCargadoMes": 68.7,          # mean imputation
}

# Fill at the DataFrame level and rebind the result. Calling
# `frame[col].fillna(..., inplace=True)` acts on an intermediate Series
# (chained assignment) and does not update the frame under pandas
# Copy-on-Write, so the columns would silently stay un-imputed.
ds_train = ds_train.fillna(value=imputation_values)
ds_test = ds_test.fillna(value=imputation_values)

### 3.1.2) Treat outliers

In [0]:
# Placeholder: no outlier treatment is implemented in this baseline.
None

## 3.2) Data Transformation

In [None]:
# Preview of the working training frame.
ds_train.head()

In [0]:
# Sexo: encode as a binary indicator (Masculino -> 1, Femenino -> 0).
# Labels that are not keys of the dictionary are mapped to NaN by Series.map.
dicc_sexo = {'Masculino': 1, 'Femenino':0 }
for frame in (ds_train, ds_test):
    frame["Sexo"] = frame["Sexo"].map(dicc_sexo)

In [0]:
# ServicioTelefonico: encode as a binary indicator (Si -> 1, No -> 0).
dicc_serv_telef = {'Si': 1, 'No':0 }
for frame in (ds_train, ds_test):
    frame["ServicioTelefonico"] = frame["ServicioTelefonico"].map(dicc_serv_telef)

In [0]:
# LineasMultiples: encode the three labels as integer codes
# (no phone service -> 0, No -> 1, Si -> 2).
dicc_lin_mult = {'Si': 2, 'No':1, 'Sin servicio telefonico':0 }
for frame in (ds_train, ds_test):
    frame["LineasMultiples"] = frame["LineasMultiples"].map(dicc_lin_mult)

In [0]:
# FacturacionElectronica: encode as a binary indicator (Si -> 1, No -> 0).
dicc_fact_elect = {'Si': 1, 'No':0 }
for frame in (ds_train, ds_test):
    frame["FacturacionElectronica"] = frame["FacturacionElectronica"].map(dicc_fact_elect)

In [0]:
# Create dummy (one-hot) features for the multi-category service columns.
dummy_columns = ['ProteccionDispositivo', 'SoporteTecnico']

# Shorten the long category label so the generated column names stay compact.
for frame in (ds_train, ds_test):
    frame[dummy_columns] = frame[dummy_columns].replace('Sin servicio de internet', 'SinServInter')

ds_train = pd.get_dummies(ds_train, columns=dummy_columns)
ds_test = pd.get_dummies(ds_test, columns=dummy_columns)

# get_dummies only creates columns for categories present in the frame it is
# given. Align the test frame to the training columns (minus the target) so a
# category missing from one split cannot cause a column mismatch later on;
# dummy columns absent from test are added filled with 0.
expected_test_columns = [col for col in ds_train.columns if col != 'Churn']
ds_test = ds_test.reindex(columns=expected_test_columns, fill_value=0)

In [None]:
# Preview of the working training frame.
ds_train.head()

## 3.3) Feature Engineering

In [None]:
# New Feature 1
# Median of MontoCargadoMes for each AdultoMayor value, rounded, computed on the training data.
tmp_byAdultoMayor_medianMontoMes = ds_train.groupby(['AdultoMayor'])['MontoCargadoMes'].median().round()
tmp_byAdultoMayor_medianMontoMes

In [0]:
# Flag (1/0) rows whose MontoCargadoMes is at or above the rounded training
# median of their AdultoMayor group. The training medians are used for both
# the train and the test frame.
# Vectorised with Series.map instead of a row-wise apply with a Python lambda.
# NOTE(review): the column name says "bySexo" although the grouping key is
# AdultoMayor; the name is kept unchanged here.
for frame in (ds_train, ds_test):
    group_median = frame['AdultoMayor'].map(tmp_byAdultoMayor_medianMontoMes)
    frame['flg_bySexo_mayorMedianMontoMes'] = (frame['MontoCargadoMes'] >= group_median).astype(int)

In [None]:
# Preview of the first 10 rows of the working training frame.
ds_train.head(10)

In [0]:
# New Feature 2,3,4, ...
### Here

## 3.4) Feature Selection

In [None]:
# Start from every column of the training frame, then drop the columns that
# must not be used as model inputs: the target and the row identifier.
features_to_model = ds_train.columns.tolist()
for excluded in (TARGET, ID):
    features_to_model.remove(excluded)

features_to_model

***Select Final Features:***

In [0]:
# Feature selection.
### One option: fit a tree-based model, compute the feature importances and keep the most important features.
features_to_model = features_to_model # currently keeps every feature; replace with e.g. ['var1', 'var2', 'varn']

In [None]:
# Number of features passed to the models.
len(features_to_model)

In [0]:
# Feature matrix and target vector taken from the training frame.
X, y = ds_train[features_to_model], ds_train[TARGET]

# Same feature columns for the rows that will be scored for the submission.
X_summit = ds_test[features_to_model]

In [None]:
# Shapes of the training feature matrix and of the submission feature matrix.
print("train: ", X.shape,", summit: ", X_summit.shape)

## 3.5) Train & Test Split

In [None]:
from sklearn import model_selection

# Keep 70% of the labelled rows for fitting and hold out the rest for
# evaluation; random_state fixes the shuffle so the split is reproducible.
split_parts = model_selection.train_test_split(X, y, train_size = 0.70, random_state=9)
X_train, X_test, y_train, y_test = split_parts

train_sizes = (len(X_train), len(y_train))
test_sizes = (len(X_test), len(y_test))
print(train_sizes, test_sizes)

In [None]:
# Column dtypes and non-null counts of the training split.
X_train.info()

# 4) Modeling & Evaluation - Simple

## 4.1.A. LogisticRegression

### 4.1.1 Training

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
# Create the model object.
# penalty='l1' requires a solver that supports L1 regularisation. The default
# solver of recent scikit-learn versions ('lbfgs') rejects it with a
# ValueError, so 'liblinear' is requested explicitly. n_jobs is omitted
# because it has no effect with the liblinear solver.
model_rlog = LogisticRegression(C=0.01, max_iter=100, random_state=0,
                                penalty='l1', solver='liblinear')

# Fit the model on the training split.
model_rlog.fit(X_train, y_train)

# `model` is the generic handle read by the evaluation code.
model = model_rlog

In [None]:
# Fitted logistic-regression coefficients (rounded to 4 decimals), one row per feature.
df_weights = pd.DataFrame({'feature':X_train.columns.values, 'beta': np.round(model_rlog.coef_[0],4) })
df_weights

### 4.1.2 Evaluación del Modelo

In [0]:
# Class predictions of the current model for both splits.
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

# Predicted probabilities for both splits: column 1 of predict_proba.
y_pred_proba_train, y_pred_proba_test = (
    model.predict_proba(part)[:,1] for part in (X_train, X_test)
)

In [None]:
# Accuracy of the class predictions on the training split and on the hold-out split.
accuracy_train = mt.accuracy_score(y_train, y_pred_train)
accuracy_test = mt.accuracy_score(y_test, y_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

El punto de corte por defecto es de 0.50 para decidir si la predicción final será 1 ó 0. A continuación trataremos de encontrar el punto de corte que optimice la métrica de evaluación del problema.

### **Find best threshold:**

In [0]:
# Hold-out accuracy for every cut-off 0.00, 0.01, ..., 0.99: a probability at
# or above the cut-off is labelled 1, otherwise 0.
# NOTE(review): the cut-off is tuned on the same hold-out split that is used
# to report accuracy, so the tuned test accuracy is an optimistic estimate.
list_accuracy_test = [
    mt.accuracy_score(y_test, np.where(y_pred_proba_test >= cutoff/100, 1, 0))
    for cutoff in range(0,100)
]

In [None]:
# Plot hold-out accuracy (y axis) against the cut-off value (x axis).
xs = [x/100 for x in range(0,100)]
ys = list_accuracy_test
plt.plot(xs, ys)

In [None]:
# Position of the highest hold-out accuracy (the first one in case of ties);
# position i corresponds to the cut-off i/100.
best_position = int(np.argmax(list_accuracy_test))
best_scoring = list_accuracy_test[best_position]
best_threshold = best_position/100
print("El mejor threshold es: {}".format(best_threshold))

In [None]:
# Re-score both splits, labelling a row 1 when its probability reaches best_threshold.
tuned_pred_train = np.where(y_pred_proba_train >= best_threshold, 1, 0)
tuned_pred_test = np.where(y_pred_proba_test >= best_threshold, 1, 0)
accuracy_train = mt.accuracy_score(y_train, tuned_pred_train)
accuracy_test = mt.accuracy_score(y_test, tuned_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

## 4.1.B. Decision Tree

### 4.1.1 Training

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Display an estimator built with default hyper-parameters.
DecisionTreeClassifier()

In [0]:
# Hyper-parameters of the decision tree.
tree_params = dict(max_depth=6, min_samples_leaf=5, random_state=0)

# Build the tree and fit it on the training split.
model_tree = DecisionTreeClassifier(**tree_params)
model_tree.fit(X_train, y_train)

# `model` is the generic handle read by the evaluation code.
model = model_tree

### 4.1.2 Evaluación del Modelo

In [0]:
# Class predictions of the current model for both splits.
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

# Predicted probabilities for both splits: column 1 of predict_proba.
y_pred_proba_train, y_pred_proba_test = (
    model.predict_proba(part)[:,1] for part in (X_train, X_test)
)

In [None]:
# Accuracy of the class predictions on the training split and on the hold-out split.
accuracy_train = mt.accuracy_score(y_train, y_pred_train)
accuracy_test = mt.accuracy_score(y_test, y_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

El punto de corte por defecto es de 0.50 para decidir si la predicción final será 1 ó 0. A continuación trataremos de encontrar el punto de corte que optimice la métrica de evaluación del problema.

### **Find best threshold:**

In [0]:
# Hold-out accuracy for every cut-off 0.00, 0.01, ..., 0.99: a probability at
# or above the cut-off is labelled 1, otherwise 0.
# NOTE(review): the cut-off is tuned on the same hold-out split that is used
# to report accuracy, so the tuned test accuracy is an optimistic estimate.
list_accuracy_test = [
    mt.accuracy_score(y_test, np.where(y_pred_proba_test >= cutoff/100, 1, 0))
    for cutoff in range(0,100)
]

In [None]:
# Plot hold-out accuracy (y axis) against the cut-off value (x axis).
xs = [x/100 for x in range(0,100)]
ys = list_accuracy_test
plt.plot(xs, ys)

In [None]:
# Position of the highest hold-out accuracy (the first one in case of ties);
# position i corresponds to the cut-off i/100.
best_position = int(np.argmax(list_accuracy_test))
best_scoring = list_accuracy_test[best_position]
best_threshold = best_position/100
print("El mejor threshold es: {}".format(best_threshold))

In [None]:
# Re-score both splits, labelling a row 1 when its probability reaches best_threshold.
tuned_pred_train = np.where(y_pred_proba_train >= best_threshold, 1, 0)
tuned_pred_test = np.where(y_pred_proba_test >= best_threshold, 1, 0)
accuracy_train = mt.accuracy_score(y_train, tuned_pred_train)
accuracy_test = mt.accuracy_score(y_test, tuned_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

### Feature Importances

In [None]:
# Importance of each feature in the current model, rescaled to sum to 1.
normalized_importance = model.feature_importances_/model.feature_importances_.sum()

# One row per feature, most important first (ties ordered by feature name, descending).
df_feature_importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': normalized_importance})
    .sort_values(by = ['importance','feature'], ascending=False)
    .reset_index(drop = True)
)

df_feature_importances

In [None]:
# Horizontal bar chart of the importances; rows are sorted ascending before plotting.
importance_ascending = df_feature_importances[['feature','importance']].sort_values(
    by=['importance'], ascending = [True])
importance_ascending.plot(kind='barh', x='feature', y='importance',
                          legend=True, figsize=(5, 5))

## 4.1.C. Random Forest

### 4.1.1 Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Display an estimator built with default hyper-parameters.
RandomForestClassifier()

In [0]:
# Hyper-parameters of the random forest.
rf_params = dict(
    n_estimators = 150,
    max_depth = 5,
    max_features = 0.5,
    min_samples_leaf = 10,
    random_state = 0,
    n_jobs = -1,
)

# Build the forest and fit it on the training split.
model_rf = RandomForestClassifier(**rf_params)
model_rf.fit(X_train, y_train)

# `model` is the generic handle read by the evaluation code.
model = model_rf

### 4.1.2 Evaluación del Modelo

In [0]:
# Class predictions of the current model for both splits.
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

# Predicted probabilities for both splits: column 1 of predict_proba.
y_pred_proba_train, y_pred_proba_test = (
    model.predict_proba(part)[:,1] for part in (X_train, X_test)
)

In [None]:
# Accuracy of the class predictions on the training split and on the hold-out split.
accuracy_train = mt.accuracy_score(y_train, y_pred_train)
accuracy_test = mt.accuracy_score(y_test, y_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

El punto de corte por defecto es de 0.50 para decidir si la predicción final será 1 ó 0. A continuación trataremos de encontrar el punto de corte que optimice la métrica de evaluación del problema.

### **Find best threshold:**

In [0]:
# Hold-out accuracy for every cut-off 0.00, 0.01, ..., 0.99: a probability at
# or above the cut-off is labelled 1, otherwise 0.
# NOTE(review): the cut-off is tuned on the same hold-out split that is used
# to report accuracy, so the tuned test accuracy is an optimistic estimate.
list_accuracy_test = [
    mt.accuracy_score(y_test, np.where(y_pred_proba_test >= cutoff/100, 1, 0))
    for cutoff in range(0,100)
]

In [None]:
# Plot hold-out accuracy (y axis) against the cut-off value (x axis).
xs = [x/100 for x in range(0,100)]
ys = list_accuracy_test
plt.plot(xs, ys)

In [None]:
# Position of the highest hold-out accuracy (the first one in case of ties);
# position i corresponds to the cut-off i/100.
best_position = int(np.argmax(list_accuracy_test))
best_scoring = list_accuracy_test[best_position]
best_threshold = best_position/100
print("El mejor threshold es: {}".format(best_threshold))

In [None]:
# Re-score both splits, labelling a row 1 when its probability reaches best_threshold.
tuned_pred_train = np.where(y_pred_proba_train >= best_threshold, 1, 0)
tuned_pred_test = np.where(y_pred_proba_test >= best_threshold, 1, 0)
accuracy_train = mt.accuracy_score(y_train, tuned_pred_train)
accuracy_test = mt.accuracy_score(y_test, tuned_pred_test)

print("Accuracy - Train: {}".format(accuracy_train))
print("Accuracy - Test : {}".format(accuracy_test))

### Feature Importances

In [None]:
# Importance of each feature in the current model, rescaled to sum to 1.
normalized_importance = model.feature_importances_/model.feature_importances_.sum()

# One row per feature, most important first (ties ordered by feature name, descending).
df_feature_importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': normalized_importance})
    .sort_values(by = ['importance','feature'], ascending=False)
    .reset_index(drop = True)
)

df_feature_importances

In [None]:
# Horizontal bar chart of the importances; rows are sorted ascending before plotting.
importance_ascending = df_feature_importances[['feature','importance']].sort_values(
    by=['importance'], ascending = [True])
importance_ascending.plot(kind='barh', x='feature', y='importance',
                          legend=True, figsize=(5, 5))

**MODELO FINAL**

Como se puede notar, de los 3 tipos de algoritmos entrenados, el modelo basado en Random Forest es el ganador, con un accuracy optimizado por el punto de corte (threshold: 0.36).

# Predicciones on Submission DS

In [0]:
# Score the submission rows with the random forest and binarise the
# probabilities (column 1 of predict_proba) at best_threshold.
# NOTE(review): best_threshold holds whatever value the most recently executed
# threshold search produced; it matches the random forest only if that search
# was the last one run.
pred_prob_subm = model_rf.predict_proba(X_summit)[:,1]
pred_subm = np.where(pred_prob_subm >= best_threshold, 1, 0).tolist()

In [None]:
# Submission table: the row identifier from the raw test data plus the 0/1
# prediction (swap in pred_prob_subm to submit probabilities instead).
Y_summit_pred = pd.DataFrame({ID: df_test[ID], TARGET: pred_subm})
Y_summit_pred.head()

To submission:

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
Y_summit_pred.to_csv("krfc_submission_01_baseline.csv", index = False)