In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# After the loop, `dirname` and `filename` still hold the values of the last file
# visited; the cell that calls pd.read_csv builds its path from those two names.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# The path is assembled from the (dirname, filename) pair left behind by the
# os.walk listing loop, i.e. the last file that loop printed.
csv_path = os.path.join(dirname, filename)
df = pd.read_csv(csv_path)
# Quick look at the first rows and at the column dtypes / non-null counts.
df.head()
df.info()

Limpio los valores NaN y elimino la columna con código de cliente.

In [None]:
# Clean the raw frame by rebinding `df` instead of mutating it in place, so the cell
# can be re-run: the original in-place drop raised KeyError on a second execution
# because 'customerID' was already gone.
df = (
    df
    # customerID is an identifier, not a feature; errors='ignore' keeps this step
    # a no-op when the column has already been removed.
    .drop(columns='customerID', errors='ignore')
    # Entries that are exactly one space are treated as missing values.
    .assign(TotalCharges=lambda frame: frame['TotalCharges'].replace(" ", np.nan))
    # Discard every row that has a missing value in any column.
    .dropna(how='any')
    # With the blanks gone the charges can be parsed as numbers.
    .astype({'TotalCharges': float, 'SeniorCitizen': int})
)



**Reviso el porcentaje de clientes dados de baja sobre el total para ver el balance de los labels.**

In [None]:
# Count each Churn value once and reuse the result for both labels and wedge sizes.
churn_counts = df['Churn'].value_counts()
labels = churn_counts.index
values = churn_counts.values

# Pie chart showing the share of each Churn value, labelled with one-decimal percentages.
plt.figure(figsize=(7, 7))
plt.pie(values, labels=labels, autopct='%1.1f%%')
plt.title('Churn Status', color='black', fontsize=10)
plt.show()

**Cantidad de meses que los clientes mantuvieron el servicio, según el tipo de plan (Month to Month Contract, One Year Contract, Two Years Contract).**

In [None]:
# One histogram of tenure per contract type, side by side with a shared y axis.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(20, 6))

# (Contract value, bar colour, target axes, panel title, extra text kwargs)
panels = [
    ('Month-to-month', 'turquoise', ax1, 'Month to Month Contract', {}),
    ('One year', 'steelblue', ax2, 'One Year Contract', {'size': 14}),
    ('Two year', 'darkblue', ax3, 'Two Year Contract', {}),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the figure is produced exactly as before.
for contract, colour, target_ax, panel_title, text_kws in panels:
    tenure = df.loc[df['Contract'] == contract, 'tenure']
    ax = sns.distplot(
        tenure,
        hist=True,
        kde=False,
        bins=int(180/5),
        color=colour,
        hist_kws={'edgecolor': 'black'},
        kde_kws={'linewidth': 4},
        ax=target_ax,
    )
    # Only the left-most panel gets a y label; the axis is shared.
    if target_ax is ax1:
        ax.set_ylabel('# de Clientes')
    ax.set_xlabel('Tenure (months)', **text_kws)
    title = ax.set_title(panel_title, **text_kws)



**Transformo todas las columnas que tengan valores Yes/No a valores numéricos 0 o 1.**

In [None]:
# Columns whose answers are encoded as the strings "Yes" / "No".
columns_to_convert = [
    'Partner', 'Dependents', 'PhoneService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'PaperlessBilling', 'Churn',
]

# Map Yes -> 1 and No -> 0; any other value in these columns is left untouched.
yes_no_codes = {'Yes': 1, 'No': 0}

# pandas 2.2 deprecates the implicit downcast that replace() used to perform, so the
# numeric dtype is requested explicitly with infer_objects(). Without it, columns
# that were fully converted (notably the target 'Churn') can remain object-typed.
# Columns that still hold other strings stay as object.
df[columns_to_convert] = (
    df[columns_to_convert]
    .replace(yes_no_codes)
    .infer_objects()
)


**Transformo las columnas con valores categóricos a valores numéricos.**

In [None]:
# Multi-valued categorical columns that are expanded into indicator columns.
categorical_columns = ['gender', 'MultipleLines', 'InternetService', 'Contract', 'PaymentMethod']

df = pd.get_dummies(df, columns=categorical_columns)

# Sanity check for missing values: a yes/no answer first, then the total count
# (only the last expression is rendered as the cell output).
missing_mask = df.isna()
missing_mask.values.any()
missing_mask.sum().sum()

**Divido el dataset en datos de train/test 70/30**


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
# Target vector and feature matrix; any column that is still categorical is
# one-hot encoded with its first level dropped.
y = df['Churn']
features = df.drop(columns='Churn')
X = pd.get_dummies(features, drop_first=True)

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=41,
)



Estandarizo las columnas numéricas y luego las vuelvo a incorporar al dataset.

In [None]:
from sklearn.preprocessing import StandardScaler


# Continuous columns that are standardised.
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Work on explicit copies so the column assignments below never write into a
# slice of the frame the split was taken from.
X_train = X_train.copy()
X_test = X_test.copy()

ss = StandardScaler()
# Learn the scaling statistics from the training rows only...
X_train[numerical_columns] = ss.fit_transform(X_train[numerical_columns])
# ...and apply those same statistics to the test rows. Re-fitting on the test set
# (as the original fit_transform call did) scales the two sets differently and
# uses test-set information during preprocessing.
X_test[numerical_columns] = ss.transform(X_test[numerical_columns])

X_train.info()


In [None]:
from imblearn.over_sampling import SMOTE
# Fail with a clear message if any missing value is left in the features; the
# original np.where(np.isnan(...)) result was computed and then discarded.
assert not X_train.isna().to_numpy().any(), "X_train contains NaN values"

# Oversample the training split only; the seed makes the synthetic rows repeatable.
# NOTE(review): the cross-validation cells further down run on this resampled set,
# so synthetic rows can end up in validation folds - confirm this is intended.
oversample = SMOTE(random_state=41)
X_train, y_train = oversample.fit_resample(X_train, y_train)
X_train.info()




In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Shared splitter: 10 stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=41)


def evaluar_rendimiento(modelo, nombre, X_train, y_train, cv):
    """Cross-validate a model and print the mean and spread of its fold scores.

    Args:
        modelo: estimator handed to ``cross_val_score``.
        nombre: label printed in front of the result.
        X_train: feature matrix used for cross-validation.
        y_train: target values aligned with ``X_train``.
        cv: cross-validation splitter (or fold count) forwarded as ``cv``.

    Returns:
        None. The result is printed as ``mean ± std``. No ``scoring`` argument is
        passed, so ``cross_val_score`` uses its default scoring.
    """
    puntajes = cross_val_score(modelo, X_train, y_train, cv=cv, n_jobs=-1)
    media = puntajes.mean().round(3)
    desvio = puntajes.std().round(3)
    print("Rendimiento de {}:\t{:0.3} ± {:0.3}".format(nombre, media, desvio))

# Baseline scores with default hyper-parameters (no grid search yet). The labels
# deliberately omit "+ GS" so these lines cannot be confused with the tuned
# results printed after the grid searches. Seeds make the scores repeatable.
dt = DecisionTreeClassifier(random_state=41)
evaluar_rendimiento(dt, "Árbol de decisión", X_train, y_train, cv)

ab = AdaBoostClassifier(random_state=41)
gb = GradientBoostingClassifier(random_state=41)
evaluar_rendimiento(ab, "AdaBoostClassifier", X_train, y_train, cv)
evaluar_rendimiento(gb, "GradientBoostingClassifier", X_train, y_train, cv)

In [None]:
# scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Pick whichever keyword the installed
# version exposes so the cell runs on both old and new releases.
ab_estimator_kw = (
    "estimator"
    if "estimator" in AdaBoostClassifier().get_params(deep=False)
    else "base_estimator"
)

# Search space: ensemble size, learning rate and depth of the underlying tree.
params_ab = {"n_estimators": [100, 500],
             "learning_rate": [0.01, 0.1, 1.0],
             ab_estimator_kw + "__max_depth": [1, 2, 3]}

grid_ab = GridSearchCV(AdaBoostClassifier(**{ab_estimator_kw: DecisionTreeClassifier()}),
                       param_grid=params_ab, cv=cv, verbose=1, n_jobs=-1)
grid_ab.fit(X_train, y_train)

In [None]:
# Cross-validate the best AdaBoost configuration found by grid_ab, using the same
# splitter and training data as the search itself.
# NOTE(review): scoring the selected model on the folds that selected it tends to
# be optimistic - confirm this is only meant as a rough comparison.
evaluar_rendimiento(grid_ab.best_estimator_, "AdaBoostClassifier + GS", X_train, y_train, cv)

In [None]:
# Search space for the gradient-boosting grid search. The learning rates are a
# log-spaced ladder; the original list repeated 0.001 and skipped 0.01, which
# re-fitted identical candidates instead of covering the intermediate value.
params_gb = {'n_estimators': [100, 500],
             'learning_rate': [0.001, 0.01, 0.1, 1.0],
             'max_depth': [1, 2, 3, 4]}

# Exhaustive search over params_gb for the gradient-boosting model `gb`,
# cross-validated with the shared StratifiedKFold splitter `cv`. Only the search
# object is built here; fitting happens in the following cell.
grid_gb = GridSearchCV(gb, param_grid=params_gb, cv=cv, verbose=1, n_jobs=-1)

In [None]:
# Run the gradient-boosting grid search on the training data (which at this point
# has already been resampled by SMOTE).
grid_gb.fit(X_train, y_train)

In [None]:
# Cross-validate the best gradient-boosting configuration found by grid_gb, using
# the same splitter and training data as the search itself.
# NOTE(review): scoring the selected model on the folds that selected it tends to
# be optimistic - confirm this is only meant as a rough comparison.
evaluar_rendimiento(grid_gb.best_estimator_, "GradientBoostingClassifier + GS", X_train, y_train, cv)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Shallow, seeded decision tree fitted on the training data; the next cell reads
# its feature importances.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=4)
treeclf.fit(X=X_train, y=y_train)

In [None]:
# Rank the features by the importance the fitted tree assigned to them and show
# the first rows of the ranking.
atributos = X_train.columns

tabla_importancias = pd.DataFrame({
    'Atributo': atributos,
    'importancia': treeclf.feature_importances_,
})
tabla_importancias.sort_values(by='importancia', ascending=False).head()

In [None]:
import xgboost as xgb
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


# XGBoost classifier left at its default hyper-parameters (only n_jobs is set);
# it is the estimator tuned by the randomized search at the end of this cell.
model_xgb = xgb.XGBClassifier(n_jobs=-1)
# Beta(10, 1) puts most of its mass close to 1, so the sampled ratios lie in
# (0, 1) and are mostly near 1.
one_to_left = st.beta(10, 1)

# Distributions sampled by the randomized search. scipy's uniform(loc, scale)
# covers [loc, loc + scale]; randint(low, high) excludes `high`.
params = dict(
    n_estimators=st.randint(20, 40),        # number of trees in the ensemble
    max_depth=st.randint(3, 12),            # maximum depth of each tree
    learning_rate=st.uniform(0.05, 0.4),    # learning rate ("eta")
    colsample_bytree=one_to_left,           # fraction of features sampled per tree
    subsample=one_to_left,                  # fraction of rows sampled per tree
    gamma=st.uniform(0, 10),                # minimum loss reduction required to keep splitting
    reg_alpha=st.uniform(0.05, 10),         # L1 regularisation term on the weights
    min_child_weight=st.uniform(1, 20),     # minimum sum of instance weight (hessian) needed in a child
)
# Randomized search: 25 hyper-parameter combinations drawn from `params`.
# random_state fixes which combinations are drawn, so a re-run evaluates the same
# candidates; without it every execution sampled a different set.
xgb_cv = RandomizedSearchCV(model_xgb, params, n_iter=25, verbose=True, random_state=41)
xgb_cv.fit(X_train, y_train)



In [None]:
from scikitplot.metrics import plot_roc
from sklearn.metrics import roc_auc_score

# Class probabilities on the held-out test set from the tuned XGBoost search.
y_predicted_xgb = xgb_cv.predict_proba(X_test)

# The AUC is computed from the second probability column (index 1).
positive_scores = y_predicted_xgb[:, 1]
xgb_auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
print("El valor del AUC es: ", xgb_auc)

# ROC curves only, with the micro/macro averages switched off.
plot_roc(y_test, y_predicted_xgb, plot_micro=False, plot_macro=False)

In [None]:
# Plot feature importances of the best XGBoost model found by the search, limited
# to 5 features; the trailing semicolon suppresses the text repr of the return value.
xgb.plot_importance(xgb_cv.best_estimator_, height=0.8, max_num_features=5);