In [19]:
import pandas as pd
import numpy as np
from tabulate import tabulate

def missing_zero_values_table(df):
    zero_val = (df == 0.00).astype(int).sum(axis=0)
    unique_val = data.nunique()
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mz_table = pd.concat([unique_val, zero_val, mis_val, mis_val_percent], axis=1)
    mz_table = mz_table.rename(
        columns={0: 'Unique Values', 1: 'Zero Values', 2: 'Missing Values', 3: '% of Total Values'})
    mz_table['Total Zero / Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
    mz_table['% Total Zero / Missing Values'] = 100 * mz_table['Total Zero / Missing Values'] / len(df)
    mz_table['Data Type'] = df.dtypes
    # This shows only the zero value rows
    #mz_table = mz_table[
    #    mz_table.iloc[:, 1] != 0].sort_values(
    #    '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"
                                                                                                   "There are " + str(
        mz_table.shape[0]) +
          " columns that have missing values.")
    #         mz_table.to_excel('D:/sampledata/missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
    return mz_table

def normalize_data(df,column,replaced,replacement):
    for i in column: df[i] = df[i].replace(
                        to_replace=[replaced],
                        value=[replacement])


############################ START CODE ######################################

# Import Data
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',
                   converters={'SeniorCitizen': lambda x: pd.to_numeric(x, errors='coerce'),
                                'tenure': lambda x: pd.to_numeric(x, errors='coerce'),
                                'MonthlyCharges': lambda x: pd.to_numeric(x, errors='coerce'),
                               'TotalCharges': lambda x: pd.to_numeric(x, errors='coerce'),
                               })

# Normalize values to Yes / No
normalize_data(data, ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'], 'No internet service', 'No')

# Validation of NULL & Zero
r = missing_zero_values_table(data)
print(tabulate(r, headers='keys', tablefmt='psql'))

Your selected dataframe has 21 columns and 7043 Rows.
There are 21 columns that have missing values.
+------------------+-----------------+---------------+------------------+---------------------+-------------------------------+---------------------------------+-------------+
|                  |   Unique Values |   Zero Values |   Missing Values |   % of Total Values |   Total Zero / Missing Values |   % Total Zero / Missing Values | Data Type   |
|------------------+-----------------+---------------+------------------+---------------------+-------------------------------+---------------------------------+-------------|
| customerID       |            7043 |             0 |                0 |            0        |                             0 |                        0        | object      |
| gender           |               2 |             0 |                0 |            0        |                             0 |                        0        | object      |
| SeniorCitizen    

In [4]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7020.0,7028.0
mean,0.162147,32.371149,64.669266,2278.85392
std,0.368612,24.559481,30.146503,2264.843268
min,0.0,0.0,0.0,0.0
25%,0.0,9.0,35.4,399.4
50%,0.0,29.0,70.325,1394.8
75%,0.0,55.0,89.85,3785.3
max,1.0,72.0,118.75,8684.8


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [14]:
min_max_scaler = preprocessing.MinMaxScaler()

data = data.dropna()
X = pd.get_dummies(data.drop('Churn', axis=1))
x_scaled = x_scaled = min_max_scaler.fit_transform(X)
X = pd.DataFrame(x_scaled)

lab_enc = LabelEncoder()
y = lab_enc.fit_transform(data['Churn'])

In [15]:
display(X.head(), y)
print(f"X: {X.shape}\ny:{y.shape}")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7038,7039,7040,7041,7042,7043,7044,7045,7046,7047
0,0.0,0.0,0.251368,0.001275,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.464789,0.479579,0.215867,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.014085,0.453474,0.01031,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.619718,0.356211,0.210241,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.014085,0.595368,0.01533,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


array([0, 0, 1, ..., 0, 1, 0])

X: (7009, 7048)
y:(7009,)


In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold

skd = StratifiedKFold(n_splits=4, random_state=41, shuffle=True)

def evaluar(modelo):
    s = cross_val_score(modelo, X, y, cv=skd, n_jobs=-1)
    print(f"Rendimiento de {modelo.__class__.__name__:<25}:  {s.mean():0.3f} ± {s.std():0.3f}")

evaluar(DecisionTreeClassifier())

Rendimiento de DecisionTreeClassifier   :  0.768 ± 0.008


In [17]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

evaluar(DecisionTreeClassifier())
evaluar(BaggingClassifier(DecisionTreeClassifier(), n_estimators=100))
evaluar(RandomForestClassifier(n_estimators=100))
evaluar(ExtraTreesClassifier(n_estimators=100))

Rendimiento de DecisionTreeClassifier   :  0.770 ± 0.007
Rendimiento de BaggingClassifier        :  0.798 ± 0.002
Rendimiento de RandomForestClassifier   :  0.791 ± 0.005
Rendimiento de ExtraTreesClassifier     :  0.782 ± 0.003


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200], 
              'max_features': [1, 5, 8, 10, 21], 
              'max_depth': [5, 20, 50, 70, 100], 
              'min_samples_leaf':[1, 5, 8, 10, 50]}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=skd, verbose=1, n_jobs=-1)

In [23]:
%%time
grid_search.fit(X, y)
display(grid_search.best_estimator_)

Fitting 4 folds for each of 375 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  5.1min finished


RandomForestClassifier(max_depth=100, max_features=21)

Wall time: 5min 20s


In [24]:
grid_search.best_score_

0.7916962148384895