In [24]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import time

import warnings
warnings.filterwarnings('ignore')

# sklearn

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin

from telcoFunc import *

import features_creation as fc
from features_creation import *

import inspect, re


from tqdm import tqdm
import gc

In [25]:

tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

target = 'Churn'

ID_col = 'customerID'

assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

tcc['TotalCharges']= tcc['TotalCharges'].apply(lambda x: x if x!= ' ' else np.nan).astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No',  value=0, inplace=True)

In [27]:
train = pd.read_csv('train_new1.csv')
test = pd.read_csv('test_new1.csv')

In [28]:
train.shape

(5282, 27)

In [31]:
X_train = train.iloc[:,:-1]
X_test = test.iloc[:,:-1]
y_train = train.iloc[:,-1]
y_test = test.iloc[:,-1]

In [None]:
parameter_space = {
    "min_samples_leaf": range(1, 10, 3), 
    "min_samples_split": range(1, 10, 3),
    "max_depth": range(5, 16, 5),
    "max_leaf_nodes": [None] + list(range(20, 70, 20)), 
    "n_estimators": range(10, 160, 70), 
    "max_features":['sqrt', 'log2'] + list(range(3, 9)), 
    "max_samples":[None, 0.4, 0.5, 0.6]}

RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(RF_0, parameter_space, n_jobs=-1)

grid_RF_0.fit(X_train, y_train)

In [34]:
grid_RF_0.best_score_

0.8091627605286547

In [35]:
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8277167739492617, 0.7950028392958546)

In [36]:
grid_RF_0.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'max_leaf_nodes': 60,
 'max_samples': 0.5,
 'min_samples_leaf': 7,
 'min_samples_split': 4,
 'n_estimators': 10}

In [None]:
parameter_space = {
    "min_samples_leaf": range(6, 9), 
    "min_samples_split": range(3, 6),
    "max_depth": range(9, 12),
    "max_leaf_nodes": [None] + list(range(58, 63, 2)), 
    "n_estimators": range(8, 13, 2), 
    "max_features":['sqrt', 'log2'] + list(range(4, 7)), 
    "max_samples":[None, 0.48, 0.5, 0.52]}

RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(RF_0, parameter_space, n_jobs=-1)

grid_RF_0.fit(X_train, y_train)

In [38]:
grid_RF_0.best_score_

0.8091627605286547

In [39]:
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8277167739492617, 0.7950028392958546)

In [40]:
grid_RF_0.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'max_leaf_nodes': 60,
 'max_samples': 0.5,
 'min_samples_leaf': 7,
 'min_samples_split': 3,
 'n_estimators': 10}

In [None]:
parameter_space = {
    "min_samples_leaf": range(6, 9), 
    "min_samples_split": range(2, 5),
    "max_depth": range(9, 12),
    "max_leaf_nodes": [None] + list(range(58, 62)), 
    "n_estimators": range(9, 12), 
    "max_features":['sqrt', 'log2'] + list(range(4, 7)), 
    "max_samples":[0.49, 0.5, 0.51]}

RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(RF_0, parameter_space, n_jobs=-1)

grid_RF_0.fit(X_train, y_train)

In [42]:
grid_RF_0.best_score_

0.8091627605286547

In [43]:
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8277167739492617, 0.7950028392958546)

In [44]:
grid_RF_0.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'max_leaf_nodes': 60,
 'max_samples': 0.5,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 10}

|Models|CV.best_score_|train_score|test_score|
|:--:|:--:|:--:|:--:|
|Logistic+grid|0.8045|0.8075|0.7956|
|RF+grid_R1|0.8092|0.8277|0.7950|
|RF+grid_R2|0.8092|0.8277|0.7950|
|RF+grid_R3|0.8092|0.8277|0.7950|