In [336]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

In [337]:
df = pd.read_csv('./Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [338]:
df.shape

(10000, 14)

In [339]:
df.groupby('Gender').sum()

Unnamed: 0_level_0,RowNumber,CustomerId,Surname,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Female,22455729,71284888959,HargraveHillOnioBoniMitchellObinnaKayChinScott...,2956727,FranceSpainFranceFranceSpainGermanyFranceFranc...,178260,22561,343720500.0,7015,3192,2284,457032800.0,1139
Male,27549271,85624516735,ChuBartlettHeH?BearceAndrewsGoforthRomeoMuldro...,3548561,SpainFranceFranceFranceFranceSpainGermanyGerma...,210958,27567,421138400.0,8287,3863,2867,543869600.0,898


In [340]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [341]:
# CreditScore	Geography	Gender	Age	Tenure	Balance	NumOfProducts	HasCrCard	IsActiveMember	EstimatedSalary	

le = LabelEncoder()
df['Geography'] = le.fit_transform(df['Geography'])
df['Gender'] = le.fit_transform(df['Gender'])

In [342]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [343]:
#col_scal = df['CreditScore','Age','Balance','EstimatedSalary']
scaler = MinMaxScaler()
df[['CreditScore','Age','Balance','EstimatedSalary']] = scaler.fit_transform(df[['CreditScore','Age','Balance','EstimatedSalary']])

In [344]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,0.538,0,0,0.324324,2,0.0,1,1,1,0.506735,1
1,2,15647311,Hill,0.516,2,0,0.310811,1,0.334031,1,0,1,0.562709,0
2,3,15619304,Onio,0.304,0,0,0.324324,8,0.636357,3,1,0,0.569654,1
3,4,15701354,Boni,0.698,0,0,0.283784,1,0.0,2,0,0,0.46912,0
4,5,15737888,Mitchell,1.0,2,0,0.337838,2,0.500246,1,1,1,0.3954,0


In [345]:
# 'CreditScore',	'Geography',	'Gender',	'Age',	'Tenure',	'Balance',	'NumOfProducts',	'HasCrCard',	'IsActiveMember',	'EstimatedSalary'	

In [346]:
x = df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance','NumOfProducts', 'IsActiveMember']]
y= df['Exited']

In [347]:
best = SelectKBest(k=8)
x_best = best.fit_transform(x,y)
selected = best.get_support(indices=True)

In [348]:
x.columns[selected]

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'IsActiveMember'],
      dtype='object')

In [349]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

lr = LogisticRegression()
lr.fit(x_train,y_train)
np.set_printoptions(suppress=True) # Suprime la notacion cientifica si es necesario
y_pred = lr.predict(x_test)

In [350]:
confusion_matrix(y_test,y_pred)


array([[1565,   42],
       [ 324,   69]], dtype=int64)

In [351]:
accuracy_score(y_test,y_pred)

0.817

In [352]:
params = {'n_neighbors': [2,3,4,5,6,7,8,9,10]}

knn = KNeighborsClassifier()
model = GridSearchCV(knn,params,cv=5)
model.fit(x_train,y_train)
model.best_params_

{'n_neighbors': 2}

In [353]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train,y_train)
y_pred_knn = knn.predict(x_test)

In [354]:
accuracy_score(y_test,y_pred_knn)

0.8255

In [355]:
params = {'max_depth': [2,3,4,5,6,7,8,9,10]}
arbol = DecisionTreeClassifier(criterion='entropy',random_state=0)
model_2 = GridSearchCV(arbol,params,cv=10)
model_2.fit(x_train,y_train)
model_2.best_params_ 

{'max_depth': 5}

In [356]:
arbol = DecisionTreeClassifier(criterion='entropy',max_depth=8,random_state=0)
arbol.fit(x_train,y_train)
y_pred_arbol = arbol.predict(x_test)

In [357]:
accuracy_score(y_test,y_pred_arbol)

0.857

In [358]:
num_folds = 10

kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
for train_index, test_index in kfold.split(x):
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    lr.fit(X_train, y_train)
    score_lr = lr.score(X_test, y_test)

    arbol.fit(X_train, y_train)
    score_arbol = arbol.score(X_test, y_test)

    knn.fit(X_train, y_train)
    score_knn = knn.score(X_test, y_test)


print('Score de Regresion Logistica: ',score_lr)
print('Score de Arbol de decision: ',score_arbol)
print('Score de KNN: ',score_knn)

Score de Regresion Logistica:  0.824
Score de Arbol de decision:  0.846
Score de KNN:  0.834
