# TP 2 methods

In [107]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import sklearn.metrics

In [108]:
df = pd.read_csv('celldata.csv')

In [109]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Salary,Churn
0,632,Germany,Female,50,5,107959.39,1,1,1,6985,1
1,649,France,Female,42,7,0.0,2,0,1,22974,0
2,595,France,Male,29,6,150685.79,1,1,0,87771,0
3,653,Spain,Male,35,6,116662.96,2,1,1,23864,0
4,559,Spain,Female,40,7,144470.77,1,1,1,18918,0


In [110]:
Y = df['Churn']
X = df.drop(columns = ['Churn'])

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, 
    test_size=0.2,
    random_state=42,
    stratify=Y
)

In [111]:
categorical_cols = ['Geography', 'Gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

## Classification Decision Tree

In [112]:
model_tree = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('tree', DecisionTreeClassifier())
])

modelfit = model_tree.fit(X_train, Y_train)
pY_train = modelfit.predict_proba(X_test)

In [113]:
predxclass = np.argmax(pY_train, axis=1)
E_train = (Y_test != predxclass).sum()/len(Y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)

The accuracy on the training set is %5.2f-> 0.785


### Scores for classification decision tree

In [114]:
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(Y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(Y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(Y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(Y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(Y_test, model_tree.predict_proba(X_test)[:,1])}")

Confusion matrix : [[1095  183]
 [ 161  161]]
Recall : 0.5
Precision : 0.4680232558139535
F1-score : 0.48348348348348347
ROC-AUC score : 0.6784037558685447


## Bagging

In [115]:
model_bagging = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', BaggingClassifier(n_estimators=10, random_state=0))
])

modelfit = model_bagging.fit(X_train, Y_train)
pY_train = modelfit.predict_proba(X_test)

In [116]:
predxclass = np.argmax(pY_train, axis=1)
E_train = (Y_test != predxclass).sum()/len(Y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)

The accuracy on the training set is %5.2f-> 0.85625


### Scores for bagging

In [117]:
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(Y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(Y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(Y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(Y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(Y_test, model_bagging.predict_proba(X_test)[:,1])}")

Confusion matrix : [[1216   62]
 [ 168  154]]
Recall : 0.4782608695652174
Precision : 0.7129629629629629
F1-score : 0.5724907063197026
ROC-AUC score : 0.824353123572352


## Random Forest

In [118]:
model_random_forest = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', RandomForestClassifier(max_depth=2000, random_state=0))
])

modelfit = model_random_forest.fit(X_train, Y_train)
pY_train = modelfit.predict_proba(X_test)

In [119]:
predxclass = np.argmax(pY_train, axis=1)
E_train = (Y_test != predxclass).sum()/len(Y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)

The accuracy on the training set is %5.2f-> 0.865


### Scores for random forest

In [120]:
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(Y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(Y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(Y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(Y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(Y_test, model_random_forest.predict_proba(X_test)[:,1])}")

Confusion matrix : [[1228   50]
 [ 166  156]]
Recall : 0.484472049689441
Precision : 0.7572815533980582
F1-score : 0.5909090909090909
ROC-AUC score : 0.8498381593911295


## Extra Trees

In [125]:
model_extra_trees = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', ExtraTreesClassifier(max_depth=2000, random_state=0, bootstrap=True, oob_score=True))
])

modelfit = model_extra_trees.fit(X_train, Y_train)
pY_train = modelfit.predict_proba(X_test)

In [126]:
predxclass = np.argmax(pY_train, axis=1)
E_train = (Y_test != predxclass).sum()/len(Y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)

The accuracy on the training set is %5.2f-> 0.859375


### Scores for extra trees

In [127]:
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(Y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(Y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(Y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(Y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(Y_test, model_extra_trees.predict_proba(X_test)[:,1])}")

Confusion matrix : [[1235   43]
 [ 182  140]]
Recall : 0.43478260869565216
Precision : 0.7650273224043715
F1-score : 0.5544554455445545
ROC-AUC score : 0.8430862955510843
