In [275]:
#Data
import numpy as np
import pandas as pd

# Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Metricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import fbeta_score

# Plot
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Modelos Propuestos:
#### <li> Regresion lineal => SGDClassifier ✅
#### <li> Arbol de decision ✅
#### <li> K vecinos mas cercanos
#### <li> Regresion logistica
#### <li> SVM
#### <li> Random forest

Carga de Datos y Filtrado de Datos

In [276]:
train = pd.read_csv("travel_insurance_prediction_train.csv")
test = pd.read_csv("travel_insurance_prediction_test.csv")

train["Employment Type"] = train["Employment Type"].replace(["Private Sector/Self Employed"], 0)
train["Employment Type"] = train["Employment Type"].replace(["Government Sector"], 1)
test["Employment Type"] = test["Employment Type"].replace(["Private Sector/Self Employed"], 0)
test["Employment Type"] = test["Employment Type"].replace(["Government Sector"], 1)

train["GraduateOrNot"] = train["GraduateOrNot"].replace(["Yes"], 0)
train["GraduateOrNot"] = train["GraduateOrNot"].replace(["No"], 1)
test["GraduateOrNot"] = test["GraduateOrNot"].replace(["Yes"], 0)
test["GraduateOrNot"] = test["GraduateOrNot"].replace(["No"], 1)

train["FrequentFlyer"] = train["FrequentFlyer"].replace(["Yes"], 0)
train["FrequentFlyer"] = train["FrequentFlyer"].replace(["No"], 1)
test["FrequentFlyer"] = test["FrequentFlyer"].replace(["Yes"], 0)
test["FrequentFlyer"] = test["FrequentFlyer"].replace(["No"], 1)

train["EverTravelledAbroad"] = train["EverTravelledAbroad"].replace(["Yes"], 0)
train["EverTravelledAbroad"] = train["EverTravelledAbroad"].replace(["No"], 1)
test["EverTravelledAbroad"] = test["EverTravelledAbroad"].replace(["Yes"], 0)
test["EverTravelledAbroad"] = test["EverTravelledAbroad"].replace(["No"], 1)

#train = train.drop(columns=["Customer"])

Aqui me surgio la duda sobre si esta bien reemplazar las palabras por numeros 0 y 1, o deberiamos aplicar un one hot encoder. Las variables son categoricas son binarias. 

In [277]:
X, y = train.iloc[:, 0:8], train.TravelInsurance

In [278]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

In [279]:
X_train.shape
X_test.shape

(1192, 8)

In [280]:
X_train["Employment Type"].value_counts()

0    205
1     93
Name: Employment Type, dtype: int64

-----------------------------------------------------------------------------------------------
# SGD Classifier

In [281]:
clf = make_pipeline(StandardScaler(),
                   SGDClassifier(random_state=42))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier(random_state=42))])

### Metricas en Entrenamiento

In [283]:
y_pred_train = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred_train))
print('Precision Score: ', precision_score(y_train, y_pred_train))
print('F1 Score: ', f1_score(y_train, y_pred_train))
print('Recall Score: ', recall_score(y_train, y_pred_train))
print('fbeta_score: ', fbeta_score(y_train, y_pred_train ,average='binary', beta=0.5))

Accuracy Score:  0.6912751677852349
Precision Score:  0.5446428571428571
F1 Score:  0.5700934579439252
Recall Score:  0.5980392156862745
fbeta_score:  0.5545454545454545


### Metricas en Test

In [285]:
y_pred_test = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))
print('fbeta_score: ', fbeta_score(y_test, y_pred_test,average='binary', beta=0.5))

Accuracy Score:  0.7114093959731543
Precision Score:  0.5990783410138248
F1 Score:  0.6018518518518517
Recall Score:  0.6046511627906976
fbeta_score:  0.6001846722068328


Habria que analizar porque esta dando una accuracy mejor en test que en el mismo entrenamiento. Puede ser por los pocos datos que tenemos?

In [286]:
# Selección de parámetros
param_grid = {
    'loss': ['perceptron','hinge','log','squared_loss','epsilon_insensitive'],
    'penalty' : ['l2', 'l1', 'none'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1],
    'random_state': [42],
    'learning_rate': ['optimal','constant','invscaling'],
    'eta0': [0.0001, 0.001, 0.01, 0.1]
}

best_clf = make_pipeline(StandardScaler(), GridSearchCV(SGDClassifier(), param_grid, cv=3))
best_clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=3, estimator=SGDClassifier(),
                              param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1],
                                          'eta0': [0.0001, 0.001, 0.01, 0.1],
                                          'learning_rate': ['optimal',
                                                            'constant',
                                                            'invscaling'],
                                          'loss': ['perceptron', 'hinge', 'log',
                                                   'squared_loss',
                                                   'epsilon_insensitive'],
                                          'penalty': ['l2', 'l1', 'none'],
                                          'random_state': [42]}))])

### Metricas en Entrenamiento

In [288]:
y_pred_train = best_clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred_train))
print('Precision Score: ', precision_score(y_train, y_pred_train))
print('F1 Score: ', f1_score(y_train, y_pred_train))
print('Recall Score: ', recall_score(y_train, y_pred_train))
print('fbeta_score: ', fbeta_score(y_train, y_pred_train ,average='binary', beta=0.5))

Accuracy Score:  0.7483221476510067
Precision Score:  0.8857142857142857
F1 Score:  0.45255474452554745
Recall Score:  0.30392156862745096
fbeta_score:  0.6404958677685951


### Metricas en Test

In [289]:
y_pred_test = best_clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))
print('fbeta_score: ', fbeta_score(y_test, y_pred_test,average='binary', beta=0.5))

Accuracy Score:  0.7223154362416108
Precision Score:  0.8721804511278195
F1 Score:  0.4120781527531084
Recall Score:  0.26976744186046514
fbeta_score:  0.602910602910603


El tema de este ultimo modelo es que no sabemos que hiperparametros adopto por el pipeline. Pero como sólo tenemos que cargar los datos de prediccion de test podria funcionar

-----------------------------------------------------------------------------------------------
# Decision Tree

## Decision Tree Vanilla

In [231]:
tree_def = DecisionTreeClassifier(random_state=42)
tree_def.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [292]:
y_pred = tree_def.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred, average='binary', beta=0.5))

Accuracy Score:  1.0
Precision Score:  1.0
F1 Score:  1.0
Recall Score:  1.0
fbeta_score:  1.0


In [293]:
y_pred_test = tree_def.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))
print('fbeta_score: ', fbeta_score(y_test, y_pred_test,average='binary', beta=0.5))

Accuracy Score:  0.7239932885906041
Precision Score:  0.6193853427895981
F1 Score:  0.6143024618991794
Recall Score:  0.6093023255813953
fbeta_score:  0.6173421300659754


En entrenamiento avanza tanto que divide todos los datos.

In [294]:
# Selección de parámetros
param_grid = {
    'criterion': ['entropy','gini'],
    'max_depth': [None,2,5,10,50],
    'min_samples_leaf' : [None, 1, 2,3,8,20,40,60]
}

best_tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
best_tree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [None, 2, 5, 10, 50],
                         'min_samples_leaf': [None, 1, 2, 3, 8, 20, 40, 60]})

In [296]:
y_pred = best_tree.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='binary', beta=0.5))

Accuracy Score:  0.8489932885906041
Precision Score:  0.8701298701298701
F1 Score:  0.7486033519553071
Recall Score:  0.6568627450980392
fbeta_score:  0.8170731707317072


In [297]:
print("El mejor score promedio cruzado validado",best_tree.best_score_)
print("-------------------------------------------------------------------------------------------------")
print("El mejor estimador es", best_tree.best_estimator_)
print("-------------------------------------------------------------------------------------------------")
print("Los parámetros que dieron los mejores resultados", best_tree.best_params_)

El mejor score promedio cruzado validado 0.8324293785310735
-------------------------------------------------------------------------------------------------
El mejor estimador es DecisionTreeClassifier(max_depth=5, min_samples_leaf=8)
-------------------------------------------------------------------------------------------------
Los parámetros que dieron los mejores resultados {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 8}


In [298]:
y_pred_test = best_tree.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))
print('fbeta_score: ', fbeta_score(y_test, y_pred_test,average='binary', beta=0.5))

Accuracy Score:  0.8137583892617449
Precision Score:  0.8312101910828026
F1 Score:  0.7016129032258065
Recall Score:  0.6069767441860465
fbeta_score:  0.7740213523131674


In [238]:
test_id = test["Customer"]
test_pred = best_tree.predict(X_test)

submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])
submission.to_csv("travel_insurance_submission.csv", header=True, index=False)

In [230]:
### Adaboost Regressor

In [197]:
from sklearn.ensemble import AdaBoostRegressor

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(X_train, y_train)
AdaBoostRegressor(n_estimators=100, random_state=0)

AdaBoostRegressor(n_estimators=100, random_state=0)

In [198]:
regr.score(X_train, y_train)

0.34352972634763

-----------------------------------------------------------------------------------------------
## SVR
El problema de este modelo es que arroja resultados continuos. Lo dejo por si encontramos que podemos modificar su salida con alguna funcion de activacion

In [23]:
from sklearn.svm import SVR

regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [24]:
y_pred = regr.predict(X_train)
y_pred

array([0.15395717, 0.28924264, 0.17146898, ..., 0.31427207, 0.7630869 ,
       0.15394908])

In [252]:
y_pred = regr.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
#print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='macro', beta=0.5))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

-----------------------------------------------------------------------------------------------
## C-Support Vector Classification.
Es un modelo que tardo mucho en entrenar. En pc local, aprox 2 minutos con el 20 % de datos de entrenamiento (lo reduje para ello). Adjunto comentado el codigo y por los resultados creo que no vale la pena. Luego lo corro (o si alguno le da paciencia) con el entrenamiento al 80 % como esta seteado. 

In [6]:
#from sklearn import svm
#clf = svm.SVC(kernel='linear', cache_size=7000)

In [7]:
#clf = clf.fit(X_train, y_train)

In [8]:
#y_pred = clf.predict(X_train)
#print('Accuracy Score: ', accuracy_score(y_train, y_pred))
#print('Precision Score: ', precision_score(y_train, y_pred))
#print('F1 Score: ', f1_score(y_train, y_pred))
#print('Recall Score: ', recall_score(y_train, y_pred))

Accuracy Score:  0.6812080536912751
Precision Score:  0.5346534653465347
F1 Score:  0.5320197044334977
Recall Score:  0.5294117647058824



<li> Accuracy Score:  0.6812080536912751
<li> Precision Score:  0.5346534653465347
<li> F1 Score:  0.5320197044334977
<li> Recall Score:  0.5294117647058824

## Support Vector Machine

In [299]:
from sklearn import svm
clf = svm.SVC(kernel="rbf", C=1, cache_size=2000, )
clf.fit(X_train, y_train)

SVC(C=1, cache_size=2000)

In [300]:
y_pred = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='macro', beta=0.5))

Accuracy Score:  0.7986577181208053
Precision Score:  0.9038461538461539
F1 Score:  0.6103896103896104
Recall Score:  0.46078431372549017
fbeta_score:  0.7836932750136686


In [301]:
#Este no me lo corrio. Deje la compu 1 hora y no funco. 
#parameters = {"kernel":("linear", "rbf"), "C":[1, 10]}
#svc = svm.SVC()
#clf = GridSearchCV(svc, parameters)
#clf.fit(X_train, y_train)

In [302]:
print("SVM con Kernel rbf, dio accuracy de 0,7986")
print("SVM con Kernel Linear, dio accuracy de 0,6812")
print("SVM con Kernel Poly, dio accuracy de 0,7885")
print("SVM con Kernel Sigmoid, dio accuracy de 0,4832")
print("SVM con Kernel rbf + gamma auto da accuracy de 1 en train, pero en test 0,63")

SVM con Kernel rbf, dio accuracy de 0,7986
SVM con Kernel Linear, dio accuracy de 0,6812
SVM con Kernel Poly, dio accuracy de 0,7885
SVM con Kernel Sigmoid, dio accuracy de 0,4832
SVM con Kernel rbf + gamma auto da accuracy de 1 en train, pero en test 0,63


In [304]:
y_pred = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='macro', beta=0.5))

Accuracy Score:  0.7827181208053692
Precision Score:  0.8976744186046511
F1 Score:  0.5984496124031008
Recall Score:  0.44883720930232557
fbeta_score:  0.7701766180303106


In [305]:
clf = make_pipeline(StandardScaler(), svm.SVC(kernel="poly", gamma="auto"))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', kernel='poly'))])

In [306]:
y_pred = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='macro', beta=0.5))

Accuracy Score:  0.8322147651006712
Precision Score:  0.9333333333333333
F1 Score:  0.6913580246913581
Recall Score:  0.5490196078431373
fbeta_score:  0.8274751920451535


In [308]:
y_pred = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='binary', beta=0.5))

Accuracy Score:  0.712248322147651
Precision Score:  0.7142857142857143
F1 Score:  0.45813586097946285
Recall Score:  0.3372093023255814
fbeta_score:  0.5837359098228664


## KMeans

In [309]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=1)
neigh.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=1)

In [310]:
y_pred = neigh.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='macro', beta=0.5))

Accuracy Score:  1.0
Precision Score:  1.0
F1 Score:  1.0
Recall Score:  1.0
fbeta_score:  1.0


In [311]:
y_pred = neigh.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='binary', beta=0.5))

Accuracy Score:  0.6761744966442953
Precision Score:  0.5462184873949579
F1 Score:  0.5739514348785871
Recall Score:  0.6046511627906976
fbeta_score:  0.5569837189374465


In [312]:
# Selección de parámetros
param_grid = {
    'weights': ['uniform','distance'],
    'algorithm': ["auto", "KDTree", 'ball_tree', 'brute'],
    'n_neighbors': [1]
}

best_knn = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
best_knn.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'algorithm': ['auto', 'KDTree', 'ball_tree', 'brute'],
                         'n_neighbors': [1],
                         'weights': ['uniform', 'distance']})

In [313]:
y_pred = best_knn.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred, average='binary', beta=0.5))

Accuracy Score:  1.0
Precision Score:  1.0
F1 Score:  1.0
Recall Score:  1.0
fbeta_score:  1.0


In [314]:
y_pred = best_knn.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='binary', beta=0.5))

Accuracy Score:  0.6761744966442953
Precision Score:  0.5462184873949579
F1 Score:  0.5739514348785871
Recall Score:  0.6046511627906976
fbeta_score:  0.5569837189374465


In [315]:
print("El mejor score promedio cruzado validado",best_knn.best_score_)
print("-------------------------------------------------------------------------------------------------")
print("El mejor estimador es", best_knn.best_estimator_)
print("-------------------------------------------------------------------------------------------------")
print("Los parámetros que dieron los mejores resultados", best_knn.best_params_)

El mejor score promedio cruzado validado -0.42907289851904273
-------------------------------------------------------------------------------------------------
El mejor estimador es KNeighborsRegressor(n_neighbors=1)
-------------------------------------------------------------------------------------------------
Los parámetros que dieron los mejores resultados {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'}


## Logistic Regression

In [316]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [318]:
y_pred = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred, average='binary', beta=0.5))

Accuracy Score:  0.62751677852349
Precision Score:  0.4482758620689655
F1 Score:  0.41269841269841273
Recall Score:  0.38235294117647056
fbeta_score:  0.4333333333333333


In [319]:
y_pred = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='binary', beta=0.5))

Accuracy Score:  0.6157718120805369
Precision Score:  0.45364238410596025
F1 Score:  0.3743169398907104
Recall Score:  0.3186046511627907
fbeta_score:  0.41819291819291815


## Random Forest

In [320]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [321]:
y_pred = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))
print('fbeta_score: ', fbeta_score(y_train, y_pred ,average='binary', beta=0.5))

Accuracy Score:  0.7986577181208053
Precision Score:  0.9038461538461539
F1 Score:  0.6103896103896104
Recall Score:  0.46078431372549017
fbeta_score:  0.7580645161290321


In [322]:
y_pred = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Precision Score: ', precision_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred))
print('Recall Score: ', recall_score(y_test, y_pred))
print('fbeta_score: ', fbeta_score(y_test, y_pred,average='binary', beta=0.5))

Accuracy Score:  0.7843959731543624
Precision Score:  0.909952606635071
F1 Score:  0.5990639625585022
Recall Score:  0.44651162790697674
fbeta_score:  0.7535321821036107


In [219]:
#customer = test["Customer"]
#test = test.drop(columns=["Customer"])

In [220]:
#y_final = clf.predict(test)

In [226]:
#test_id = test["Customer"]
#test_pred = clf.predict(test)
#test_pred["Customer"] = customer

In [323]:
test_id = test["Customer"]
test_pred = clf.predict(X_test)

submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])
submission.to_csv("travel_insurance_submission.csv", header=True, index=False)