In [2]:
#Data
import numpy as np
import pandas as pd

# Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Metricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

# Plot
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Carga de Datos y Filtrado de Datos

In [3]:
train = pd.read_csv("travel_insurance_prediction_train.csv")
test = pd.read_csv("travel_insurance_prediction_test.csv")

train["Employment Type"] = train["Employment Type"].replace(["Private Sector/Self Employed"], 0)
train["Employment Type"] = train["Employment Type"].replace(["Government Sector"], 1)
test["Employment Type"] = test["Employment Type"].replace(["Private Sector/Self Employed"], 0)
test["Employment Type"] = test["Employment Type"].replace(["Government Sector"], 1)

train["GraduateOrNot"] = train["GraduateOrNot"].replace(["Yes"], 0)
train["GraduateOrNot"] = train["GraduateOrNot"].replace(["No"], 1)
test["GraduateOrNot"] = test["GraduateOrNot"].replace(["Yes"], 0)
test["GraduateOrNot"] = test["GraduateOrNot"].replace(["No"], 1)

train["FrequentFlyer"] = train["FrequentFlyer"].replace(["Yes"], 0)
train["FrequentFlyer"] = train["FrequentFlyer"].replace(["No"], 1)
test["FrequentFlyer"] = test["FrequentFlyer"].replace(["Yes"], 0)
test["FrequentFlyer"] = test["FrequentFlyer"].replace(["No"], 1)

train["EverTravelledAbroad"] = train["EverTravelledAbroad"].replace(["Yes"], 0)
train["EverTravelledAbroad"] = train["EverTravelledAbroad"].replace(["No"], 1)
test["EverTravelledAbroad"] = test["EverTravelledAbroad"].replace(["Yes"], 0)
test["EverTravelledAbroad"] = test["EverTravelledAbroad"].replace(["No"], 1)

train.drop(columns=["Customer"])

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,33,0,0,550000,6,0,1,1,1
1,28,0,0,800000,7,0,0,1,0
2,31,0,0,1250000,4,0,1,1,0
3,31,1,1,300000,7,0,1,1,0
4,28,0,0,1250000,3,0,1,1,0
...,...,...,...,...,...,...,...,...,...
1485,31,1,1,300000,5,0,1,1,0
1486,31,0,0,950000,3,0,0,1,0
1487,28,0,0,1250000,5,0,1,1,0
1488,31,1,0,1300000,5,0,1,1,0


In [4]:
X, y = train.iloc[:, 0:9], train.TravelInsurance

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

In [6]:
X_train.shape
X_test.shape

(298, 9)

In [7]:
X_train["Employment Type"].value_counts()

0    830
1    362
Name: Employment Type, dtype: int64

-----------------------------------------------------------------------------------------------
# SGD Classifier

In [8]:
clf = make_pipeline(StandardScaler(),
                   SGDClassifier(random_state=42))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier(random_state=42))])

### Metricas en Entrenamiento

In [9]:
y_pred_train = clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred_train))
print('Precision Score: ', precision_score(y_train, y_pred_train))
print('F1 Score: ', f1_score(y_train, y_pred_train))
print('Recall Score: ', recall_score(y_train, y_pred_train))

Accuracy Score:  0.7307046979865772
Precision Score:  0.631578947368421
F1 Score:  0.610909090909091
Recall Score:  0.5915492957746479


### Metricas en Test

In [10]:
y_pred_test = clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))

Accuracy Score:  0.7315436241610739
Precision Score:  0.6477272727272727
F1 Score:  0.5876288659793815
Recall Score:  0.5377358490566038


Habria que analizar porque esta dando una accuracy mejor en test que en el mismo entrenamiento. Puede ser por los pocos datos que tenemos?

In [11]:
# Selección de parámetros
param_grid = {
    'loss': ['perceptron','hinge','log','squared_loss','epsilon_insensitive'],
    'penalty' : ['l2', 'l1', 'none'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1],
    'random_state': [42],
    'learning_rate': ['optimal','constant','invscaling'],
    'eta0': [0.0001, 0.001, 0.01, 0.1]
}

best_clf = make_pipeline(StandardScaler(), GridSearchCV(SGDClassifier(), param_grid, cv=4))
best_clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=4, estimator=SGDClassifier(),
                              param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1],
                                          'eta0': [0.0001, 0.001, 0.01, 0.1],
                                          'learning_rate': ['optimal',
                                                            'constant',
                                                            'invscaling'],
                                          'loss': ['perceptron', 'hinge', 'log',
                                                   'squared_loss',
                                                   'epsilon_insensitive'],
                                          'penalty': ['l2', 'l1', 'none'],
                                          'random_state': [42]}))])

### Metricas en Entrenamiento

In [12]:
y_pred_train = best_clf.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred_train))
print('Precision Score: ', precision_score(y_train, y_pred_train))
print('F1 Score: ', f1_score(y_train, y_pred_train))
print('Recall Score: ', recall_score(y_train, y_pred_train))

Accuracy Score:  0.7734899328859061
Precision Score:  0.7910447761194029
F1 Score:  0.6109510086455332
Recall Score:  0.49765258215962443


### Metricas en Test

In [13]:
y_pred_test = best_clf.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))

Accuracy Score:  0.761744966442953
Precision Score:  0.7868852459016393
F1 Score:  0.5748502994011977
Recall Score:  0.4528301886792453


El tema de este ultimo modelo es que no sabemos que hiperparametros adopto por el pipeline. Pero como sólo tenemos que cargar los datos de prediccion de test podria funcionar

-----------------------------------------------------------------------------------------------
# Decision Tree

## Decision Tree Vanilla

In [14]:
tree_def = DecisionTreeClassifier(random_state=42)
tree_def.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [15]:
y_pred = tree_def.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))

Accuracy Score:  1.0
Precision Score:  1.0
F1 Score:  1.0
Recall Score:  1.0


In [16]:
y_pred_test = tree_def.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))

Accuracy Score:  0.7651006711409396
Precision Score:  0.6666666666666666
F1 Score:  0.6728971962616822
Recall Score:  0.6792452830188679


En entrenamiento avanza tanto que divide todos los datos. En test da Accuracy de 0,76

In [17]:
# Selección de parámetros
param_grid = {
    'criterion': ['entropy','gini'],
    'max_depth': [None,2,5,10,50],
    'min_samples_leaf' : [1, 2,3,8,20,40,60]
}

best_tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
best_tree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [None, 2, 5, 10, 50],
                         'min_samples_leaf': [1, 2, 3, 8, 20, 40, 60]})

In [18]:
y_pred = best_tree.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))

Accuracy Score:  0.8355704697986577
Precision Score:  0.8938356164383562
F1 Score:  0.7270194986072424
Recall Score:  0.6126760563380281


In [19]:
print("El mejor score promedio cruzado validado",best_tree.best_score_)
print("-------------------------------------------------------------------------------------------------")
print("El mejor estimador es", best_tree.best_estimator_)
print("-------------------------------------------------------------------------------------------------")
print("Los parámetros que dieron los mejores resultados", best_tree.best_params_)
print

El mejor score promedio cruzado validado 0.8288632607854858
-------------------------------------------------------------------------------------------------
El mejor estimador es DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=3)
-------------------------------------------------------------------------------------------------
Los parámetros que dieron los mejores resultados {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 3}


<function print>

In [20]:
y_pred_test = best_tree.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred_test))
print('Precision Score: ', precision_score(y_test, y_pred_test))
print('F1 Score: ', f1_score(y_test, y_pred_test))
print('Recall Score: ', recall_score(y_test, y_pred_test))

Accuracy Score:  0.8489932885906041
Precision Score:  0.9692307692307692
F1 Score:  0.736842105263158
Recall Score:  0.5943396226415094


Aqui tenemos el mejor resultado que ha arrojado un modelo. 

In [21]:
from sklearn.ensemble import AdaBoostRegressor

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(X_train, y_train)
AdaBoostRegressor(n_estimators=100, random_state=0)

AdaBoostRegressor(n_estimators=100, random_state=0)

In [22]:
regr.score(X_train, y_train)

0.38664218991562294

-----------------------------------------------------------------------------------------------
## SVR
El problema de este modelo es que arroja resultados continuos. Lo dejo por si encontramos que podemos modificar su salida con alguna funcion de activacion

In [23]:
from sklearn.svm import SVR

regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [24]:
y_pred = regr.predict(X_train)
y_pred

array([0.15395717, 0.28924264, 0.17146898, ..., 0.31427207, 0.7630869 ,
       0.15394908])

In [25]:
y_pred = regr.predict(X_train)
print('Accuracy Score: ', accuracy_score(y_train, y_pred))
print('Precision Score: ', precision_score(y_train, y_pred))
print('F1 Score: ', f1_score(y_train, y_pred))
print('Recall Score: ', recall_score(y_train, y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

-----------------------------------------------------------------------------------------------
## C-Support Vector Classification.
Es un modelo que tardo mucho en entrenar. En pc local, aprox 2 minutos con el 20 % de datos de entrenamiento (lo reduje para ello). Adjunto comentado el codigo y por los resultados creo que no vale la pena. Luego lo corro (o si alguno le da paciencia) con el entrenamiento al 80 % como esta seteado. 

In [6]:
#from sklearn import svm
#clf = svm.SVC(kernel='linear', cache_size=7000)

In [7]:
#clf = clf.fit(X_train, y_train)

In [8]:
#y_pred = clf.predict(X_train)
#print('Accuracy Score: ', accuracy_score(y_train, y_pred))
#print('Precision Score: ', precision_score(y_train, y_pred))
#print('F1 Score: ', f1_score(y_train, y_pred))
#print('Recall Score: ', recall_score(y_train, y_pred))

Accuracy Score:  0.6812080536912751
Precision Score:  0.5346534653465347
F1 Score:  0.5320197044334977
Recall Score:  0.5294117647058824



<li> Accuracy Score:  0.6812080536912751
<li> Precision Score:  0.5346534653465347
<li> F1 Score:  0.5320197044334977
<li> Recall Score:  0.5294117647058824