# NaiveBayes

In [103]:
import numpy as np
import pandas as pd
from plotnine import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics as mt

def procesar_csv(df):
    from sklearn import preprocessing
    # convertirmos las columnas origin y destination a variables numericas 
    le = preprocessing.LabelEncoder()
    origin_as_int = le.fit_transform(df['origin'])
    destination_as_int = le.fit_transform(df["destination"])
    
    df["origin_int"] = origin_as_int
    df.drop('origin', axis=1, inplace=True)
    
    df["destination_int"] = origin_as_int
    df.drop('destination', axis=1, inplace=True)
    
    # Cambiamos el formato del tiempo a unix time stamp para medir la distancia de forma mas facil.
    df["date_departure_int"] = np.log10(pd.to_datetime(df["date"].map(str)+" "+df["departure_time"]).apply(lambda x: x.value))
    df.drop('departure_time', axis=1, inplace=True)
    df.drop('date', axis=1, inplace=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    # retornamos el data frame modificado sin valore nulos.
    return df.dropna()

In [104]:
# Leemos los datos
df = pd.read_csv("ALUMNOS-trainData.csv")
# Procesamos los datos con la funcion procesar_csv()
df = procesar_csv(df)
# transformamos los datos a binarios
df["Labels"] = np.where(df["noshow"] >= 4, 1,0)
# Definimos nuestra columna target
target = df['Labels']
# La eliminamos del dataframe
df.drop('noshow', axis=1, inplace=True)


#Columnas despues de la limpieza
df.columns



Index(['fligth_number', 'distance', 'denied_boarding', 'pax_midlow',
       'pax_high', 'pax_midhigh', 'pax_low', 'pax_freqflyer', 'group_bookings',
       'out_of_stock', 'dom_cnx', 'int_cnx', 'p2p', 'capacity', 'revenues_usd',
       'bookings', 'origin_int', 'destination_int', 'date_departure_int',
       'Labels'],
      dtype='object')

In [106]:
df

Unnamed: 0,fligth_number,distance,denied_boarding,pax_midlow,pax_high,pax_midhigh,pax_low,pax_freqflyer,group_bookings,out_of_stock,dom_cnx,int_cnx,p2p,capacity,revenues_usd,bookings,origin_int,destination_int,date_departure_int,Labels
0,8995,1394,0,94,2,28,94,22,19,0,1,0,239,174.0,10631.1,240,12,12,18.099599,1
1,7061,489,0,100,4,4,87,25,0,0,19,9,192,174.0,5453.7,220,79,79,18.094508,0
2,8972,2180,0,7,1,5,64,2,0,0,0,10,69,218.0,16161.0,79,122,122,18.115589,1
3,6039,524,0,54,4,7,21,4,0,0,26,0,64,174.0,10828.7,90,27,27,18.102437,1
4,7713,1170,0,21,88,18,82,0,88,1,23,3,183,144.0,16721.6,209,51,51,18.099124,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999885,6024,552,0,25,12,20,4,12,0,0,64,9,0,144.0,14102.1,73,143,143,18.098888,1
999886,6669,3934,0,129,3,19,121,2,20,0,0,118,156,168.0,32907.8,274,79,79,18.093891,1
999887,9094,1430,0,2,1,0,108,20,36,0,7,0,124,218.0,13519.3,131,62,62,18.114587,1
999888,7704,1091,0,67,2,7,42,16,0,0,34,2,98,144.0,8729.6,134,53,53,18.093013,0


In [107]:
#Cree un objeto de KFold cross validation

kf = KFold(n_splits = 10, shuffle = True, random_state = 0)

#Cree arreglos para guardar los errores de cada fold (tanto para entrenamiento como test).

errorTrain = np.zeros(10)
errorTest = np.zeros(10)

#Cree el objeto de Naive Bayes que estime correspondiente, con el alpha deseado

NB = GaussianNB()
i=0
#Aplique K-fold cross validation
for train_index, test_index in kf.split(df):#Complete
    #extraiga los datos de entrenamiento y test
    trainData = df.iloc[train_index, :]
    testData = df.iloc[test_index, :]
    
    trainLabel = target.iloc[train_index]
    testLabel = target.iloc[test_index]

    #Entrene el modelo usando los datos de entrenamiento
    NB = NB.fit(trainData, trainLabel)

    #Prediga los valores de TrainData y calcule su error
    prediccion = NB.predict(trainData)
    errorTrain[i] = mt.f1_score(trainLabel, prediccion, pos_label = 1)

    #Prediga los valores de testData y calcule su error
    prediccion = NB.predict(testData)
    errorTest[i] = mt.f1_score(testLabel, prediccion, pos_label = 1)
    i+=1
    print("Numero iteración: ", i) 

Numero iteración:  1
Numero iteración:  2
Numero iteración:  3
Numero iteración:  4
Numero iteración:  5
Numero iteración:  6
Numero iteración:  7
Numero iteración:  8
Numero iteración:  9
Numero iteración:  10


In [108]:
#Muestre la media y desviación estandar de cada error 
print("F1-score modelo para training: ", errorTrain.mean(), "+-", errorTrain.std())   
print("F1-score modelo para test: ", errorTest.mean(), "+-", errorTest.std())  

F1-score modelo para training:  0.7970856990939426 +- 0.04851168147487114
F1-score modelo para test:  0.7970816597471932 +- 0.04870229960591916
