# Naive Bayesian Classifier

In [27]:
import pandas as pd
import random as rd
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

## Loading Dataset

In [5]:
input_header = {"PROC_TRACEINFO" : "id",
                "OP070_V_1_angle_value": "angle_1",
                "OP090_SnapRingPeakForce_value" : "snap_ring_peak_force",
                "OP070_V_2_angle_value" : "angle_2",
                "OP120_Rodage_I_mesure_value" : "rodage_i",
                "OP090_SnapRingFinalStroke_value" : "snap_ring_final_stroke",
                "OP110_Vissage_M8_torque_value" : "vissage_m8_torque",
                "OP100_Capuchon_insertion_mesure" : "capuchon_insertion",
                "OP120_Rodage_U_mesure_value" : "rodage_u",
                "OP070_V_1_torque_value" : "torque_1",
                "OP090_StartLinePeakForce_value" : "start_line_peak_force",
                "OP110_Vissage_M8_angle_value" : "vissage_m8_angle",
                "OP090_SnapRingMidPointForce_val" : "snap_ring_midpoint_force",
                "OP070_V_2_torque_value" : "torque_2"}
output_header = {"PROC_TRACEINFO" : "id",
                 "Binar OP130_Resultat_Global_v" : "result"}

train_input = pd.read_csv("../data/train_inputs.csv", header=0).rename(columns=input_header)
train_output = pd.read_csv("../data/train_output.csv", header=0).rename(columns=output_header)

## Preparing Dataset

In [17]:
train_input_ = train_input[train_input.columns[~train_input.columns.isin(["id", "capuchon_insertion"])]]
X_train, X_test, y_train, y_test = train_test_split(train_input_, train_output["result"], test_size = 0.3, random_state = 123)

## Model Generation

In [18]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

## Model Evaluation

In [22]:
y_pred = gnb.predict(X_test)

metrics.accuracy_score(y_test, y_pred)

0.9908256880733946

## Balancing Classes

### Remove Individuals

In [28]:
# Copy dataset
train_input_remove = train_input.copy()
train_output_remove = train_output.copy()

# Select relevant index
defect_index = train_output.index[train_output["result"] == 1].tolist()
valid_index = train_output.index[train_output["result"] == 0].tolist()
rd.shuffle(valid_index) # Shuffle in order to eliminated "production correlation"

# Remove 
train_input_remove = train_input_remove.iloc[valid_index[len(defect_index):] + defect_index,:]
train_output_remove = train_output_remove.iloc[valid_index[len(defect_index):] + defect_index,:]

train_input_ = train_input_remove[train_input_remove.columns[~train_input_remove.columns.isin(["id", "capuchon_insertion"])]]
X_train, X_test, y_train, y_test = train_test_split(train_input_, train_output_remove["result"], test_size = 0.3, random_state = 123)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.990451135145669

## Cross Validation

In [26]:
train_input_ = train_input[train_input.columns[~train_input.columns.isin(["id", "capuchon_insertion"])]]

gnb_cross = GaussianNB()

scores = cross_val_score(gnb_cross, train_input_, train_output["result"], cv = 5)
scores

array([0.99087353, 0.99116326, 0.9910184 , 0.9910184 , 0.99116326])

## Balance Classes

## Tests

In [3]:
test = datasets.load_wine()
type(test)

sklearn.utils.Bunch