# Naive Bayes Classifier - Training and Validation

## Imports

In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, matthews_corrcoef
import pandas as pd
import numpy as np
import time as t
import statistics


## Reading Datasets

In [2]:
# Characters removed from the end of every question: newline, tab, '?' and '.'
TRAILING_CHARS = '\n\t?.'


def clean_questions(questions):
    """Strip trailing newlines, tabs, question marks and periods from each question.

    A single ``rstrip`` over the whole character set removes any trailing mix of
    these characters regardless of their order (e.g. "?." or "\\n\\t"), which
    four separate single-character passes would only partly remove.

    Parameters
    ----------
    questions : pd.Series
        Series of question strings.

    Returns
    -------
    pd.Series
        New Series with the trailing characters removed.

    Raises
    ------
    ValueError
        If the Series contains missing values, which cannot be cleaned as text.
    """
    if questions.isna().any():
        raise ValueError("question_pt contains missing values; remove them before cleaning")
    return questions.str.rstrip(TRAILING_CHARS)


# Loading the MSMARCO Dataset (only the question column is kept)
MSMARCO = pd.read_csv("MSMARCO_biomedical_PT/train.csv")[["question_pt"]]

# Cleaning MSMARCO Dataset
MSMARCO['question_pt'] = clean_questions(MSMARCO['question_pt'])


# Loading the MIMIC Dataset (dev and train splits are merged into one frame)
MIMIC_dev = pd.read_csv("MIMICSQL_PT/dev.csv")[["question_pt"]]
MIMIC_train = pd.read_csv("MIMICSQL_PT/train.csv")[["question_pt"]]

MIMIC = pd.concat([MIMIC_dev, MIMIC_train], ignore_index=True)

# Cleaning MIMIC Dataset
MIMIC['question_pt'] = clean_questions(MIMIC['question_pt'])


# Loading the SPIDER Dataset
# A multi-character separator is only supported by the python parser engine;
# requesting it explicitly avoids the ParserWarning raised on automatic fallback.
SPIDER_med = pd.read_csv("SPIDER/medicine_enzyme_interaction.txt", sep=";;", header=None, engine="python")
SPIDER_protein = pd.read_csv("SPIDER/protein_institute.txt", sep=";;", header=None, engine="python")
SPIDER_scientist = pd.read_csv("SPIDER/scientist_1.txt", sep=";;", header=None, engine="python")

SPIDER = pd.concat([SPIDER_med, SPIDER_protein, SPIDER_scientist], ignore_index=True)
# The files have no header row, so name the single column like the other datasets
SPIDER.columns = ["question_pt"]

# Cleaning SPIDER Dataset
SPIDER['question_pt'] = clean_questions(SPIDER['question_pt'])


  return func(*args, **kwargs)


## Joining Datasets

In [3]:
# Join datasets

# Label every question with its class: MSMARCO rows are type 1,
# MIMIC and SPIDER rows are type 0.
MSMARCO["type"] = 1
MIMIC["type"] = 0
SPIDER["type"] = 0

# All type-0 questions in a single frame
tp0 = pd.concat([MIMIC, SPIDER], ignore_index=True)

# Keep at most as many MSMARCO rows as there are type-0 rows, so class 1 is
# never larger than class 0. len() gives the row count directly and avoids
# positional indexing into the label-indexed Series returned by count().
df = pd.concat([MSMARCO.head(len(tp0)), tp0], ignore_index=True)



## Cross Validation Training

In [4]:
# Start the wall-clock timer for the whole cross-validation run
tic = t.time()

# Stratified 10-fold splitter; shuffling is seeded so the folds are repeatable
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One list of per-fold scores for each metric
results_acc, results_mcc = [], []
results_f1_weighted, results_f1_micro, results_f1_macro = [], [], []

for train_index, val_index in skf.split(X=df['question_pt'], y=df['type']):
    # Row subsets of the joined frame for this fold
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # A fresh TF-IDF + multinomial Naive Bayes pipeline, fitted on the training rows only
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(train_df['question_pt'].values, train_df['type'].values)

    # Predict the held-out rows and score the predictions against their labels
    y_true = val_df['type'].values
    preds = model.predict(val_df['question_pt'].tolist())

    acc = accuracy_score(y_true, preds)
    mcc = matthews_corrcoef(y_true, preds)
    f1_weighted = f1_score(y_true, preds, average='weighted')
    f1_micro = f1_score(y_true, preds, average='micro')
    f1_macro = f1_score(y_true, preds, average='macro')

    # Record this fold's scores
    for bucket, score in (
        (results_acc, acc),
        (results_mcc, mcc),
        (results_f1_weighted, f1_weighted),
        (results_f1_micro, f1_micro),
        (results_f1_macro, f1_macro),
    ):
        bucket.append(score)

    # Progress report: every score collected so far, then the running averages
    print('\n################# RESULTS #################')
    print('\nAccumulated data:')
    for label, scores in (
        ('>> results_acc', results_acc),
        ('>> results_mcc', results_mcc),
        ('>> results_f1_weighted', results_f1_weighted),
        ('>> results_f1_micro', results_f1_micro),
        ('>> results_f1_macro', results_f1_macro),
    ):
        print(label, scores)

    print('\nAVGs:')
    for label, scores in (
        ('>> avg_acc:', results_acc),
        ('>> avg_mcc:', results_mcc),
        ('>> avg_f1_weighted:', results_f1_weighted),
        ('>> avg_f1_micro:', results_f1_micro),
        ('>> avg_f1_macro:', results_f1_macro),
    ):
        print(label, round(sum(scores) / len(scores), 3))


# Report the total run time in minutes
tac = t.time()
duration = round((tac - tic) / 60, 2)
print('\n>> Elapsed time: {}min\n'.format(duration))


################# RESULTS #################

Accumulated data:
>> results_acc [0.9802523313219967]
>> results_mcc [0.9610260026179218]
>> results_f1_weighted [0.9802471603562175]
>> results_f1_micro [0.9802523313219967]
>> results_f1_macro [0.9802473327217435]

AVGs:
>> avg_acc: 0.98
>> avg_mcc: 0.961
>> avg_f1_weighted: 0.98
>> avg_f1_micro: 0.98
>> avg_f1_macro: 0.98

################# RESULTS #################

Accumulated data:
>> results_acc [0.9802523313219967, 0.984092155787164]
>> results_mcc [0.9610260026179218, 0.9686098439893737]
>> results_f1_weighted [0.9802471603562175, 0.9840887948263183]
>> results_f1_micro [0.9802523313219967, 0.984092155787164]
>> results_f1_macro [0.9802473327217435, 0.9840889193063496]

AVGs:
>> avg_acc: 0.982
>> avg_mcc: 0.965
>> avg_f1_weighted: 0.982
>> avg_f1_micro: 0.982
>> avg_f1_macro: 0.982

################# RESULTS #################

Accumulated data:
>> results_acc [0.9802523313219967, 0.984092155787164, 0.9808008776741635]
>> results_mc

## Check final results

In [5]:
print('Final results with std')

# Metric labels and their per-fold score lists, kept in matching order
metrics = ['acc', 'mcc', 'f1_weighted', 'f1_micro', 'f1_macro']
results = [
    results_acc,
    results_mcc,
    results_f1_weighted,
    results_f1_micro,
    results_f1_macro,
]

# One line per metric: mean and sample standard deviation across folds, to 3 decimals
for position, label in enumerate(metrics):
    fold_scores = results[position]
    mean_score = round(statistics.mean(fold_scores), 3)
    spread = round(statistics.stdev(fold_scores), 3)
    print('>> {}: {} +/- {}'.format(label, mean_score, spread))

Final results with std
>> acc: 0.982 +/- 0.003
>> mcc: 0.964 +/- 0.006
>> f1_weighted: 0.982 +/- 0.003
>> f1_micro: 0.982 +/- 0.003
>> f1_macro: 0.982 +/- 0.003
