In [65]:
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments and discards them. The
    notebook assigns this function to ``warnings.warn`` right after importing
    ``warnings``, so calls routed through that attribute print nothing.
    """
    return None
import warnings
warnings.warn = warn
from pathlib import Path
import ontology.Schema as sc
import qai.QAI_Manager as q
import pickle
import os
from nlp.NLP_Processor import NLP_Processor
from classifier.ML_Classifier import ML_Classifier
from nlp.NER_Trainer import NER_Trainer 
from datetime import datetime
import spacy
import nlp.Solr_Connection as solr_connection
import pandas as pd

from config import *  #Change configurations in file "config.py"


%matplotlib inline


In [4]:
def train_QAIs():
    """Load the ontology indexes, build the QAI manager and write the NER training set.

    Reads its configuration from notebook/module globals: ``ontology_path``,
    ``QAIs_path``, ``sparql_endpoint``, ``graph_name``, ``solr_host``,
    ``solr_port``, ``solr_core`` and ``path_train_NER_temp``. All of them must
    be defined before this function is called.

    Returns:
        The ``q.QAI_Manager`` instance built from ``QAIs_path`` and the
        ontology properties index.
    """
    schema = sc.getGraph(ontology_path)
    try:
        classes_index = sc.load_classes_index(schema)
        properties_index = sc.load_properties_index(schema)
    finally:
        # Release the graph even when one of the index loaders raises.
        schema.close()
    QAI_Manager = q.QAI_Manager(QAIs_path,properties_index)
    ner_trainer = NER_Trainer(QAI_Manager.QAIs,classes_index,sparql_endpoint,graph_name,solr_host,solr_port,solr_core)
    # Writes the NER training dataset under path_train_NER_temp.
    ner_trainer.make_train_dataset(savePath=path_train_NER_temp)
    return QAI_Manager

In [5]:
def pre_process_classifier(QAI_Manager, use_semantic_features=True, number_qp_samples=0):
    """Build the classifier training matrix from the QAIs held by ``QAI_Manager``.

    Loads the pickled NER labels from ``labels.sav`` inside the global
    ``path_train_NER_temp`` directory, creates an ``NLP_Processor`` from them
    (with ``startup_solr=False``), lets the manager compute its SVs with that
    processor and finally delegates to ``ML_Classifier.pre_process_data``.

    Args:
        QAI_Manager: manager object exposing ``compute_SVs`` and ``QAIs``.
        use_semantic_features: forwarded unchanged to ``pre_process_data``.
        number_qp_samples: forwarded unchanged to ``pre_process_data``.

    Returns:
        The ``(X, y)`` pair produced by ``ML_Classifier.pre_process_data``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a labels file
    # that this project wrote itself.
    with open(os.path.join(path_train_NER_temp, "labels.sav"), "rb") as handle:
        ner_labels = pickle.load(handle)

    processor = NLP_Processor(ner_labels, nlp_model_load, startup_solr=False)
    QAI_Manager.compute_SVs(processor)

    features, targets = ML_Classifier.pre_process_data(
        QAI_Manager.QAIs,
        processor,
        use_semantic_features,
        number_qp_samples,
    )
    return features, targets


In [8]:
def train_classifier(X, y, model=None):
    """Wrap ``model`` in an ``ML_Classifier`` and fit it on ``X``/``y``.

    The wrapper is always created with ``model_path="persistence/classifier"``
    and ``model_file="ml_classifier.sav"``.

    Args:
        X: feature data passed straight to ``ML_Classifier.fit``.
        y: target data passed straight to ``ML_Classifier.fit``.
        model: estimator handed to ``ML_Classifier``; ``None`` is forwarded
            as-is.

    Returns:
        A ``(classifier, fit_result)`` tuple, where ``fit_result`` is whatever
        ``ML_Classifier.fit`` returns (presumably the elapsed training time,
        since the notebook stores it in the "Time" column; confirm in
        ML_Classifier).
    """
    wrapper = ML_Classifier(
        model_path="persistence/classifier",
        model_file="ml_classifier.sav",
        model=model,
    )
    elapsed = wrapper.fit(X, y)
    return wrapper, elapsed

In [7]:
# train_QAIs() reads QAIs_path as a global, so it has to be bound before the call.
QAIs_path = "input/medibot/MediBot.json"
# Builds the QAI manager and writes the NER training dataset as a side effect.
QAI_Manager = train_QAIs()
# X, y are reused by every model cell below (semantic features on; number_qp_samples = 0).
X,y = pre_process_classifier(QAI_Manager,use_semantic_features = True,number_qp_samples = 0)


Creating train dataset
Created train dataset
Using semantic features True. QPs limited False


In [129]:
# Accumulator for the model comparison: each model cell appends one
# [classifier label, precision, recall, F1, fit time] row.
results = list()

In [130]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default hyper-parameters.
model = GaussianNB()

# Fit through the shared helper; t is the value returned by ML_Classifier.fit.
clf,t = train_classifier(X,y,model = model)
# eval_model returns three scores, stored as precision, recall and F1.
p,r,f = clf.eval_model(X,y)
# One row per model: [label, precision, recall, F1, time].
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.003659


Score evaluations using cross validation with cv = 5:
Precision = 0.9833333333333332
Recall = 0.975
F-1 Score = 0.9733333333333334


In [131]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes with default hyper-parameters.
model = BernoulliNB()

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.005652


Score evaluations using cross validation with cv = 5:
Precision = 0.6641666666666666
Recall = 0.6875
F-1 Score = 0.6451190476190476


In [132]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression (lbfgs solver), seeded for reproducibility.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument;
# confirm against the installed version before upgrading.
model = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial',n_jobs=-1)

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.773955


Score evaluations using cross validation with cv = 5:
Precision = 0.9583333333333333
Recall = 0.9375
F-1 Score = 0.9333333333333333


In [133]:
from sklearn.svm import SVC
# SVC with its default kernel and gamma='auto'.
model = SVC(gamma='auto')

clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The " auto" suffix keeps this row distinguishable from the linear-kernel SVC row.
results.append([type(clf.model).__name__+" auto",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.008861


Score evaluations using cross validation with cv = 5:
Precision = 0.7491666666666666
Recall = 0.775
F-1 Score = 0.7319047619047618


In [134]:
from sklearn.svm import SVC
# SVC with a linear kernel and C=1.
model = SVC(C=1, kernel="linear")


clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The " linear" suffix keeps this row distinguishable from the gamma='auto' SVC row.
results.append([type(clf.model).__name__+" linear",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.006610


Score evaluations using cross validation with cv = 5:
Precision = 0.9833333333333332
Recall = 0.975
F-1 Score = 0.9733333333333334


In [135]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree, seeded so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=0)

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.006644


Score evaluations using cross validation with cv = 5:
Precision = 0.8583333333333332
Recall = 0.875
F-1 Score = 0.8608333333333332


In [136]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters. random_state=0 pins the
# bootstrap/feature sampling so the recorded scores are reproducible,
# matching the seeded LogisticRegression and DecisionTree cells.
model = RandomForestClassifier(random_state=0)

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.019199


Score evaluations using cross validation with cv = 5:
Precision = 0.7275
Recall = 0.7875
F-1 Score = 0.7383333333333333




In [137]:
# Import from the public package: the private module path
# `sklearn.neighbors.nearest_centroid` no longer exists in newer scikit-learn releases.
from sklearn.neighbors import NearestCentroid
# Nearest-centroid classifier with default hyper-parameters.
model = NearestCentroid()

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.007120


Score evaluations using cross validation with cv = 5:
Precision = 0.8558333333333332
Recall = 0.8375
F-1 Score = 0.8226190476190476


In [138]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with the default single hidden layer. random_state=0
# pins weight initialisation so the recorded scores are reproducible.
model = MLPClassifier(random_state=0)

clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The " 1 layer" suffix keeps this row distinguishable from the two-layer MLP row.
results.append([type(clf.model).__name__+" 1 layer",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.195720


Score evaluations using cross validation with cv = 5:
Precision = 0.9666666666666666
Recall = 0.95
F-1 Score = 0.96


In [139]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with two hidden layers of 100 units each. random_state=0
# pins weight initialisation so the recorded scores are reproducible.
model = MLPClassifier(hidden_layer_sizes=(100,100), random_state=0)

clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The " 2 layers" suffix keeps this row distinguishable from the single-layer MLP row.
results.append([type(clf.model).__name__+" 2 layers",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.178302


Score evaluations using cross validation with cv = 5:
Precision = 0.9833333333333332
Recall = 0.9625
F-1 Score = 0.9600000000000002


In [140]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with its default base learner. random_state=0 makes repeated runs
# comparable, matching the other seeded model cells.
model = AdaBoostClassifier(random_state=0)

# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.096929


Score evaluations using cross validation with cv = 5:
Precision = 0.2966666666666667
Recall = 0.425
F-1 Score = 0.2857142857142857


In [141]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
# AdaBoost over 100-tree random forests. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` and the
# old name was later removed; it is the first positional parameter under both
# names. Both estimators are seeded so the recorded scores are reproducible.
model = AdaBoostClassifier(RandomForestClassifier(n_estimators=100, random_state=0),n_estimators=100,learning_rate=0.5,random_state=0)

clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The " (RandomForest)" suffix keeps this row distinguishable from the default AdaBoost row.
results.append([type(clf.model).__name__+" (RandomForest)",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.096714


Score evaluations using cross validation with cv = 5:
Precision = 0.8833333333333332
Recall = 0.8875
F-1 Score = 0.8683333333333334


In [142]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default hyper-parameters.
model = KNeighborsClassifier()


# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.005864


Score evaluations using cross validation with cv = 5:
Precision = 0.7929166666666667
Recall = 0.775
F-1 Score = 0.7577380952380952


In [143]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours restricted to the single closest neighbour.
model = KNeighborsClassifier(n_neighbors=1)


clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# A fixed label is used so this row does not collide with the default KNeighborsClassifier row.
results.append(['Nearest Neighbor',p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.006308


Score evaluations using cross validation with cv = 5:
Precision = 0.8791666666666667
Recall = 0.875
F-1 Score = 0.86


In [144]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Quadratic discriminant analysis with default hyper-parameters.
model = QuadraticDiscriminantAnalysis()


# Same fit / evaluate / record sequence as the other model cells.
clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
results.append([type(clf.model).__name__,p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.005941


Score evaluations using cross validation with cv = 5:
Precision = 0.41380952380952374
Recall = 0.4375
F-1 Score = 0.3878968253968254


In [145]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  VotingClassifier
# Base estimators for the ensemble, configured like the stand-alone cells earlier in the notebook.
gaussian = GaussianNB()
linear = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial',n_jobs=-1)

# Soft-voting ensemble over the two base estimators.
model = VotingClassifier(estimators=[('gaus', gaussian), ('line', linear)],voting='soft',n_jobs=-1)


clf,t = train_classifier(X,y,model = model)
p,r,f = clf.eval_model(X,y)
# The label spells out the voting mode and the member estimators.
results.append(["Soft "+type(clf.model).__name__+" (GaussianNB + LogisticRegression)",p,r,f,t])

Using customized model
Training classifier done! Elapsed time: 0:00:00.138614


Score evaluations using cross validation with cv = 5:
Precision = 0.9833333333333332
Recall = 0.975
F-1 Score = 0.9733333333333334


In [149]:
# Collect every recorded run into one table.
evaluations = pd.DataFrame(results,columns=['Classifier','Precision','Recall','F1','Time'])
# `results` is append-only, so re-running a model cell adds a second row for the
# same classifier. Keep only the most recent row per label so the ranking never
# lists one model twice; with no duplicates this is a no-op.
evaluations = evaluations.drop_duplicates(subset='Classifier', keep='last')
# Rank by F1 (best first) and break ties with the shorter time.
top_5 = evaluations.sort_values(['F1','Time'],ascending=[False,True]).head(5)
# Persist the ranking next to the notebook, without the index column.
top_5.to_csv("top5_evaluations_semantic.csv", encoding='utf-8', index=False)
top_5

Unnamed: 0,Classifier,Precision,Recall,F1,Time
0,GaussianNB,0.983333,0.975,0.973333,00:00:00.003659
4,SVC linear,0.983333,0.975,0.973333,00:00:00.006610
15,Soft VotingClassifier (GaussianNB + LogisticRe...,0.983333,0.975,0.973333,00:00:00.138614
9,MLPClassifier 2 layers,0.983333,0.9625,0.96,00:00:00.178302
8,MLPClassifier 1 layer,0.966667,0.95,0.96,00:00:00.195720
