# Summary

- A Multinomial Naive Bayes classifier is trained on TF-IDF features extracted from the abstracts
- Experiment results on the held-out test split (25% of the data, 4778 samples):

                     precision    recall  f1-score   support

              False       0.85      0.80      0.83      2672
               True       0.77      0.82      0.79      2106

        avg / total       0.82      0.81      0.81      4778

        Confusion Matrix (rows: true class, columns: predicted class; class order: False, True)
        2147  525
        371  1735


In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import pickle


In [2]:
# Load the pickled input collection that is later split and fed to the
# TF-IDF vectorizer.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
p_file = 'data_X'
with open(p_file, mode='rb') as features_file:
    data_X = pickle.load(features_file)

In [3]:
# Load the pickled target labels (assumed to be aligned one-to-one with
# data_X -- TODO confirm against the script that produced both files).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
p_file = 'RCT_labels'
with open(p_file, mode='rb') as labels_file:
    data_Y = pickle.load(labels_file)

In [4]:
# Hold out 25% of the samples for evaluation.
# The shuffle is seeded so that Restart & Run All yields the same partition
# and therefore the same metrics; without random_state every run splits
# differently and reported numbers cannot be reproduced.
# NOTE: outputs recorded from an earlier unseeded run will differ slightly.
RANDOM_STATE = 42
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X, data_Y, test_size=0.25, random_state=RANDOM_STATE
)

In [5]:
# Convert both label splits to NumPy arrays (a fresh array each, as np.array
# copies) for use by the estimator and the metric functions.
train_YD, test_YD = map(np.array, (train_Y, test_Y))

In [6]:
# Two-step pipeline: TF-IDF text vectorizer followed by a multinomial
# Naive Bayes classifier, both constructed with their default settings.
model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)

In [7]:
# Fit the whole pipeline on the training split only. The call is left as the
# cell's last expression so the fitted pipeline is displayed.
model.fit(X=train_X, y=train_YD)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
# Persist the fitted pipeline (vectorizer + classifier) to disk with pickle.
model_path = 'nbRCT_Abstract_model'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [9]:
# Predict a label for every sample in the held-out test split.
predict_labels = model.predict(X=test_X)

In [10]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=test_YD, y_pred=predict_labels))

# Confusion matrix: rows are true labels, columns are predicted labels.
cfn_matrix = confusion_matrix(y_true=test_YD, y_pred=predict_labels)
print(cfn_matrix)

             precision    recall  f1-score   support

      False       0.85      0.80      0.83      2672
       True       0.77      0.82      0.79      2106

avg / total       0.82      0.81      0.81      4778

[[2147  525]
 [ 371 1735]]
