# Model Creation - Naive Bayes Baseline
This notebook creates the Naive Bayes baseline model that classifies the abstracts.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix

Import data

In [2]:
# Load the preprocessed abstracts from the pickle file in the working directory.
# read_pickle unpickles the file, so only load files from a trusted source.
data = pd.read_pickle('preprocessed_abstracts.pkl')

# Show the first abstract as a quick sanity check of the loaded text.
data['abstract'].iloc[0]

'bojan pandžić  born 13 march 1982  swedish football referee  pandžić currently resides hisings backa  part gothenburg  he full international referee fifa since 2014  he became professional referee 2004 allsvenskan referee since 2009  pandzic refereed 42 matches allsvenskan  65 matches superettan 8 international matches 2014 '

Divide the data into training and test sets: 80% for training, 20% for testing.

In [3]:
# Inputs are the abstract texts; targets are their labels.
x, y = data['abstract'], data['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
# NOTE(review): the split is not stratified by label - consider stratify=y if
# per-class proportions should match between train and test.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Baseline text classifier: token counts -> TF-IDF weighting -> multinomial
# Naive Bayes, all with their default settings.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [5]:
# Fit every pipeline step on the training split; the fitted pipeline is the
# cell's displayed result.
nb.fit(X=train_x, y=train_y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [6]:
# Predict a label for every abstract in the held-out test split.
predictions = nb.predict(X=test_x)

In [10]:
# Overall accuracy, then per-class precision / recall / F1 on the test split.
# Both metrics take (y_true, y_pred) in that order; accuracy is symmetric, so
# the value is unaffected, but the calls are now consistent with each other
# and with the scikit-learn signature.
print('accuracy %s' % accuracy_score(test_y, predictions))
print(classification_report(test_y, predictions))

accuracy 0.9857549857549858
             precision    recall  f1-score   support

     Animal       0.99      1.00      0.99      1976
       City       0.97      1.00      0.99      1834
    Country       0.00      0.00      0.00        56
     Person       0.99      1.00      0.99      1048

avg / total       0.97      0.99      0.98      4914



  'precision', 'predicted', average, warn_for)


In [8]:
# Show the confusion matrix with explicit class labels so that each row (true
# class) and column (predicted class) can be identified unambiguously.
# The label list is the sorted union of true and predicted labels, i.e. the
# same set and order confusion_matrix uses by default, so the counts match the
# unlabelled matrix.
class_labels = sorted(set(test_y) | set(predictions))
pd.DataFrame(
    confusion_matrix(test_y, predictions, labels=class_labels),
    index=pd.Index(class_labels, name='true'),
    columns=pd.Index(class_labels, name='predicted'),
)

[[1044    4    0    0]
 [   1 1833    0    0]
 [   8    1 1967    0]
 [   1   42   13    0]]


In [9]:
# Walk the test abstracts alongside their true and predicted labels and print
# every misclassified example for manual inspection.
for abstract, true_label, predicted_label in zip(test_x, test_y, predictions):
    if true_label == predicted_label:
        continue  # correctly classified - nothing to report
    print('Prediction:',predicted_label,', True:',true_label)
    print('Abstract:',abstract)

Prediction: Person , True: Animal
Abstract: clavelina   little bottle   genus tunicates  containing following species   clavelina amplexa kott  2002  clavelina arafurensis tokioka  1952  clavelina auracea monniot  1997  clavelina australis  herdman  1899   clavelina baudinensis kott  1957  clavelina borealis savigny  1816  clavelina brasiliensis  millar  1977   clavelina breve monniot  1997  clavelina coerulea oka  1934  clavelina concrescens hartmeyer  1924  clavelina cyclus tokioka  nishikawa  1975  clavelina cylindrica  quoy  gaimard  1834   clavelina dagysa  kott  1957   clavelina dellavallei  zirpolo  1825   clavelina detorta  sluiter  1904   clavelina elegans  oka  1927   clavelina enormis herdman  1880  clavelina fasciculata van name  1945  clavelina fecunda  sluiter  1904   clavelina gemmae turon  2005  clavelina huntsmani van name  1931  clavelina kottae  millar  1960   clavelina lepadiformis  müller  1776   clavelina maculata monniot  monniot  2001  clavelina meridionalis  he