# Model Creation - Naive Bayes Baseline
This notebook creates the model that classify the abstracts

In [84]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix

Import data

In [85]:
data = pd.read_pickle('preprocessed_abstracts.pkl')
data.iloc[0]['abstract']

'bojan pandžić  born 13 march 1982  swedish football referee  pandžić currently resides hisings backa  part gothenburg  he full international referee fifa since 2014  he became professional referee 2004 allsvenskan referee since 2009  pandzic refereed 42 matches allsvenskan  65 matches superettan 8 international matches 2014 '

Divide into training and test data. 80% train 20% test.

In [86]:
x = data['abstract']
y = data['label']

train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.2,random_state=1)

In [87]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [88]:
nb.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [89]:
predictions = nb.predict(test_x)

In [90]:
print('accuracy %s' % accuracy_score(predictions, test_y))
print(classification_report(test_y, predictions,target_names=data['label'].unique()))

accuracy 0.9973262032085561
             precision    recall  f1-score   support

     Person       1.00      1.00      1.00      1955
       City       1.00      1.00      1.00      1846
     Animal       0.99      1.00      0.99      1061

avg / total       1.00      1.00      1.00      4862



In [91]:
print(confusion_matrix(test_y, predictions, labels=data['label'].unique()))

[[1056    5    0]
 [   0 1846    0]
 [   7    1 1947]]


In [92]:
for idx, row in enumerate(test_x):
    if test_y.iloc[idx] != predictions[idx]:
        print('Prediction:',predictions[idx],', True:',test_y.iloc[idx])
        print('Abstract:',row)

Prediction: City , True: Person
Abstract: par sanda  born june 22  1965  swedish entrepreneur  developer stock trader  his accomplishments include establishing pan capital group  also known pan capital aktiebolag  international stock market trading company redeveloping north beach village  area 1950  midcentury hotels resorts fort lauderdale  florida 
Prediction: City , True: Person
Abstract: marin aleksov founder ceo rosland capital  seller gold precious metals  aleksov founded santa monica  california based company 2008 
Prediction: Person , True: Animal
Abstract: awesome gem  foaled february 6  2003  thoroughbred racehorse  this son awesome again sold crupi s new castle farm west point thoroughbreds  150000 california 2005 barretts march 2yearold sale  his partnership includes paul blavin  scott cadwallader patrice arundel vista  ca  rob keen encinitas  california  the chestnut gelding recently risen become major stakes contender main track southern california racing circuit  awesom