# Model Creation - Logistic Regression
This notebook creates the model that classify the abstracts

In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix

Import data

In [18]:
data = pd.read_pickle('preprocessed_abstracts.pkl')
data.iloc[0]['abstract']

'bojan pandžić  born 13 march 1982  swedish football referee  pandžić currently resides hisings backa  part gothenburg  he full international referee fifa since 2014  he became professional referee 2004 allsvenskan referee since 2009  pandzic refereed 42 matches allsvenskan  65 matches superettan 8 international matches 2014 '

Divide into training and test data. 80% train 20% test.

In [19]:
x = data['abstract']
y = data['label']

train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.2, random_state=1)

In [20]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

In [21]:
nb.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [22]:
predictions = nb.predict(test_x)

In [23]:
print('accuracy %s' % accuracy_score(predictions, test_y))
print(classification_report(test_y, predictions,target_names=data['label'].unique()))

accuracy 0.9991772932949403
             precision    recall  f1-score   support

     Person       1.00      1.00      1.00      1955
       City       1.00      1.00      1.00      1846
     Animal       1.00      1.00      1.00      1061

avg / total       1.00      1.00      1.00      4862



In [24]:
print(confusion_matrix(test_y, predictions, labels=data['label'].unique()))

[[1058    0    3]
 [   0 1846    0]
 [   1    0 1954]]


In [25]:
for idx, row in enumerate(test_x):
    if test_y.iloc[idx] != predictions[idx]:
        print('Prediction:',predictions[idx],', True:',test_y.iloc[idx])
        print('Abstract:',row)

Prediction: Animal , True: Person
Abstract: marin aleksov founder ceo rosland capital  seller gold precious metals  aleksov founded santa monica  california based company 2008 
Prediction: Animal , True: Person
Abstract: magnum st pierre dance producer owned operated ensuite studios san francisco 
Prediction: Person , True: Animal
Abstract: jason roach  born july 11  1984 saint john  new brunswick  canadian curler  roach twice  2004 2005  new brunswick men s junior championships playing third team skipped ryan sherrard  the team canadian junior curling championships first attempt 2004  they finished round robin 93 record  three way tie first  the team would go win playoff games  including final manitoba s daley peters  the team represented canada 2004 world junior curling championships  the team finished round robin 54 record  tied korea  however  lost tiebreaker match koreans  at 2005 canadian junior curling championships could repeat title  finishing 57 record  after juniors  roach w