# Model Creation
This notebook builds the model that classifies the abstracts.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

Load the preprocessed abstracts

In [3]:
# Load the preprocessed abstracts produced by an earlier step.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
data = pd.read_pickle('preprocessed_abstracts.pkl')

# Show the first abstract as a quick sanity check of the loaded text.
data.iloc[0]['abstract']

'bojan pandžić  born 13 march 1982  swedish footbal refere  pandžić current resid hise backa  part gothenburg  he full intern refere fifa sinc 2014  he becam profession refere 2004 allsvenskan refere sinc 2009  pandzic refere 42 match allsvenskan  65 match superettan 8 intern match 2014 '

Compute the TF-IDF feature matrix for the abstracts

In [4]:
# Configure the TF-IDF vectorizer:
#   - ngram_range=(1, 2): extract both unigrams and bigrams
#   - min_df=5: drop terms that occur in fewer than 5 abstracts
#   - sublinear_tf=True: use 1 + log(tf) instead of the raw term frequency
#   - norm='l2': scale every document vector to unit Euclidean length
#   - encoding='latin-1': only used if the input is bytes rather than str
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)

# Target column that goes with each abstract.
labels = data['label']

# Fit the vocabulary on the abstracts and convert the sparse result to a
# dense (n_documents, n_terms) array.
features = tfidf.fit_transform(data['abstract']).toarray()
features.shape

(24307, 27750)

In [6]:
# Distinct label values, in order of first appearance in `labels`.
categories = pd.unique(labels)
categories

array(['Person', 'City', 'Animal'], dtype=object)

In [8]:

# Number of top-scoring terms to print per category and per n-gram size.
N = 2

# The vocabulary does not depend on the category, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

# Iterate over the category names directly. The previous
# `for category, category_id in enumerate(categories)` bound `category` to the
# positional index, so the report headers showed '0', '1', '2' instead of the
# actual category names.
for category in categories:
    # One-vs-rest chi-squared test: this category against all the others.
    # chi2 returns (scores, p-values); only the scores are needed here.
    features_chi2 = chi2(features, labels == category)

    # argsort is ascending, so the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]

    # Split the ranked terms by n-gram size (terms are space-separated tokens).
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    # NOTE: chi2 measures dependence, not its direction, so a term that is
    # strongly *absent* from a category can also rank at the top.
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. born
. swedish
  . Most correlated bigrams:
. ice hockey
. swedish footbal
# '1':
  . Most correlated unigrams:
. counti
. citi
  . Most correlated bigrams:
. the popul
. 2010 census
# '2':
  . Most correlated unigrams:
. genus
. famili
  . Most correlated bigrams:
. the popul
. 2010 census
