In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, nltk
import gensim
import codecs
from sner import Ner
import spacy
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
import spacy
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import fbeta_score, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

## Read data

In [13]:
f_train = open('train5.txt', 'r+')
f_test = open('test.txt', 'r+')

train = pd.DataFrame(f_train.readlines(), columns = ['question'])
test = pd.DataFrame(f_test.readlines(), columns = ['question'])

In [14]:
train.head()

Unnamed: 0,question
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


## Extract trype of the question (before colon)

In [15]:
train['qType'] = train.question.apply(lambda x: x.split(' ', 1)[0])
train['question'] = train.question.apply(lambda x: x.split(' ', 1)[1])
train['coarse'] = train.qType.apply(lambda x: x.split(':')[0])
test['qType'] = test.question.apply(lambda x: x.split(' ', 1)[0])
test['question'] = test.question.apply(lambda x: x.split(' ', 1)[1])
test['coarse'] = test.qType.apply(lambda x: x.split(':')[0])


In [16]:
train.head()

Unnamed: 0,question,qType,coarse
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY
4,What is the full form of .com ?\n,ABBR:exp,ABBR


In [17]:
print(train.shape)
print(test.shape)

(5452, 3)
(500, 3)


In [18]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer 
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dhamzeia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dhamzeia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
nlp = spacy.load("en_core_web_sm")

## Preprocess: remove non-alphanumeric

In [20]:
# preprocess
train['prep'] = [ re.sub(pattern='[^a-zA-Z0-9]',repl=' ', string = x.lower()) for x in train['question']]
test['prep'] =  [ re.sub(pattern='[^a-zA-Z0-9]',repl=' ', string = x.lower()) for x in test['question']]


## Define wh-words for extracting features

In [21]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']

## get features for train data: head (WH) words, POS tag+ named entity + BoW



In [22]:
all_ner = []
all_tag = []
heads = []
for row in train['prep']:
    present_ner = []
    present_tag = []
    doc = nlp(row)
    for i, tok in enumerate(doc):
        if i==0:
            if str(tok) not in wh_words:
                heads.append('None')
            else:
                heads.append(tok)
        if tok.tag_!= '':
            present_tag.append(tok.tag_)
    for ent in doc.ents:
        if ent.label_ !='':
            present_ner.append(ent.label_)
    all_ner.append(' '.join(present_ner))
    all_tag.append(' '.join(present_tag))


## get features for test data



In [25]:
all_ner_test = []
all_tag_test = []
heads_test = []
for row in test['prep']:
    present_ner = []
    present_tag = []
    doc = nlp(row)
    for i, tok in enumerate(doc):
        if i==0:
            if str(tok) not in wh_words:
                heads_test.append('None')
            else:
                heads_test.append(tok)
        if tok.tag_!= '':
            present_tag.append(tok.tag_)
    for ent in doc.ents:
        if ent.label_ !='':
            present_ner.append(ent.label_)
    all_ner_test.append(' '.join(present_ner))
    all_tag_test.append(' '.join(present_tag))



## Get BoW features after removing stopwords and only contain words that appeared at least 5 times

In [61]:
count_vec_ner = CountVectorizer().fit(all_ner)
ner_ft = count_vec_ner.transform(all_ner)
ner_test_ft = count_vec_ner.transform(all_ner_test)

count_vec_tag = CountVectorizer().fit(all_tag)
tag_ft = count_vec_tag.transform(all_tag)
tag_test_ft = count_vec_tag.transform(all_tag_test)

count_vec_tok = CountVectorizer(stop_words = 'english', min_df = 5).fit(train['prep'])
tok_ft = count_vec_tok.transform(train['prep'])
tok_test_ft = count_vec_tok.transform(test['prep'])

In [62]:
train['head_chunk']= heads
test['head_chunk']= heads_test
heads_dummies = train.append(test).head_chunk.str.get_dummies()
head_ft = heads_dummies[0:len(train)]
head_test_ft = heads_dummies[len(train):]

In [63]:
# prepare data for training
x_all_ft_train = hstack([ner_ft, tag_ft, tok_ft, head_ft])
x_all_ft_train = x_all_ft_train.tocsr()

x_all_ft_test = hstack([ner_test_ft, tag_test_ft, tok_test_ft, head_test_ft])
x_all_ft_test = x_all_ft_test.tocsr()




In [65]:
x_all_ft_train.shape

(5452, 1114)

In [64]:
x_all_ft_test.shape

(500, 1114)

## Model training SVM

In [69]:


model_svm = svm.LinearSVC()
model_svm.fit(x_all_ft_train, train['coarse'].values)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Evaluate model- accuracy

In [71]:
preds = model_svm.predict(x_all_ft_test)
print('svm = {}'.format(accuracy_score(test['coarse'].values, preds)))

svm = 0.828
