**imports**

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from nltk.corpus.reader import ConllCorpusReader
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report
import spacy

# Data Preprocessing

**Preparing Train Set**

In [None]:
# get the training data from conll
train = ConllCorpusReader('CONLL2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])
training_features = []
training_gold_labels = []

for token, pos, ne_label in train.iob_words():
    a_dict = {
        'word': token,
        'pos': pos,
    }
    training_features.append(a_dict)
    training_gold_labels.append(ne_label)

print(training_features [:10])
print(training_gold_labels [:10])

**Preparing Test Set**

In [None]:
# make a text from the test data
df = pd.read_csv("NER-final-test.csv")
text = ""
for index, row in df.iterrows():
    text += row['token'] + " "
    
print(text)

# NERC

In [None]:
# load spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text) # insert code here
print(doc)

for token in doc:
    print(token.text, token.tag)

In [None]:
#test data with spacy tagging
test_features = []
test_gold_labels = []

for index, row in df.iterrows():
    test_gold_labels.append(row['BIO NER tag'])

for word in doc:
    #print(word)
    a_dict = {
        'word': word.text,
        'pos' : word.tag_
    }
    test_features.append(a_dict)
print(test_features)

In [None]:
# put pos tags with nltk
# NOTE uses the variables as spacy tagging!!
sentences_nltk = sent_tokenize(text)
for i in sentences_nltk:
    print(i)
    
tokens_per_sentence = []
for sentence_nltk in sentences_nltk:
    sent_tokens = word_tokenize(sentence_nltk)
    tokens_per_sentence.append(sent_tokens)
#print(tokens_per_sentence)

pos_tags_per_sentence = []
for tokens in tokens_per_sentence:
    #print(tokens)
    pos_tags_per_sentence.append(nltk.pos_tag(tokens)) #part-of-speech tagging on each sentence
    print(nltk.pos_tag(tokens))

In [None]:
#test data with nltk tagging
# NOTE uses the variables as spacy tagging!!
test_features = []
test_gold_labels = []

for index, row in df.iterrows():
    test_gold_labels.append(row['BIO NER tag'])

for sentence in pos_tags_per_sentence:
    for word in sentence:
        #print(word)
        a_dict = {
            'word': word[0],
            'pos' : word[1]
        }
        test_features.append(a_dict)
print(test_gold_labels)

In [None]:
# vectorize features
vec = DictVectorizer()

the_array = vec.fit_transform(training_features + test_features)
lin_train = the_array[:len(training_features)]
lin_test = the_array[len(training_features):]

print (type(lin_train[0]))

In [None]:
# load svm
lin_clf = svm.LinearSVC()

In [74]:
# fit svm
lin_clf.fit(lin_train, training_gold_labels)
y_pred1 = lin_clf.predict(lin_test)

In [75]:
#print(predictions)
for i in range(len(test_features)):
    print(test_features[i], y_pred1[i], test_gold_labels[i])

{'word': 'It', 'pos': 'PRP'} O O
{'word': 'took', 'pos': 'VBD'} O O
{'word': 'eight', 'pos': 'CD'} O O
{'word': 'years', 'pos': 'NNS'} O O
{'word': 'for', 'pos': 'IN'} O O
{'word': 'Warner', 'pos': 'NNP'} I-PER B-ORG
{'word': 'Brothers', 'pos': 'NNPS'} I-ORG I-ORG
{'word': 'to', 'pos': 'TO'} O O
{'word': 'recover', 'pos': 'VB'} O O
{'word': 'from', 'pos': 'IN'} O O
{'word': 'the', 'pos': 'DT'} O O
{'word': 'disaster', 'pos': 'NN'} O O
{'word': 'that', 'pos': 'WDT'} O O
{'word': 'was', 'pos': 'VBD'} O O
{'word': 'this', 'pos': 'DT'} O O
{'word': 'movie', 'pos': 'NN'} O O
{'word': '.', 'pos': '.'} O O
{'word': 'All', 'pos': 'PDT'} O O
{'word': 'the', 'pos': 'DT'} O O
{'word': 'New', 'pos': 'NNP'} B-LOC B-ORG
{'word': 'York', 'pos': 'NNP'} I-LOC I-ORG
{'word': 'University', 'pos': 'NNP'} I-ORG I-ORG
{'word': 'students', 'pos': 'NNS'} O O
{'word': 'love', 'pos': 'VBP'} O O
{'word': 'this', 'pos': 'DT'} O O
{'word': 'diner', 'pos': 'NN'} O O
{'word': 'in', 'pos': 'IN'} O O
{'word': 'Soho', 

In [76]:
# make a report
report1 = classification_report(y_pred1,test_gold_labels,digits = 3)
print(report1)

              precision    recall  f1-score   support

       B-LOC      0.500     0.500     0.500         4
      B-MISC      0.667     0.667     0.667         3
       B-ORG      0.000     0.000     0.000         0
       B-PER      0.500     0.750     0.600         4
       I-LOC      1.000     0.667     0.800         3
      I-MISC      0.000     0.000     0.000         1
       I-ORG      0.667     0.500     0.571         4
       I-PER      0.875     0.636     0.737        11
           O      1.000     0.995     0.997       184

    accuracy                          0.939       214
   macro avg      0.579     0.524     0.541       214
weighted avg      0.959     0.939     0.947       214



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
# load and fit a simple multilayer perceptron
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100, 10), random_state=1)
clf.fit(lin_train, training_gold_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 10), random_state=1,
              solver='lbfgs')

In [80]:
# make predictions and print report
y_pred2 = clf.predict(lin_test)
report2 = classification_report(y_pred2,test_gold_labels,digits = 3)
print(report2)

              precision    recall  f1-score   support

       B-LOC      0.750     0.600     0.667         5
      B-MISC      1.000     0.750     0.857         4
       B-ORG      0.250     0.111     0.154         9
       B-PER      0.500     0.429     0.462         7
       I-LOC      0.000     0.000     0.000         0
      I-MISC      0.000     0.000     0.000         0
       I-ORG      0.333     0.500     0.400         2
       I-PER      0.250     0.667     0.364         3
           O      1.000     0.995     0.997       184

    accuracy                          0.916       214
   macro avg      0.454     0.450     0.433       214
weighted avg      0.930     0.916     0.919       214



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
#print(predictions)
for i in range(len(test_features)):
    print(test_features[i], y_pred2[i], test_gold_labels[i])

{'word': 'It', 'pos': 'PRP'} O O
{'word': 'took', 'pos': 'VBD'} O O
{'word': 'eight', 'pos': 'CD'} O O
{'word': 'years', 'pos': 'NNS'} O O
{'word': 'for', 'pos': 'IN'} O O
{'word': 'Warner', 'pos': 'NNP'} I-PER B-ORG
{'word': 'Brothers', 'pos': 'NNPS'} B-ORG I-ORG
{'word': 'to', 'pos': 'TO'} O O
{'word': 'recover', 'pos': 'VB'} O O
{'word': 'from', 'pos': 'IN'} O O
{'word': 'the', 'pos': 'DT'} O O
{'word': 'disaster', 'pos': 'NN'} O O
{'word': 'that', 'pos': 'WDT'} O O
{'word': 'was', 'pos': 'VBD'} O O
{'word': 'this', 'pos': 'DT'} O O
{'word': 'movie', 'pos': 'NN'} O O
{'word': '.', 'pos': '.'} O O
{'word': 'All', 'pos': 'PDT'} O O
{'word': 'the', 'pos': 'DT'} O O
{'word': 'New', 'pos': 'NNP'} B-LOC B-ORG
{'word': 'York', 'pos': 'NNP'} B-PER I-ORG
{'word': 'University', 'pos': 'NNP'} I-ORG I-ORG
{'word': 'students', 'pos': 'NNS'} O O
{'word': 'love', 'pos': 'VBP'} O O
{'word': 'this', 'pos': 'DT'} O O
{'word': 'diner', 'pos': 'NN'} O O
{'word': 'in', 'pos': 'IN'} O O
{'word': 'Soho', 