```
Text mining project - Named Entity Recognition and Classification
group 4

CONLL
```

In [1]:
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

```
The test set does not have any POS tags yet. We are going to do POS tagging first and then train an SVM to classify Named entity's.
```

In [18]:
### This is the test set that we will be dealing with. As we see, there is no POS information yet.

ner_df = pd.read_table('NER-final-test.tsv')

np_ner_set = ner_df['token'].tolist()
ner_iob = ner_df['BIO NER tag'].tolist()


counter = 0
ner_tuples = []
for i in ner_iob:
    if i != 'O':
        ner_tuples.append((np_ner_set[counter], i))
    counter += 1

print(ner_tuples)

[('Warner', 'B-ORG'), ('Brothers', 'I-ORG'), ('New', 'B-ORG'), ('York', 'I-ORG'), ('University', 'I-ORG'), ('Soho', 'B-LOC'), ('Italian', 'B-MISC'), ('Jane', 'B-PER'), ('Austen', 'I-PER'), ('Carl', 'B-PER'), ('Brashear', 'I-PER'), ('Cuba', 'B-PER'), ('Gooding', 'I-PER'), ('Jr.', 'I-PER'), ('African', 'B-MISC'), ('American', 'I-MISC'), ('Navy', 'B-ORG'), ('Chris', 'B-PER'), ("O'Donnell", 'I-PER'), ('Amsterdam', 'B-LOC'), ('Blauwbrug', 'B-ORG'), ('Dame', 'B-PER'), ('Maggie', 'I-PER'), ('Smith', 'I-PER'), ('Mr.', 'B-PER'), ('Kruno', 'I-PER'), ('New', 'B-LOC'), ('York', 'I-LOC'), ('Los', 'B-LOC'), ('Angeles', 'I-LOC'), ('English', 'B-MISC')]


In [3]:
# adding POS using NLTK

pos_tagged = nltk.pos_tag(np_ner_set)

pos_list = []
type(pos_tagged)

for i in pos_tagged:
    pos_list.append(i[1])

ner_df['POS'] = pos_list
print(ner_df)

     sentence id  token id  token BIO NER tag  POS
0              0         0     It           O  PRP
1              0         1   took           O  VBD
2              0         2  eight           O   CD
3              0         3  years           O  NNS
4              0         4    for           O   IN
..           ...       ...    ...         ...  ...
209            9        12    get           O   VB
210            9        13   into           O   IN
211            9        14   this           O   DT
212            9        15    one           O   CD
213            9        16      .           O    .

[214 rows x 5 columns]


In [4]:
# Loading training data and transforming it to a usable format

from nltk.corpus.reader import ConllCorpusReader
### Adapt the path to point to the CONLL2003 folder on your local machine
train = ConllCorpusReader('/Users/tomsl/My Drive/AI/Year 3/P4_TextMining/ba-text-mining/ba-text-mining/lab_sessions/lab4/nerc_datasets/CONLL2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])
training_X = []
training_y = []

for token, pos, ne_label in train.iob_words():
    a_dict = {
        'token': token, 'POS': pos
    }
    training_X.append(a_dict)
    training_y.append(ne_label)

print(len(training_y))

203621


In [5]:
# Getting the test data in the same format as the training data.

test_df = ner_df.drop(columns={'sentence id', 'token id', 'BIO NER tag'})

test_X = test_df.to_dict(orient='records')
test_y = ner_df['BIO NER tag'].tolist()

In [6]:
from collections import Counter 


print('In the training set, the counter function produces the following distribution: ' + str(Counter(training_y)))
print('In the test set, the counter function produces the following distribution: ' + str(Counter(test_y)))


class_count_array = list(Counter(training_y).items())
max = np.max(list(Counter(training_y).values()))


# creating weights to fight class imbalance
weights = []

for i in training_y:
    for j, k in class_count_array:
        if i == j:
            x = round((1/k)*max)
            weights.append(x)

print(weights[:10])


In the training set, the counter function produces the following distribution: Counter({'O': 169578, 'B-LOC': 7140, 'B-PER': 6600, 'B-ORG': 6321, 'I-PER': 4528, 'I-ORG': 3704, 'B-MISC': 3438, 'I-LOC': 1157, 'I-MISC': 1155})
In the test set, the counter function produces the following distribution: Counter({'O': 183, 'I-PER': 8, 'B-PER': 6, 'B-ORG': 4, 'B-LOC': 4, 'I-ORG': 3, 'B-MISC': 3, 'I-LOC': 2, 'I-MISC': 1})
[27, 1, 49, 1, 1, 1, 49, 1, 1, 26]


In [7]:
# Vectorize our data

vec = DictVectorizer()

whole_set = training_X + test_X


the_array = vec.fit_transform(whole_set)

training_array = the_array[:203621]
test_array = the_array[203621:]

print(training_array.shape)
print(test_array.shape)

(203621, 23683)
(214, 23683)


In [8]:
# Train an SVM on our training data

lin_clf = svm.LinearSVC()

lin_clf.fit(training_array, training_y)

pred = lin_clf.predict(training_array)
test_pred = lin_clf.predict(test_array)

In [9]:
report_training = classification_report(training_y, pred)
report_test = classification_report(test_y, test_pred)

print("Training report: \n" + report_training)

Training report: 
              precision    recall  f1-score   support

       B-LOC       0.89      0.92      0.90      7140
      B-MISC       0.90      0.88      0.89      3438
       B-ORG       0.88      0.86      0.87      6321
       B-PER       0.89      0.90      0.90      6600
       I-LOC       0.79      0.79      0.79      1157
      I-MISC       0.81      0.71      0.75      1155
       I-ORG       0.86      0.74      0.79      3704
       I-PER       0.85      0.85      0.85      4528
           O       1.00      1.00      1.00    169578

    accuracy                           0.98    203621
   macro avg       0.87      0.85      0.86    203621
weighted avg       0.97      0.98      0.97    203621



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Train a model with weights

lin_clf_weighted = svm.LinearSVC()

lin_clf_weighted.fit(training_array, training_y, sample_weight=weights)

pred_weighted = lin_clf_weighted.predict(training_array)
test_pred_weighted = lin_clf_weighted.predict(test_array)




In [11]:
report_training_weighted = classification_report(training_y, pred_weighted)

print("Weighted training report: \n" + report_training_weighted)

Weighted training report: 
              precision    recall  f1-score   support

       B-LOC       0.90      0.91      0.91      7140
      B-MISC       0.85      0.90      0.87      3438
       B-ORG       0.87      0.85      0.86      6321
       B-PER       0.91      0.87      0.89      6600
       I-LOC       0.62      0.89      0.73      1157
      I-MISC       0.37      0.87      0.52      1155
       I-ORG       0.32      0.79      0.46      3704
       I-PER       0.81      0.88      0.85      4528
           O       1.00      0.95      0.98    169578

    accuracy                           0.94    203621
   macro avg       0.74      0.88      0.79    203621
weighted avg       0.96      0.94      0.95    203621



In [12]:
report_test = classification_report(test_y, test_pred)
report_weighted = classification_report(test_y, test_pred_weighted)

print("Test report: \n" + report_test)

Test report: 
              precision    recall  f1-score   support

       B-LOC       0.50      0.50      0.50         4
      B-MISC       0.67      0.67      0.67         3
       B-ORG       0.00      0.00      0.00         4
       B-PER       0.75      0.50      0.60         6
       I-LOC       0.67      1.00      0.80         2
      I-MISC       0.00      0.00      0.00         1
       I-ORG       0.50      0.67      0.57         3
       I-PER       0.64      0.88      0.74         8
           O       0.99      1.00      1.00       183

    accuracy                           0.94       214
   macro avg       0.52      0.58      0.54       214
weighted avg       0.93      0.94      0.93       214



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
print("Weighted test report: \n" + report_weighted)

Weighted test report: 
              precision    recall  f1-score   support

       B-LOC       0.50      0.50      0.50         4
      B-MISC       0.67      0.67      0.67         3
       B-ORG       0.00      0.00      0.00         4
       B-PER       0.75      0.50      0.60         6
       I-LOC       0.67      1.00      0.80         2
      I-MISC       0.00      0.00      0.00         1
       I-ORG       0.29      0.67      0.40         3
       I-PER       0.64      0.88      0.74         8
           O       0.99      0.97      0.98       183

    accuracy                           0.92       214
   macro avg       0.50      0.58      0.52       214
weighted avg       0.92      0.92      0.92       214



```
We can see from the classification reports above that the there is a performance decrease when introducing weights. For every non 'O' class, the prediction scores are outperformed by SVM trained without weights.
```

In [33]:
counter = 0 
pred_tuples = []
for i in test_pred:
    if i != 'O':
        pred_tuples.append((np_ner_set[counter], i))
    counter += 1


for i in range(len(pred_tuples)):
    print(ner_tuples[i])
    print(pred_tuples[i])

pred_tokens = []
for i in pred_tuples:
    pred_tokens.append(i[0])

ner_tokens = []
for i in ner_tuples:
    ner_tokens.append(i[0])

counter = 0
number_of_ners = len(ner_tokens)
counter_inv = number_of_ners
for i in ner_tokens:
    if i in pred_tokens:
        counter_inv -= 1
    else:
        print(i)
        
print(counter_inv)

precision = (number_of_ners-counter_inv)/number_of_ners
print("Precision: " + str(precision))

('Warner', 'B-ORG')
('Warner', 'I-PER')
('Brothers', 'I-ORG')
('Brothers', 'I-ORG')
('New', 'B-ORG')
('New', 'B-LOC')
('York', 'I-ORG')
('York', 'I-LOC')
('University', 'I-ORG')
('University', 'I-ORG')
('Soho', 'B-LOC')
('Soho', 'I-PER')
('Italian', 'B-MISC')
('Italian', 'B-MISC')
('Jane', 'B-PER')
('Jane', 'B-PER')
('Austen', 'I-PER')
('Austen', 'I-PER')
('Carl', 'B-PER')
('Carl', 'B-PER')
('Brashear', 'I-PER')
('Brashear', 'I-PER')
('Cuba', 'B-PER')
('Cuba', 'B-LOC')
('Gooding', 'I-PER')
('Gooding', 'I-PER')
('Jr.', 'I-PER')
('Jr.', 'I-PER')
('African', 'B-MISC')
('African', 'I-MISC')
('American', 'I-MISC')
('American', 'B-MISC')
('Navy', 'B-ORG')
('Navy', 'I-ORG')
('Chris', 'B-PER')
('Chris', 'B-PER')
("O'Donnell", 'I-PER')
("O'Donnell", 'I-PER')
('Amsterdam', 'B-LOC')
('Amsterdam', 'I-ORG')
('Blauwbrug', 'B-ORG')
('Blauwbrug', 'I-PER')
('Dame', 'B-PER')
('Dame', 'I-PER')
('Maggie', 'I-PER')
('Maggie', 'B-PER')
('Smith', 'I-PER')
('Smith', 'I-PER')
('Mr.', 'B-PER')
('Kruno', 'I-PER'