## Import and install


In [1]:
# Install the notebook's third-party dependencies into the runtime (quiet mode).
!pip install scikit-learn==0.22.2  -q # to solve "AttributeError: 'CRF' object has no attribute 'keep_tempfiles'" when using crfsuite
# NOTE(review): sklearn_crfsuite and eli5 are installed unpinned — consider pinning versions for reproducibility.
!pip install sklearn_crfsuite -q
!pip install eli5 -q

In [2]:
# Mount Google Drive at /content/drive; the sys.path entry and the CSV path used
# further down both point inside this mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import sys
# Add the project folder on Drive to the import path (presumably where the local
# modules process_for_baselines / ner_crfsuite live — verify).
# NOTE(review): this path is spelled 'My Drive' while the data path used later is
# spelled 'MyDrive' and is relative to the working directory — confirm both resolve.
sys.path.append('/content/drive/My Drive/DTA/Internship/')

In [4]:
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter
from process_for_baselines import process_data, test_classifier
from ner_crfsuite import train_crf, flat_classification_report
import pandas as pd
import numpy as np

# CLASSIFIERS

In [5]:
# Load C14NL.csv through the project helper; the first return value is discarded
# here, the rest are the label lists and the train/test split reused by the
# baseline cells below.
_, classes, new_classes, X_train, X_test, y_train, y_test = process_data("drive/MyDrive/DTA/Internship/C14NL.csv")
# Baseline 1: PassiveAggressiveClassifier (test_classifier prints the estimator
# and a per-label report, as shown in the cell output).
test_classifier(X_train, y_train, X_test, y_test, classes, new_classes, PassiveAggressiveClassifier)

PassiveAggressiveClassifier(random_state=42)
              precision    recall  f1-score   support

      B-DATE       0.90      0.63      0.74        70
       B-LOC       0.80      0.62      0.70       164
     B-MONEY       0.46      0.51      0.48        71
      B-PERS       0.94      0.67      0.79       432
      I-DATE       0.57      0.23      0.33       288
       I-LOC       1.00      0.30      0.47        33
     I-MONEY       0.78      0.76      0.77       197
      I-PERS       0.47      0.27      0.34       410

   micro avg       0.72      0.49      0.58      1665
   macro avg       0.74      0.50      0.58      1665
weighted avg       0.71      0.49      0.57      1665



In [6]:
# Baseline 2: Multinomial Naive Bayes; alpha=0.01 is forwarded to the estimator
# (visible in the printed repr).
test_classifier(X_train, y_train, X_test, y_test, classes, new_classes, MultinomialNB, alpha=0.01)

MultinomialNB(alpha=0.01)
              precision    recall  f1-score   support

      B-DATE       0.31      0.64      0.42        70
       B-LOC       0.54      0.68      0.60       164
     B-MONEY       0.24      0.54      0.33        71
      B-PERS       0.79      0.80      0.80       432
      I-DATE       0.53      0.33      0.40       288
       I-LOC       0.17      0.52      0.25        33
     I-MONEY       0.57      0.82      0.67       197
      I-PERS       0.57      0.35      0.44       410

   micro avg       0.54      0.58      0.56      1665
   macro avg       0.46      0.58      0.49      1665
weighted avg       0.58      0.58      0.56      1665



In [7]:
# Baseline 3: Perceptron with max_iter=5 and n_jobs=-1 forwarded to the estimator.
# NOTE(review): max_iter=5 is a magic number — confirm it is intentional.
test_classifier(X_train, y_train, X_test, y_test, classes, new_classes, Perceptron, n_jobs=-1, max_iter=5)

Perceptron(max_iter=5, n_jobs=-1, random_state=42)
              precision    recall  f1-score   support

      B-DATE       0.90      0.61      0.73        70
       B-LOC       0.64      0.65      0.65       164
     B-MONEY       0.09      0.80      0.16        71
      B-PERS       0.94      0.71      0.81       432
      I-DATE       0.53      0.30      0.38       288
       I-LOC       1.00      0.24      0.39        33
     I-MONEY       0.59      0.79      0.68       197
      I-PERS       0.41      0.27      0.32       410

   micro avg       0.47      0.52      0.49      1665
   macro avg       0.64      0.55      0.52      1665
weighted avg       0.63      0.52      0.54      1665



In [8]:
# Baseline 4: SGDClassifier with no extra keyword arguments passed from this cell.
test_classifier(X_train, y_train, X_test, y_test, classes, new_classes, SGDClassifier)

SGDClassifier(random_state=42)
              precision    recall  f1-score   support

      B-DATE       0.89      0.59      0.71        70
       B-LOC       0.86      0.45      0.59       164
     B-MONEY       0.47      0.30      0.36        71
      B-PERS       0.93      0.56      0.70       432
      I-DATE       0.72      0.22      0.34       288
       I-LOC       1.00      0.21      0.35        33
     I-MONEY       0.87      0.71      0.78       197
      I-PERS       0.90      0.09      0.17       410

   micro avg       0.85      0.37      0.52      1665
   macro avg       0.83      0.39      0.50      1665
weighted avg       0.85      0.37      0.48      1665



# CRF SUITE

In [5]:
# Reload the dataset for the CRF experiments; only the first return value (df)
# and new_classes are kept, the remaining outputs are discarded.
df, _, new_classes, _, _, _, _ = process_data("drive/MyDrive/DTA/Internship/C14NL.csv")

In [6]:
# Train the CRF with the project helper. Note that X_test / y_test are rebound
# here to whatever train_crf returns, replacing the values from the baseline cells.
crf, _, X_test, _, y_test = train_crf(df)
y_pred = crf.predict(X_test)
# Print the report restricted to the labels in new_classes.
print(flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

      B-DATE       0.87      0.86      0.87       129
       B-LOC       0.89      0.78      0.83       235
     B-MONEY       0.84      0.83      0.84       119
      B-PERS       0.94      0.82      0.88       679
      I-DATE       0.96      0.90      0.93       488
       I-LOC       0.90      0.61      0.73        44
     I-MONEY       0.84      0.86      0.85       330
      I-PERS       0.93      0.84      0.88       604

   micro avg       0.91      0.84      0.88      2628
   macro avg       0.90      0.81      0.85      2628
weighted avg       0.92      0.84      0.88      2628



In [15]:
def print_transitions(trans_features):
    """Print one aligned line per transition: source label, target label, weight.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)`` pairs,
            e.g. a slice of ``Counter(...).most_common()``.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(row_format % (label_from, label_to, weight))
# Rank all learned transition weights once (highest first) and reuse that ranking
# for both ends of the list, instead of building and sorting the Counter twice.
ranked_transitions = Counter(crf.transition_features_).most_common()
print("Top likely transitions:")
print_transitions(ranked_transitions[:20])
print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:
B-PERS -> I-PERS  0.451502
I-DATE -> I-DATE  0.437716
B-DATE -> I-DATE  0.385789
B-MONEY -> I-MONEY 0.364858
I-MONEY -> I-MONEY 0.363493
I-PERS -> I-PERS  0.293811
O      -> B-LOC   0.275824
O      -> O       0.272508
B-LOC  -> I-LOC   0.265046
O      -> B-PERS  0.241094
I-LOC  -> I-LOC   0.220843
O      -> B-DATE  0.204829
O      -> B-MONEY 0.171163
I-PERS -> B-PERS  0.113904
B-LOC  -> O       0.056318
I-PERS -> O       0.040300
B-PERS -> O       0.033679
B-DATE -> O       0.029255
I-DATE -> B-DATE  0.028593
B-LOC  -> B-MONEY 0.020041

Top unlikely transitions:
B-LOC  -> I-PERS  -0.082857
I-PERS -> I-LOC   -0.084592
B-LOC  -> I-DATE  -0.087840
I-DATE -> B-LOC   -0.102097
I-PERS -> B-LOC   -0.105342
O      -> I-LOC   -0.112765
I-DATE -> B-PERS  -0.114827
B-PERS -> I-MONEY -0.118562
B-PERS -> I-DATE  -0.124122
I-DATE -> B-MONEY -0.133441
I-PERS -> I-MONEY -0.136772
I-DATE -> O       -0.137457
I-DATE -> I-PERS  -0.139042
I-DATE -> I-MONEY -0.140636
I-MONEY -> I-PE

- The most likely transitions understandably include several instances of B-class -> I-class for the same class.
- Several I-class -> I-class (same class) transitions also appear to be very common (such as I-DATE -> I-DATE, which is the second most likely transition).
- O -> B-class transitions are also very likely for all classes.
- O -> I-class transitions are understandably the least likely for all classes (same as in the English dataset).
- I-PERS -> B-PERS is surprisingly likely; perhaps our dataset includes a lot of enumerations of personal names.

In [16]:
def print_state_features(state_features):
    """Print one line per state feature: weight, label, then attribute name.

    Args:
        state_features: iterable of ``((attr, label), weight)`` pairs,
            e.g. a slice of ``Counter(...).most_common()``.
    """
    row_format = "%0.6f %-8s %s"
    for feature_key, weight in state_features:
        attr, label = feature_key
        print(row_format % (weight, label, attr))
# Rank all learned state-feature weights once (highest first) and reuse that
# ranking for both ends, instead of building and sorting the Counter twice.
ranked_state_features = Counter(crf.state_features_).most_common()
print("Top positive:")
print_state_features(ranked_state_features[:30])
print("\nTop negative:")
print_state_features(ranked_state_features[-30:])

Top positive:
0.573776 B-LOC    word.lower():auden
0.527893 B-PERS   word.lower():symoene
0.503007 O        word.lower():hem
0.494820 B-DATE   -1:word.lower():screef
0.487776 I-PERS   -1:word.lower():vander
0.474440 B-PERS   word.lower():marien
0.473987 B-PERS   word.lower():janne
0.465352 B-DATE   word.lower():kersauonde
0.456608 I-PERS   +1:word.lower():daniel
0.455877 B-PERS   word.lower():pietren
0.453020 O        word.lower():voers
0.448215 B-PERS   word.lower():jonghe
0.426829 B-PERS   word.lower():pieter
0.426577 B-DATE   word.lower():midden
0.421016 B-PERS   word.lower():katelinen
0.420281 I-DATE   -1:word.lower():.m.ccc.
0.420104 B-LOC    word.lower():velseke
0.414966 O        word.lower():sine
0.412633 I-PERS   -1:word.lower():der
0.408862 B-LOC    +1:word.lower():ambochte
0.407218 B-PERS   word.lower():gode
0.404432 B-PERS   word.lower():piet
0.399935 I-PERS   -1:word.lower():den
0.396202 B-PERS   word.lower():gillise
0.395444 I-DATE   -1:word.lower():midden
0.394233 B-LOC  

Compared to the English dataset:
- The "BOS" feature isn't in the top positive list, possibly because the sentences are so much longer that there are fewer instances of beginning of sentence.
- The few words before a certain word seem to have less influence on its tagging, especially if they are more than 1 word before it (in the English dataset there are quite a few instances of "word[-3]" in the top positive list).
- The top negative list does include several instances of "word[-2]" or "word[-3]".

In [7]:
# NOTE(review): mid-notebook import — eli5 is first needed in this cell; consider
# moving it to the main import cell.
import eli5
# Display the CRF's transition table and the top 10 state features per label.
eli5.show_weights(crf, top=10)



From \ To,O,B-DATE,I-DATE,B-LOC,I-LOC,B-MONEY,I-MONEY,B-PERS,I-PERS
O,0.273,0.205,-0.314,0.276,-0.113,0.171,-0.245,0.241,-0.356
B-DATE,0.029,-0.05,0.386,-0.002,-0.039,-0.033,-0.021,-0.066,-0.056
I-DATE,-0.137,0.029,0.438,-0.102,-0.076,-0.133,-0.141,-0.115,-0.139
B-LOC,0.056,-0.047,-0.088,-0.017,0.265,0.02,-0.052,0.02,-0.083
I-LOC,0.009,-0.015,-0.067,-0.024,0.221,0.012,-0.015,-0.014,-0.037
B-MONEY,-0.171,-0.011,-0.015,-0.025,-0.012,-0.014,0.365,-0.039,-0.003
I-MONEY,-0.055,-0.072,-0.074,-0.051,-0.053,-0.002,0.363,-0.061,-0.151
B-PERS,0.034,0.003,-0.124,-0.049,-0.04,-0.017,-0.119,-0.022,0.452
I-PERS,0.04,-0.069,-0.157,-0.105,-0.085,0.016,-0.137,0.114,0.294

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8
+0.503,word.lower():hem,,,,,,,
+0.453,word.lower():voers,,,,,,,
+0.415,word.lower():sine,,,,,,,
+0.351,+1:word.lower():roeden,,,,,,,
+0.324,word.lower():ons,,,,,,,
+0.324,word.lower():den,,,,,,,
+0.323,word.lower():heere,,,,,,,
+0.323,+1:word.lower():dandre,,,,,,,
+0.318,+1:word.lower():lants,,,,,,,
+0.312,word.lower():renten,,,,,,,

Weight?,Feature
+0.503,word.lower():hem
+0.453,word.lower():voers
+0.415,word.lower():sine
+0.351,+1:word.lower():roeden
+0.324,word.lower():ons
+0.324,word.lower():den
+0.323,word.lower():heere
+0.323,+1:word.lower():dandre
+0.318,+1:word.lower():lants
+0.312,word.lower():renten

Weight?,Feature
+0.495,-1:word.lower():screef
+0.465,word.lower():kersauonde
+0.427,word.lower():midden
+0.348,word.lower():kerssauonde
+0.334,word.lower():maerte
+0.304,-1:word.lower():telken
+0.304,word.lower():medewintere
+0.258,-1:word.lower():telke
+0.237,-1:word.lower():tote
+0.216,word[-2:]:c.

Weight?,Feature
+0.420,-1:word.lower():.m.ccc.
+0.395,-1:word.lower():midden
+0.359,-1:word.lower():sente
+0.344,EOS
+0.271,-1:word.lower():dach
+0.269,+1:word.lower():jnt
+0.260,+1:word.lower():daghe
+0.244,word.lower():daghe
+0.238,+1:word.lower():maent
+0.223,-1:word.lower():en

Weight?,Feature
+0.574,word.lower():auden
+0.420,word.lower():velseke
+0.409,+1:word.lower():ambochte
+0.394,word.lower():audende
+0.380,word.lower():taudende
+0.374,word.lower():vlaendren
+0.351,word.lower():huerne
+0.342,word.lower():schelde
+0.333,word.lower():taudenghe
+0.333,-1:word.lower():wethou-ders

Weight?,Feature
+0.344,-1:word.lower():sente
+0.272,-1:word.lower():neckers
+0.243,word.lower():jnghelant
+0.243,+1:word.lower():jnghelant
+0.213,+1:word.lower():vp
+0.203,-1:word.lower():molen
+0.195,word[-3:]:ant
+0.188,word.lower():heede
+0.164,-1:word.lower():steen
+0.161,word.lower():hoerenbeke

Weight?,Feature
+0.279,+1:word.lower():scell
+0.233,+1:word.lower():scellinghe
+0.231,word[-2:]:j.
+0.184,+1:word.lower():lib
+0.174,word[-2:]:i.
+0.173,-1:word.lower():ouer
+0.173,-1:word.istitle()
… 325 more positive …,… 325 more positive …
… 40 more negative …,… 40 more negative …
-0.208,-1:word.lower():en

Weight?,Feature
+0.355,word.lower():scellinghe
+0.319,+1:word.lower():suster
+0.260,word[-3:]:ise
+0.250,word.lower():parisisen
+0.250,+1:word.lower():parisisen
+0.249,word[-2:]:ss
+0.223,+1:word.lower():siars
+0.214,+1:word.lower():pene
… 499 more positive …,… 499 more positive …
… 72 more negative …,… 72 more negative …

Weight?,Feature
+0.528,word.lower():symoene
+0.474,word.lower():marien
+0.474,word.lower():janne
+0.456,word.lower():pietren
+0.448,word.lower():jonghe
+0.427,word.lower():pieter
+0.421,word.lower():katelinen
+0.407,word.lower():gode
+0.404,word.lower():piet
+0.396,word.lower():gillise

Weight?,Feature
+0.488,-1:word.lower():vander
+0.457,+1:word.lower():daniel
+0.413,-1:word.lower():der
+0.400,-1:word.lower():den
+0.382,-1:word.lower():vanden
+0.354,-1:word.lower():de
+0.349,-1:word.lower():vand
+0.326,word.lower():der
+0.324,-1:word.lower():ghiselbrecht
+0.318,-1:word.lower():everarts


- For the English dataset, the third highest feature for O is BOS; here it's not in the top 10 features for O, possibly due to sentence length, as mentioned previously.
- However, for I-DATE, EOS is the 4th highest feature. This is because the Middle Dutch dataset is divided by charters as opposed to sentences, and the charters almost always end in dates.


In [8]:
# Same weights view, restricted via `targets` to the labels O, B-DATE and I-PERS.
eli5.show_weights(crf, top=10, targets=['O', 'B-DATE', 'I-PERS'])

From \ To,O,B-DATE,I-PERS
O,0.273,0.205,-0.356
B-DATE,0.029,-0.05,-0.056
I-PERS,0.04,-0.069,0.294

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.503,word.lower():hem,
+0.453,word.lower():voers,
+0.415,word.lower():sine,
+0.351,+1:word.lower():roeden,
+0.324,word.lower():ons,
+0.324,word.lower():den,
+0.323,word.lower():heere,
+0.323,+1:word.lower():dandre,
+0.318,+1:word.lower():lants,
+0.312,word.lower():renten,

Weight?,Feature
+0.503,word.lower():hem
+0.453,word.lower():voers
+0.415,word.lower():sine
+0.351,+1:word.lower():roeden
+0.324,word.lower():ons
+0.324,word.lower():den
+0.323,word.lower():heere
+0.323,+1:word.lower():dandre
+0.318,+1:word.lower():lants
+0.312,word.lower():renten

Weight?,Feature
+0.495,-1:word.lower():screef
+0.465,word.lower():kersauonde
+0.427,word.lower():midden
+0.348,word.lower():kerssauonde
+0.334,word.lower():maerte
+0.304,-1:word.lower():telken
+0.304,word.lower():medewintere
+0.258,-1:word.lower():telke
+0.237,-1:word.lower():tote
+0.216,word[-2:]:c.

Weight?,Feature
+0.488,-1:word.lower():vander
+0.457,+1:word.lower():daniel
+0.413,-1:word.lower():der
+0.400,-1:word.lower():den
+0.382,-1:word.lower():vanden
+0.354,-1:word.lower():de
+0.349,-1:word.lower():vand
+0.326,word.lower():der
+0.324,-1:word.lower():ghiselbrecht
+0.318,-1:word.lower():everarts


In [9]:
# Show only the features whose name starts with "word.is" (word.isupper() and
# word.istitle() in the output) for each label, without the horizontal layout.
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped
# dot rather than being an invalid escape sequence in a normal string literal.
eli5.show_weights(crf, top=10, feature_re=r'^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
0.14,word.isupper()
0.012,word.istitle()

Weight?,Feature
-0.004,word.isupper()
-0.033,word.istitle()

Weight?,Feature
-0.066,word.isupper()
-0.141,word.istitle()

Weight?,Feature
0.086,word.istitle()
-0.001,word.isupper()

Weight?,Feature
-0.024,word.isupper()
-0.039,word.istitle()

Weight?,Feature
0.081,word.istitle()
0.001,word.isupper()

Weight?,Feature
-0.006,word.isupper()
-0.168,word.istitle()

Weight?,Feature
0.088,word.istitle()
-0.009,word.isupper()

Weight?,Feature
0.114,word.istitle()
-0.032,word.isupper()


- As opposed to the English dataset, words being in capital letters increases their chance of being "O" in the Dutch dataset
- Both isupper() and istitle() are top negative features for dates (B and I)
- Understandably, personal names are often istitle(), as personal names always start with a capital letter