In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **Import Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Load Dataset**

In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/Sequence Learning Assignments/CRF_POS_dataset.csv', encoding = "ISO-8859-1")
print(df.shape)

(567007, 3)


In [4]:
df.head(30)

Unnamed: 0,Num,Word,Tag_POS
0,1.0,A,DT
1,,37-year-old,JJ
2,,woman,NN
3,,has,VBZ
4,,become,VBN
5,,the,DT
6,,13th,JJ
7,,person,NN
8,,in,IN
9,,Egypt,NNP


In [5]:
df.isnull().sum()

Num        541078
Word            0
Tag_POS         0
dtype: int64

In [6]:
df = df.fillna(method='ffill')

In [7]:
df.head(30)

Unnamed: 0,Num,Word,Tag_POS
0,1.0,A,DT
1,1.0,37-year-old,JJ
2,1.0,woman,NN
3,1.0,has,VBZ
4,1.0,become,VBN
5,1.0,the,DT
6,1.0,13th,JJ
7,1.0,person,NN
8,1.0,in,IN
9,1.0,Egypt,NNP


In [8]:
df.isnull().sum()

Num        0
Word       0
Tag_POS    0
dtype: int64

In [9]:
df.Num.nunique(), df.Word.nunique(), df.Tag_POS.nunique()

(25929, 26398, 42)

In [10]:
df.groupby('Tag_POS').size()

Tag_POS
$         625
,       17704
.       25852
:         426
;         104
CC      12737
CD      13538
DT      52987
EX        343
FW          1
IN      65465
JJ      42521
JJR      1640
JJS      1683
LRB       393
MD       3759
NN      78624
NNP     71316
NNPS     1362
NNS     40985
PDT        84
POS      6094
PRP      7236
PRP$     4741
RB      10889
RBR       577
RBS       160
RP       1324
RRB       394
TO      12393
UH         13
VB      12939
VBD     21334
VBG     10313
VBN     17492
VBP      8683
VBZ     13512
WDT      2018
WP       1384
WP$        57
WRB      1182
``       2123
dtype: int64

# **Prepare Data**

**Clean Text**

In [11]:
def preprocess_text(str):
  import re
  import nltk
  from nltk.tokenize import word_tokenize
  
  # Remove and replace "'", "--", "-","[","]" by " "  
  str = re.sub(r'[\]\[\-\--\/.\'\,(;:)\\"!?]',r' ', str, flags=re.MULTILINE)
  
  words = word_tokenize(str)
  
  clean_text = ''
  
  for word in words:
    clean_text = clean_text+' '+word
  
  return clean_text

In [12]:
'''

for i in range(len(df)):
  df.loc[i, 'Word'] = preprocess_text(df.loc[i, 'Word'])

print(df.head(30))
'''

"\n\nfor i in range(len(df)):\n  df.loc[i, 'Word'] = preprocess_text(df.loc[i, 'Word'])\n\nprint(df.head(30))\n"

**Get Sentences**

In [13]:
agg_func = lambda s: [(w, p) for w, p in zip(s['Word'].values.tolist(), s['Tag_POS'].values.tolist())]
grouped = df.groupby('Num').apply(agg_func)
sentences = [s for s in grouped]
grouped.head()

Num
1.0    [(A, DT), (37-year-old, JJ), (woman, NN), (has...
2.0    [(Nadia, NNP), (Mohammed, NNP), (Abdel, NNP), ...
3.0    [(Health, NNP), (officials, NNS), (initially, ...
4.0    [(The, DT), (woman, NN), (raised, VBD), (poult...
5.0    [(Health, NNP), (officials, NNS), (announced, ...
dtype: object

**Feature Extraction**

In [14]:
print(type(sentences)), print(len(sentences)), print(sentences[0])

<class 'list'>
25929
[('A', 'DT'), ('37-year-old', 'JJ'), ('woman', 'NN'), ('has', 'VBZ'), ('become', 'VBN'), ('the', 'DT'), ('13th', 'JJ'), ('person', 'NN'), ('in', 'IN'), ('Egypt', 'NNP'), ('to', 'TO'), ('die', 'VB'), ('of', 'IN'), ('the', 'DT'), ('H5N1', 'NNP'), ('strain', 'NN'), ('of', 'IN'), ('bird', 'NN'), ('flu', 'NN'), ('.', '.')]


(None, None, None)

In [15]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]

In [16]:
print(sentences[0])
sent2features(sentences[0])

[('A', 'DT'), ('37-year-old', 'JJ'), ('woman', 'NN'), ('has', 'VBZ'), ('become', 'VBN'), ('the', 'DT'), ('13th', 'JJ'), ('person', 'NN'), ('in', 'IN'), ('Egypt', 'NNP'), ('to', 'TO'), ('die', 'VB'), ('of', 'IN'), ('the', 'DT'), ('H5N1', 'NNP'), ('strain', 'NN'), ('of', 'IN'), ('bird', 'NN'), ('flu', 'NN'), ('.', '.')]


[{'+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': '37-year-old',
  'BOS': True,
  'bias': 1.0,
  'postag': 'DT',
  'postag[:2]': 'DT',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': True,
  'word.lower()': 'a',
  'word[-2:]': 'A',
  'word[-3:]': 'A'},
 {'+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'woman',
  '-1:postag': 'DT',
  '-1:postag[:2]': 'DT',
  '-1:word.istitle()': True,
  '-1:word.isupper()': True,
  '-1:word.lower()': 'a',
  'bias': 1.0,
  'postag': 'JJ',
  'postag[:2]': 'JJ',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': '37-year-old',
  'word[-2:]': 'ld',
  'word[-3:]': 'old'},
 {'+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'has',
  '-1:postag': 'JJ',
  '-1:post

In [17]:
print(sentences[0])
print(sent2labels(sentences[0]))
print(sent2tokens(sentences[0]))

[('A', 'DT'), ('37-year-old', 'JJ'), ('woman', 'NN'), ('has', 'VBZ'), ('become', 'VBN'), ('the', 'DT'), ('13th', 'JJ'), ('person', 'NN'), ('in', 'IN'), ('Egypt', 'NNP'), ('to', 'TO'), ('die', 'VB'), ('of', 'IN'), ('the', 'DT'), ('H5N1', 'NNP'), ('strain', 'NN'), ('of', 'IN'), ('bird', 'NN'), ('flu', 'NN'), ('.', '.')]
['DT', 'JJ', 'NN', 'VBZ', 'VBN', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'TO', 'VB', 'IN', 'DT', 'NNP', 'NN', 'IN', 'NN', 'NN', '.']
['A', '37-year-old', 'woman', 'has', 'become', 'the', '13th', 'person', 'in', 'Egypt', 'to', 'die', 'of', 'the', 'H5N1', 'strain', 'of', 'bird', 'flu', '.']


# **Build Conditional Random Fields - CRF Model**

In [18]:
! pip install sklearn_crfsuite
! pip install eli5



In [19]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

**Create X and y**

In [20]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [21]:
print(sentences[0])
print(y[0])
X[0]

[('A', 'DT'), ('37-year-old', 'JJ'), ('woman', 'NN'), ('has', 'VBZ'), ('become', 'VBN'), ('the', 'DT'), ('13th', 'JJ'), ('person', 'NN'), ('in', 'IN'), ('Egypt', 'NNP'), ('to', 'TO'), ('die', 'VB'), ('of', 'IN'), ('the', 'DT'), ('H5N1', 'NNP'), ('strain', 'NN'), ('of', 'IN'), ('bird', 'NN'), ('flu', 'NN'), ('.', '.')]
['DT', 'JJ', 'NN', 'VBZ', 'VBN', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'TO', 'VB', 'IN', 'DT', 'NNP', 'NN', 'IN', 'NN', 'NN', '.']


[{'+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': '37-year-old',
  'BOS': True,
  'bias': 1.0,
  'postag': 'DT',
  'postag[:2]': 'DT',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': True,
  'word.lower()': 'a',
  'word[-2:]': 'A',
  'word[-3:]': 'A'},
 {'+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'woman',
  '-1:postag': 'DT',
  '-1:postag[:2]': 'DT',
  '-1:word.istitle()': True,
  '-1:word.isupper()': True,
  '-1:word.lower()': 'a',
  'bias': 1.0,
  'postag': 'JJ',
  'postag[:2]': 'JJ',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': '37-year-old',
  'word[-2:]': 'ld',
  'word[-3:]': 'old'},
 {'+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'has',
  '-1:postag': 'JJ',
  '-1:post

In [22]:
new_classes = df['Tag_POS'].unique().tolist()
new_classes.sort()
print(new_classes)

['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


**Train using cross_validate_predict**

In [23]:
crf_cvp = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [25]:
from sklearn.model_selection import cross_val_predict

y_pred_cvp = cross_val_predict(crf_cvp, X, y, cv=3)
metrics.flat_f1_score(y, y_pred_cvp, average='weighted', labels=new_classes)



0.9999973545625818

In [28]:
print(metrics.flat_classification_report(y, y_pred_cvp, labels = new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           $       1.00      1.00      1.00       625
           ,       1.00      1.00      1.00     17704
           .       1.00      1.00      1.00     25852
           :       1.00      1.00      1.00       426
           ;       1.00      1.00      1.00       104
          CC       1.00      1.00      1.00     12737
          CD       1.00      1.00      1.00     13538
          DT       1.00      1.00      1.00     52987
          EX       1.00      1.00      1.00       343
          FW       0.00      0.00      0.00         1
          IN       1.00      1.00      1.00     65465
          JJ       1.00      1.00      1.00     42521
         JJR       1.00      1.00      1.00      1640
         JJS       1.00      1.00      1.00      1683
         LRB       1.00      1.00      1.00       393
          MD       1.00      1.00      1.00      3759
          NN       1.00      1.00      1.00     78624
         NNP       1.00    

**Split Train and Test Data**

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [30]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [31]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.9999919664081423

In [32]:
i = 63
y = y_pred[i]
X = [wordfeatures['word.lower()'] for wordfeatures in X_test[i]]
print(' '.join(X))
for word, entity in zip(X,y):
  if entity != 'O':
    print(word,entity)

if returned to peru , he faces charges of corruption and of authorizing death squads .
if IN
returned VBN
to TO
peru NNP
, ,
he PRP
faces VBZ
charges NNS
of IN
corruption NN
and CC
of IN
authorizing VBG
death NN
squads NNS
. .


In [33]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           $       1.00      1.00      1.00       184
           ,       1.00      1.00      1.00      5747
           .       1.00      1.00      1.00      8540
           :       1.00      1.00      1.00       153
           ;       1.00      1.00      1.00        31
          CC       1.00      1.00      1.00      4162
          CD       1.00      1.00      1.00      4456
          DT       1.00      1.00      1.00     17464
          EX       1.00      1.00      1.00       113
          FW       0.00      0.00      0.00         1
          IN       1.00      1.00      1.00     21555
          JJ       1.00      1.00      1.00     14181
         JJR       1.00      1.00      1.00       530
         JJS       1.00      1.00      1.00       540
         LRB       1.00      1.00      1.00       146
          MD       1.00      1.00      1.00      1248
          NN       1.00      1.00      1.00     26009
         NNP       1.00    