In [1]:
from io import open
from io import StringIO
import pandas as pd
import numpy as np
import re
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split

In [2]:
# reading traing file
data_file = open("hi-ud-train.conllu","r", encoding="utf-8").read()
string_data = StringIO(data_file)
train = pd.read_csv(string_data, sep =",")

In [3]:
# converting data file to trainable data
def trainable_data(file):
    file = file.replace(np.nan,"###", regex=True)
    data_list = []
    sen = []
    tag_list = []
    y = []
    for i in range(len(file)):
        if(file['WORD'][i]=='###'):
            data_list.append(sen)
            sen = []
            tag_list.append(y)
            y = []
        
        else:
            sen.append(file['WORD'][i])
            y.append(file['POS_TAG'][i])
    return data_list,tag_list

In [4]:
train_list,y_train = trainable_data(train)

## Buliding the model

### Features Taken are:   
##### 1) Is it the first word of the sentence?  
Knowing if the given word is the first word of the sentence can help in providing greater emphasis on certain part of speech, for example a declarative sentence very often starts with a Noun as subject and with a Auxilary verb in case of an interogative sentence.  
##### 2) Is it the last word of the sentence?  
Punctuations very often marks the end of sentences.So taking into consideration if the given word is the last word in the sentence can provide vital information for PUNCT tag.  
##### 3) What is the previous word and the next word?
Certain part of speech tags go hand-in-hand and occurence of the first provide greater chnace for occurence of the second. For example, in hindi a Noun is often followed by prepostion or verb e.g. राम खेलता है, राम का घर, etc. Similarly, an Adjective very often comes before an Noun e.g. लाल कपडा,बडा घर, etc.  
##### 4) What is the word before the previous word?  
Taking the word before the previous word provides better contex to the following words.In addition to that in hindi words like दिन-रात,सुख-दुख etc. containing a hyphen sign in between them are very common and they are of same pos tag. Thus taking previos two words to train will make our model more robust.  
##### 5) What is the word after the next word?
This will also help in capturing the context of the word from the sentence.
##### 6) Is it a numeric?
For NUM pos tag this is important to distinguish pure numbers from non-numeric words.  
##### 7) Is it alphanumeric?
In train corpus there are words like 20वी which are alphanumeric hence to capture there attributes taking alphanumeric as feature seems vital.  
##### 8) What is the prefix of the word?
prefixes in hindi are very important to detremine the behaviour of word in the sentence.For example हारा is Ajective but तुम्हारा is Pronoun. 
##### 9) What is the suffix of the word?
Suffixes in hindi often change the pos tag of root word, for example वह is Pronoun but वहां is Adverb.

In [5]:
def features(sentence,index):
    # sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    return {
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'prev_prev_word':'' if index==0 or index==1 else sentence[index-2], 
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'next_next_word':'' if index==len(sentence)-2 or index==len(sentence)-1 else sentence[index+2],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        'prefix_1':sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3':sentence[index][:3],
        'prefix_4':sentence[index][:4],
        'suffix_1':sentence[index][-1],
        'suffix_2':sentence[index][-2:],
        'suffix_3':sentence[index][-3:],
        'suffix_4':sentence[index][-4:],  
    }

In [6]:
def prepareData(data):
    # data is of the form [[w1,w2,w3,..],[w1,w2,w3,..],...]
    X_train = []
    for i in range(len(data)):
        temp = [] 
        for j in range(len(data[i])):
            temp.append(features(data[i],j))
        X_train.append(temp)
    return X_train

In [7]:
X_train = prepareData(train_list)

In [8]:
# Model buliding and hyperparameter optimazation
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f98ac570c88>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f98655dc358>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [9]:
# Values of c1 and c2 after hyperparameter tunning
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.04259998226701805, 'c2': 0.01884534311587427}
best CV score: 0.8543741387677777
model size: 0.52M


In [10]:
# the overall accuracy of train set
crf = rs.best_estimator_
y_pred_train=crf.predict(X_train)
print(f"The overall accuracy on train set: {metrics.flat_accuracy_score(y_train,y_pred_train)}")

The overall accuracy on train set: 0.9998663816141101


## Evaluating on test set

In [11]:
# reading test file
test_file = open("hi-ud-test .conllu","r", encoding="utf-8").read()
test_string = StringIO(test_file)
test = pd.read_csv(test_string,sep='\t')

In [12]:
# preparing test data for evaluation
test = test.replace(np.nan,"###", regex=True)
test_list = []
sen = []
y_test = []
y = []
for i in range(len(test)):
    if(test['WORD'][i]=='###'):
        test_list.append(sen)
        sen = []
        y_test.append(y)
        y = []
        
    else:
        sen.append(test['WORD'][i])
        y.append(test['TAG'][i])

In [13]:
X_test = prepareData(test_list)

In [14]:
y_pred_test = crf.predict(X_test)
result = metrics.flat_accuracy_score(y_test,y_pred_test)
print(f"The Overall accuracy on test set: {result}")

The Overall accuracy on test set: 0.8497229916897507


## Analysing Transition features

In [15]:
print(f"Total number of Transition Features: {len(crf.transition_features_)}")

Total number of Transition Features: 159


In [16]:
# 10 Most common transition features
Counter(crf.transition_features_).most_common(10)

[(('VERB', 'AUX'), 4.810736),
 (('PROPN', 'PROPN'), 3.750578),
 (('ADJ', 'NOUN'), 3.495997),
 (('NUM', 'NOUN'), 2.940721),
 (('DET', 'NOUN'), 2.497383),
 (('NOUN', 'ADP'), 2.473905),
 (('PROPN', 'ADP'), 2.334321),
 (('VERB', 'SCONJ'), 2.027346),
 (('PART', 'NUM'), 1.912271),
 (('PRON', 'ADP'), 1.884884)]

In [17]:
# 10 least common transition features
Counter(crf.transition_features_).most_common()[-10:]

[(('PROPN', 'AUX'), -1.465426),
 (('AUX', 'ADJ'), -1.487373),
 (('PROPN', 'DET'), -1.555321),
 (('CCONJ', 'PART'), -1.62782),
 (('DET', 'CCONJ'), -1.690419),
 (('PRON', 'CCONJ'), -1.761658),
 (('CCONJ', 'AUX'), -1.777492),
 (('ADJ', 'PRON'), -2.580376),
 (('ADJ', 'ADP'), -2.855517),
 (('DET', 'ADP'), -3.110945)]

In [18]:
# Precision, Recall, F-score per unique POS tag on the train set
print(metrics.flat_classification_report(
      y_train, y_pred_train, labels=crf.classes_, digits=3
))

              precision    recall  f1-score   support

         DET      1.000     1.000     1.000       230
       PROPN      1.000     1.000     1.000       707
         ADP      1.000     1.000     1.000      1384
         ADV      1.000     1.000     1.000       110
         ADJ      1.000     1.000     1.000       569
        NOUN      1.000     1.000     1.000      1596
         NUM      1.000     1.000     1.000       152
         AUX      0.999     1.000     0.999       728
       PUNCT      1.000     1.000     1.000       563
        PRON      1.000     1.000     1.000       430
        VERB      1.000     0.998     0.999       639
       CCONJ      1.000     1.000     1.000       150
        PART      1.000     1.000     1.000       163
       SCONJ      1.000     1.000     1.000        61
           X      1.000     1.000     1.000         2

   micro avg      1.000     1.000     1.000      7484
   macro avg      1.000     1.000     1.000      7484
weighted avg      1.000   

In [19]:
# Precision, Recall, F-score per unique POS tag on the test set
print(metrics.flat_classification_report(
    y_test, y_pred_test, labels=crf.classes_, digits=3
))

              precision    recall  f1-score   support

         DET      0.865     0.889     0.877        36
       PROPN      0.606     0.535     0.568       144
         ADP      0.955     0.970     0.962       303
         ADV      0.688     0.524     0.595        21
         ADJ      0.654     0.723     0.687        94
        NOUN      0.789     0.855     0.821       324
         NUM      0.885     0.920     0.902        25
         AUX      0.943     0.957     0.950       138
       PUNCT      1.000     0.836     0.911       134
        PRON      0.862     0.862     0.862        65
        VERB      0.885     0.859     0.872        99
       CCONJ      1.000     1.000     1.000        25
        PART      1.000     0.970     0.985        33
       SCONJ      0.600     1.000     0.750         3
           X      0.000     0.000     0.000         0

   micro avg      0.850     0.850     0.850      1444
   macro avg      0.782     0.793     0.783      1444
weighted avg      0.851   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
