In [2]:
from nltk.tag import tnt
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Load the token-level annotation table (downstream code reads the
# "sent_file", "token" and "semtag" columns).
#
# keep_default_na=False switches off pandas' built-in NA sentinels, which
# would otherwise turn literal word tokens such as "NA", "null" or "nan" into
# NaN and thereby corrupt the token stream. na_values=[''] keeps treating a
# genuinely empty cell as missing.
pmb_data = pd.read_csv(
  'sem-pmb_4_0_0-gold.csv',
  keep_default_na=False,
  na_values=[''],
)

In [4]:
def data_extract(data):
  '''Collapses the token-level table into per-sentence strings.

  Rows are grouped by the "sent_file" column. Within every group, the values
  of the 'token' column are converted with str and space-joined into one
  sentence string; the values of the 'semtag' column are handled the same way
  to form one tag string.

  Returns a pair (sentences, semtags) of lists, filled in the order in which
  groupby yields the groups.

  NOTE(review): because values are space-joined here and split on whitespace
  again by the caller, a token that itself contains whitespace would not
  survive the round trip -- confirm the data has none.
  '''
  sentences, semtags = [], []
  # Route each column of interest to the list that collects it.
  collectors = {'token': sentences, 'semtag': semtags}
  for _, group in data.groupby("sent_file"):
    for column_name, values in group.items():
      target = collectors.get(column_name)
      if target is not None:
        target.append(' '.join(str(value) for value in values))
  return sentences, semtags

def combine(sent, tags):
  '''Pairs each word of a sentence with its tag.

  Both arguments are whitespace-separated strings. They are split and zipped
  position by position, so the result is a list of (word, tag) tuples whose
  length is that of the shorter of the two splits.
  '''
  return list(zip(sent.split(), tags.split()))

# Turn the per-sentence token/tag strings into one list of (token, semtag)
# pairs per sentence.
sents, semtags = data_extract(pmb_data)
data = [
  combine(sentence, tag_string)
  for sentence, tag_string in zip(sents, semtags)
]

In [7]:
'''Splitting dataset into 80% train and 20% test'''
# The split is made over the elements of `data`, i.e. over whole sentences
# (each element is one sentence's list of (token, semtag) pairs), not over
# individual tokens. random_state is pinned so the same split is obtained on
# every run.
train_data, test_data = train_test_split(data, test_size=0.20, random_state=1234)

In [8]:
'''Setting up TnT tagger and training the train dataset'''
# NOTE(review): the tagger is built with default arguments, so no
# unknown-word tagger is supplied; per the NLTK docs, words unseen in training
# are then tagged 'Unk' -- confirm this is the intended baseline.
tnt_tagger = tnt.TnT()
# Trains on the training sentences only; test_data is not passed here.
tnt_tagger.train(train_data)

In [9]:
# Accuracy on the held-out sentences, as computed by the tagger's built-in
# accuracy() method; shown rounded to four decimals.
acc = tnt_tagger.accuracy(test_data)
print(f'Accuracy score on the test set: {round(acc, 4)}')

Accuracy score on the test set: 0.8972


In [10]:
def predict(test_set, tagger=None):
  '''Tags every sentence of a gold-annotated set and returns the predictions.

  Args:
    test_set: iterable of sentences, each a sequence of (word, gold_tag)
      pairs. Only the word (index 0) of every pair is used.
    tagger: object exposing ``tag(list_of_words)``. When omitted, the
      notebook-level ``tnt_tagger`` is used, so existing ``predict(test_data)``
      calls keep working.

  Returns:
    A list with one entry per input sentence, each entry being whatever
    ``tagger.tag`` returns for that sentence's word list.
  '''
  if tagger is None:
    # Looked up at call time so the tagger only has to exist once predict()
    # is actually invoked.
    tagger = tnt_tagger
  # Hand the words to the tagger as they are. Joining them with spaces and
  # splitting again would re-tokenize any word containing whitespace and
  # leave the predictions misaligned with the gold pairs.
  return [tagger.tag([pair[0] for pair in sent]) for sent in test_set]

In [11]:
# Tag every held-out sentence; predict() appends one tagged sentence per
# input sentence, so `predictions` lines up index-by-index with `test_data`.
predictions = predict(test_data)

In [12]:
# Flatten gold and predicted labels into two separate token-level lists,
# pairing each test sentence with its prediction and each gold pair with the
# predicted pair at the same position.
label_pred = []
label_gold = []
for gold_sent, pred_sent in zip(test_data, predictions):
  aligned = list(zip(gold_sent, pred_sent))
  label_gold.extend(gold_pair[1] for gold_pair, _ in aligned)
  label_pred.extend(pred_pair[1] for _, pred_pair in aligned)

In [13]:
# Per-tag precision / recall / F1 with four decimals. zero_division=1 makes a
# metric whose denominator is zero show up as 1.0 instead of emitting a
# warning.
print(
  classification_report(
    label_gold,
    label_pred,
    digits=4,
    zero_division=1,
  )
)

              precision    recall  f1-score   support

         ALT     0.9773    0.9149    0.9451        47
         AND     0.9865    0.6952    0.8156       105
         APX     1.0000    1.0000    1.0000        18
         ART     1.0000    0.1818    0.3077        11
         BOT     1.0000    1.0000    1.0000         2
         BUT     1.0000    0.8421    0.9143        19
         CLO     0.9615    0.6410    0.7692        39
         COL     1.0000    1.0000    1.0000        27
         CON     0.9763    0.8154    0.8886      1766
         COO     0.8077    0.8077    0.8077        26
         CTC     1.0000    0.2500    0.4000         4
         DEF     0.9150    0.9819    0.9473      1601
         DEG     0.8333    0.8333    0.8333        60
         DIS     0.9596    0.8480    0.9003       980
         DOM     0.4000    0.4000    0.4000        10
         DOW     1.0000    1.0000    1.0000         5
         DST     0.9623    1.0000    0.9808        51
         EFS     1.0000    