# 1. Introduction

This notebook contains the steps to train a CRF model with PyCRFSuite for the NER task, to predict with it, and to evaluate it using cross-validation. The files you will need for this Colab are "PETv1.1-entities.jsonl", "complete_combined_leschneiderdata_NER.jsonl" and "glove.6B.300d.txt". The GloVe pre-trained model can be downloaded from the link in "Pre-Trained NER Models Links".

In [None]:
#@title 1.1. Importing and Installing libraries
%%capture
import os
# Define the path to the flag file, we do this so you can rerun the whole colab on different examples without having to wait 4 minutes each time.
flag_file_installations = '/content/installed_flag'

# Check if the flag file exists
if not os.path.exists(flag_file_installations):

  !pip install python-crfsuite
  !python -m spacy download en_core_web_md

  import json
  import pandas as pd
  import copy
  import numpy as np
  from itertools import product
  import spacy
  import json
  from itertools import product
  from sklearn.preprocessing import LabelEncoder
  from google.colab import drive
  import nltk
  from nltk import pos_tag
  from nltk.tokenize import word_tokenize
  import typing
  import pycrfsuite  # For CRF training
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import f1_score, classification_report
  from sklearn.model_selection import KFold
  from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score

  import random
  SEED = 42

  nltk.download('averaged_perceptron_tagger')
  drive.mount('/content/drive')
  # Create the flag file
  with open(flag_file_installations, 'w') as f:
      f.write('Installed')
else:
    print("Packages already installed. Skipping installations.")

In [None]:
#@title Loading Data
def load_and_group_ner_data(file_path):
    """Load a JSONL file of NER sentences and group them by document.

    Args:
        file_path: Path to a JSON Lines file. Every non-blank line must be a JSON
            object containing at least the keys 'document name' and 'sentence-ID'.

    Returns:
        A list with one inner list per document (in order of first appearance in
        the file); each inner list holds that document's entries sorted by
        'sentence-ID'.

    Raises:
        FileNotFoundError: if file_path does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry lacks 'document name' or 'sentence-ID'.
    """
    grouped_data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) are not
            # records; skipping them avoids a JSONDecodeError on json.loads('').
            if not line.strip():
                continue
            entry = json.loads(line)
            grouped_data.setdefault(entry['document name'], []).append(entry)

    # Sort each group by 'sentence-ID'
    for doc in grouped_data.values():
        doc.sort(key=lambda x: x['sentence-ID'])

    return list(grouped_data.values())
try:
  # Locations of the two JSONL datasets on the mounted Google Drive
  PET_Folder = '/content/drive/MyDrive/THESIS/DATA/PET/actual PET data from Patrizio Bellan/PETv1.1-entities.jsonl'
  LESCHNEIDER_Folder = '/content/drive/MyDrive/THESIS/DATA/LESCHNEIDER DATA/Documents/FORMATTED_DATA_ELEMENTS/complete_combined_leschneiderdata_NER.jsonl'

  # Load and group data from both files calling on a function above
  grouped_data_1 = load_and_group_ner_data(PET_Folder)
  grouped_data_2 = load_and_group_ner_data(LESCHNEIDER_Folder)

  # Append the contents of the second list to the first
  unflattened_data = grouped_data_1 + grouped_data_2 #Combined data

  # The loader returns one list per document; flatten each dataset into a plain
  # list of sentence entries.
  PET_data = [item for group in grouped_data_1 for item in group]

  LESCHNEIDER_data = [item for group in grouped_data_2 for item in group]

  # Same flattening for the combined data: combined_data contains all the sentences.
  combined_data = [item for group in unflattened_data for item in group]

  if combined_data or PET_data or LESCHNEIDER_data:
    print("Data Loaded")

except FileNotFoundError as fnf:
  print("File was not found or incorrect file directory, please try to run cell again")
  # Name the path that could not be opened so it is clear which dataset is missing.
  print(f"Missing file: {fnf.filename}")



Data Loaded


# 2. Pre-Processing of Input Data

In [None]:
#@title pre-processing functions
# Load GloVe model
def load_glove_model(glove_file_path):
    """Read a GloVe text file into a ``{token: [float, ...]}`` dictionary.

    Every line is split on whitespace: the first field is used as the token and
    all remaining fields are parsed as floats to form its vector.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf8') as glove_file:
        for raw_line in glove_file:
            fields = raw_line.split()
            token = fields[0]
            embeddings[token] = [float(component) for component in fields[1:]]
    return embeddings

def word2features(sent, i, glove_model):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of dicts, each with the keys 'token' and 'pos_tag'.
        i: Index of the token in ``sent`` to featurise.
        glove_model: Mapping from word to embedding vector (as returned by
            load_glove_model).

    Returns:
        A dict with lexical/POS features of the token, one 'glove_<k>' feature per
        embedding dimension, the same lexical/POS features for the previous and
        next token when they exist, and 'BOS' / 'EOS' markers at sentence edges.
    """
    # Extract the current word and its POS tag
    word = sent[i]['token']
    postag = sent[i]['pos_tag']

    features = {
        'bias': 1.0,  # acts as an intercept term
        'word.lower()': word.lower(),  # Lowercase form of the word
        'word[-3:]': word[-3:],  # Last three characters of the word (suffix)
        'word[-2:]': word[-2:],  # Last two characters of the word (suffix)
        'word.isupper()': word.isupper(),  # Is the word in uppercase?
        'word.istitle()': word.istitle(),  # is the word title-cased?
        'word.isdigit()': word.isdigit(),  # is the word a digit?
        'postag': postag,  # Full POS tag of the word
        'postag[:2]': postag[:2],  # First two characters of the POS tag
    }

    # GloVe lookup: try the token as written, then its lowercase form. The
    # glove.6B vectors are uncased, so without the fallback every capitalised
    # token (sentence-initial words, names) would get an all-zero vector.
    if word in glove_model:
        glove_embedding = glove_model[word]
    elif word.lower() in glove_model:
        glove_embedding = glove_model[word.lower()]
    else:
        # Out-of-vocabulary: zero vector with the model's own dimensionality, so
        # the feature set stays consistent for any GloVe file (300 if the model
        # is empty, matching the previous hard-coded size).
        sample_vector = next(iter(glove_model.values()), None)
        dimension = len(sample_vector) if sample_vector is not None else 300
        glove_embedding = [0.0] * dimension

    # Add each dimension of the GloVe embedding as a feature
    for idx, val in enumerate(glove_embedding):
        features[f'glove_{idx}'] = val

    # Add features for the previous word (if it exists)
    if i > 0:
        word1 = sent[i-1]['token']
        postag1 = sent[i-1]['pos_tag']
        features.update({
            '-1:word.lower()': word1.lower(),  # Lowercase form of the previous word
            '-1:word.istitle()': word1.istitle(),  # is the previous word title-cased?
            '-1:word.isupper()': word1.isupper(),  # is the previous word in uppercase?
            '-1:postag': postag1,  # Full POS tag of the previous word
            '-1:postag[:2]': postag1[:2],  # First two characters of the previous word's POS tag
        })
    else:
        features['BOS'] = True  # Indicate beginning of sentence

    # Add features for the next word (if it exists)
    if i < len(sent) - 1:
        word1 = sent[i+1]['token']
        postag1 = sent[i+1]['pos_tag']
        features.update({
            '+1:word.lower()': word1.lower(),  # Lowercase form of the next word
            '+1:word.istitle()': word1.istitle(),  # Is the next word title-cased?
            '+1:word.isupper()': word1.isupper(),  # Is the next word in uppercase?
            '+1:postag': postag1,  # Full POS tag of the next word
            '+1:postag[:2]': postag1[:2],  # First two characters of the next word's POS tag
        })
    else:
        features['EOS'] = True  # Indicate end of sentence

    return features

# Extract features for CRF
def sent2features(sent):
    """Return the list of word2features() dicts for every position in ``sent``.

    Uses the module-level ``glove_model`` for the embedding lookup.
    """
    features_per_token = []
    for position in range(len(sent)):
        features_per_token.append(word2features(sent, position, glove_model))
    return features_per_token

# Modify preprocess_data function to include word2features
# Function to prepare data
def prepare_data(input_data):
    """Turn dataset entries into per-token dicts and parallel label sequences.

    Args:
        input_data: Iterable of entries, each with the keys 'tokens' (list of
            token strings) and 'ner-tags' (list of NER labels).

    Returns:
        A tuple ``(sentences, labels)``: ``sentences`` holds, per entry, a list of
        ``{'token': ..., 'pos_tag': ...}`` dicts; ``labels`` holds the entry's
        'ner-tags' list unchanged.
    """
    sentences = []
    labels = []

    for entry in input_data:
        tokens = entry['tokens']
        ner_tags = entry['ner-tags']

        # Tag the whole sentence in one call. Tagging every token as its own
        # one-word "sentence" hides the surrounding words from the tagger and
        # costs one tagger call per token.
        tagged_tokens = pos_tag(list(tokens)) if tokens else []

        # zip with ner_tags keeps the earlier behaviour of stopping at the
        # shorter of tokens / ner_tags.
        sentence = [
            {'token': token, 'pos_tag': pos}
            for token, (_, pos), _ in zip(tokens, tagged_tokens, ner_tags)
        ]

        sentences.append(sentence)
        labels.append(ner_tags)

    return sentences, labels

def get_sorted_labels_from_flat(all_y_test_flat):
    """Return the evaluation label order: sorted B-/I- pairs, then 'O'.

    Args:
        all_y_test_flat: Flat iterable of label strings.

    Returns:
        A list ``[B-x, I-x, B-y, I-y, ..., 'O']`` where the pairs are ordered
        alphabetically by their B- label and 'O' is appended only if present.
        A B- label is included only when its matching I- label also occurs;
        any other label is omitted.
    """
    unique_labels = set(all_y_test_flat)

    pairs = []
    for label in unique_labels:
        if label.startswith("B-"):
            # Swap only the prefix. str.replace("B-", "I-") rewrites every "B-" in
            # the label, so e.g. "B-SUB-Task" became "I-SUI-Task" and never paired.
            corresponding_i_label = "I-" + label[2:]
            if corresponding_i_label in unique_labels:
                pairs.append((label, corresponding_i_label))

    # Sort pairs alphabetically based on the B label
    pairs.sort(key=lambda pair: pair[0])

    # Flatten the sorted pairs and put 'O' last
    sorted_labels = [label for pair in pairs for label in pair]
    if 'O' in unique_labels:
        sorted_labels.append('O')

    return sorted_labels


In [None]:
#@title pre-processing and splitting data
# Path to the pre-trained GloVe vectors file (glove.6B.300d.txt) on the mounted Google Drive
glove_path = '/content/drive/MyDrive/THESIS/CODING/NAM_TESTING/TESTING DATA/RE_TRAINING_DATA/GloVe pre-trained/glove.6B.300d.txt'

# Load the vectors into the module-level `glove_model`; sent2features() reads this
# global, so this cell must run before any feature extraction.
glove_model = load_glove_model(glove_path)



In [None]:
#@title CrossValidation for CRF
def crossvalidation_CRF(sentences, labels, SEED):
  """Run 5-fold cross-validation of a CRF tagger and print/return the metrics.

  For every fold a pycrfsuite model is trained on the training sentences, written
  to 'crf.model' in the working directory (overwritten each fold), reloaded and
  used to tag the held-out sentences. Predictions of all folds are pooled for the
  overall scores.

  Args:
      sentences: List of sentences as produced by prepare_data (lists of
          {'token', 'pos_tag'} dicts).
      labels: List of label sequences parallel to ``sentences``.
      SEED: random_state passed to the shuffled KFold split.

  Returns:
      Tuple ``(overall_micro_f1, overall_macro_f1, f1_scores, report)`` where
      ``f1_scores`` is the list of per-fold micro F1 scores and ``report`` is the
      sklearn classification report (string) over the pooled predictions.
  """
  kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
  f1_scores = []
  all_y_test_flat = []
  all_y_pred_flat = []

  for train_index, test_index in kf.split(sentences):
      X_train, X_test = [sentences[i] for i in train_index], [sentences[i] for i in test_index]
      y_train, y_test = [labels[i] for i in train_index], [labels[i] for i in test_index]

      # Prepare feature sets
      X_train_feats = [sent2features(s) for s in X_train]
      X_test_feats = [sent2features(s) for s in X_test]

      # Train the CRF model
      trainer = pycrfsuite.Trainer(verbose=False)
      for xseq, yseq in zip(X_train_feats, y_train):
          trainer.append(xseq, yseq)

      # Set parameters for the CRF
      trainer.set_params({
          'c1': 1.0,   # coefficient for L1 penalty
          'c2': 1e-3,  # coefficient for L2 penalty
          'max_iterations': 75,  # stop earlier
          'feature.possible_transitions': True
      })

      # Train the model
      trainer.train('crf.model')

      # Load the trained model, predict on the test set, and always release the
      # model afterwards: the next fold overwrites 'crf.model'.
      tagger = pycrfsuite.Tagger()
      tagger.open('crf.model')
      try:
          y_pred = [tagger.tag(xseq) for xseq in X_test_feats]
      finally:
          tagger.close()

      # Flatten the predictions and true labels
      y_test_flat = [y for y_seq in y_test for y in y_seq]
      y_pred_flat = [y for y_seq in y_pred for y in y_seq]

      # Store results
      all_y_test_flat.extend(y_test_flat)
      all_y_pred_flat.extend(y_pred_flat)

      # Calculate the micro F1 score for this fold and store it
      micro_f1 = f1_score(y_test_flat, y_pred_flat, average='micro')
      f1_scores.append(micro_f1)

  # Label order used for all overall metrics; the label sequences are flattened
  # before they go into the sorting function.
  labels_flat = get_sorted_labels_from_flat([item for group in labels for item in group])

  print('these are the labels_flat:',labels_flat)
  overall_micro_f1 = f1_score(all_y_test_flat, all_y_pred_flat, average='micro', labels=labels_flat)
  overall_macro_f1 = f1_score(all_y_test_flat, all_y_pred_flat, average='macro', labels=labels_flat)
  print(f"Overall Micro F1 Score: {overall_micro_f1:.4f}")

  # Calculate overall precision, recall, and F1 score
  precision, recall, f1, _ = precision_recall_fscore_support(all_y_test_flat, all_y_pred_flat, average='micro', labels= labels_flat)
  print(f"Overall Precision: {precision:.4f}")
  print(f"Overall Recall: {recall:.4f}")
  print(f"Overall F1 Score: {f1:.4f}")

  # Print the classification report for the overall results
  report = classification_report(all_y_test_flat, all_y_pred_flat, labels=labels_flat)
  print(report)

  return overall_micro_f1, overall_macro_f1, f1_scores, report


In [None]:
#@title 1. Crossvalidation on PET Baseline

# Convert the PET entries into per-token dicts (token + POS tag) and their NER label sequences
sentences_PET, labels_PET = prepare_data(PET_data)

# 5-fold cross-validation on PET only; returns the overall micro/macro F1, the
# per-fold micro F1 scores and the classification report (also printed by the function)
overall_micro_f1_PET, overall_macro_f1_PET, f1_scores_PET, report_PET = crossvalidation_CRF(sentences_PET, labels_PET, SEED)

these are the labels_flat: ['B-AND Gateway', 'I-AND Gateway', 'B-Activity', 'I-Activity', 'B-Activity Data', 'I-Activity Data', 'B-Actor', 'I-Actor', 'B-Condition Specification', 'I-Condition Specification', 'B-Further Specification', 'I-Further Specification', 'B-XOR Gateway', 'I-XOR Gateway', 'O']
Overall Micro F1 Score: 0.7237
Overall Precision: 0.7237
Overall Recall: 0.7237
Overall F1 Score: 0.7237


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

            B-AND Gateway       0.00      0.00      0.00         8
            I-AND Gateway       0.00      0.00      0.00         9
               B-Activity       0.81      0.76      0.78       502
               I-Activity       0.53      0.33      0.41        49
          B-Activity Data       0.76      0.71      0.73       459
          I-Activity Data       0.69      0.70      0.69      1158
                  B-Actor       0.81      0.78      0.80       449
                  I-Actor       0.77      0.77      0.77       598
B-Condition Specification       0.87      0.65      0.74        80
I-Condition Specification       0.74      0.59      0.65       403
  B-Further Specification       0.42      0.23      0.30        64
  I-Further Specification       0.27      0.19      0.22       268
            B-XOR Gateway       0.83      0.74      0.78       117
            I-XOR Gateway       0.80      0.39      0.52     

In [None]:
#@title 2. Training on PET, Testing on LESCHNEIDER

# Cross-dataset evaluation: train on all PET sentences, test on all LESCHNEIDER sentences.
# PET is prepared again under separate names so this cell does not depend on the
# variables of the cross-validation cell.
sentences_PET2, labels_PET2 = prepare_data(PET_data) # for clarity
sentences_LESCHNEIDER, labels_LESCHNEIDER = prepare_data(LESCHNEIDER_data)

X_train, y_train = sentences_PET2, labels_PET2
X_test, y_test = sentences_LESCHNEIDER, labels_LESCHNEIDER

# Prepare feature sets (one feature dict per token, via sent2features)
X_train_feats = [sent2features(s) for s in X_train]
X_test_feats = [sent2features(s) for s in X_test]

# Feed every (features, labels) sequence pair to the CRF trainer
trainer2 = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train_feats, y_train):
    trainer2.append(xseq, yseq)

# Set parameters for the CRF
# NOTE(review): crossvalidation_CRF uses max_iterations=75 while this run uses 50 —
# confirm the difference is intended when comparing the scores.
trainer2.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier, we tested several
    'feature.possible_transitions': True
})

# Train the model and write it to 'crf.model2' in the working directory
trainer2.train('crf.model2')

# Load the trained model
tagger = pycrfsuite.Tagger()
tagger.open('crf.model2')

# Predict on the test set
y_pred = [tagger.tag(xseq) for xseq in X_test_feats]

# Flatten the predictions and true labels
y_test_flat = [y for y_seq in y_test for y in y_seq]
y_pred_flat = [y for y_seq in y_pred for y in y_seq]

# Label order for the metrics, built from every label that occurs in either the
# gold or the predicted sequences
labels_flat = list(get_sorted_labels_from_flat(y_test_flat+y_pred_flat))

# Micro and macro F1 restricted to (and ordered by) labels_flat
micro_f1 = f1_score(y_test_flat, y_pred_flat, average='micro', labels= labels_flat)
macro_f1 = f1_score(y_test_flat, y_pred_flat, average='macro', labels= labels_flat)
#weighted_f1 = f1_score(y_test_flat, y_pred_flat, average='weighted', labels= labels_flat)

print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
#print(f"Weighted F1 Score: {weighted_f1:.4f}")

# Per-label precision / recall / F1 / support table
report = classification_report(y_test_flat, y_pred_flat, labels= labels_flat)
print(report)

Micro F1 Score: 0.6414
Macro F1 Score: 0.4921
                           precision    recall  f1-score   support

            B-AND Gateway       0.00      0.00      0.00        32
            I-AND Gateway       0.00      0.00      0.00        77
               B-Activity       0.80      0.58      0.67       111
               I-Activity       1.00      0.11      0.20         9
          B-Activity Data       0.79      0.52      0.63       109
          I-Activity Data       0.71      0.56      0.63       184
                  B-Actor       0.95      0.63      0.76        93
                  I-Actor       0.97      0.66      0.79       101
B-Condition Specification       0.88      0.78      0.82        18
I-Condition Specification       0.95      0.86      0.90        65
  B-Further Specification       0.22      0.09      0.13        22
  I-Further Specification       0.28      0.24      0.26        68
            B-XOR Gateway       1.00      0.84      0.91        19
            I-X

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#@title 3. Crossvalidation on Combined Dataset

# Convert the combined (PET + LESCHNEIDER) entries into per-token dicts and NER label sequences
sentences_combined, labels_combined = prepare_data(combined_data)

# 5-fold cross-validation on the combined dataset; returns the overall micro/macro F1,
# the per-fold micro F1 scores and the classification report (also printed by the function)
overall_micro_f1_combined, overall_macro_f1_combined, f1_scores_combined, report_combined = crossvalidation_CRF(sentences_combined, labels_combined, SEED)

these are the labels_flat: ['B-AND Gateway', 'I-AND Gateway', 'B-Activity', 'I-Activity', 'B-Activity Data', 'I-Activity Data', 'B-Actor', 'I-Actor', 'B-Condition Specification', 'I-Condition Specification', 'B-Further Specification', 'I-Further Specification', 'B-XOR Gateway', 'I-XOR Gateway', 'O']
Overall Micro F1 Score: 0.7283
                           precision    recall  f1-score   support

            B-AND Gateway       0.50      0.15      0.23        40
            I-AND Gateway       0.07      0.06      0.07        86
               B-Activity       0.80      0.77      0.78       613
               I-Activity       0.67      0.38      0.48        58
          B-Activity Data       0.75      0.70      0.72       568
          I-Activity Data       0.70      0.71      0.70      1342
                  B-Actor       0.82      0.81      0.81       542
                  I-Actor       0.78      0.78      0.78       699
B-Condition Specification       0.88      0.69      0.78        