In [None]:
!pip install transformers
import transformers
import pandas as pd
import numpy as np
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.cuda

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from sklearn import preprocessing
from collections import defaultdict
from google.colab import drive

# Mount Google Drive and load the raw Maltese UniMorph dump.
drive.mount('/content/drive')

# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines="skip"` is its
# documented replacement (skip malformed rows instead of raising).
# A separator longer than one character is parsed as a regular expression and
# is only supported by the python engine, so the engine is named explicitly.
dataset_mlt = pd.read_csv(
    '/content/drive/MyDrive/Maltese_Unimorph_database_final.txt',
    sep="  ",
    header=None,
    engine="python",
    on_bad_lines="skip",
)

# Work on a copy so the loaded frame stays untouched, then convert it to a
# plain list of rows (each row is a list of cell values).
dataset_copy_mlt=dataset_mlt.copy()
list_data_mlt = dataset_copy_mlt.values.tolist()
print(list_data_mlt[0:10])


In [None]:
# Accumulator filled by `split` with one list of tab-separated fields per cell.
split_data_mlt=[]

def split(corpus, spit_data):
  """Split every string cell of `corpus` on tabs and append the parts to `spit_data`.

  Args:
    corpus: iterable of rows, each row an iterable of cell values.
    spit_data: list that receives one list of fields per string cell
      (mutated in place).

  Returns:
    None; results are appended to `spit_data`.
  """
  for sets in corpus:
    for x in sets:
      # Cells that are not strings (e.g. the float NaN pandas uses to fill
      # short rows) have no `.split` and carry no text, so skip them.
      if not isinstance(x, str):
        continue
      spit_data.append(x.split("\t"))

# Break every loaded cell into its tab-separated fields, then preview the result.
split(corpus=list_data_mlt, spit_data=split_data_mlt)
print(split_data_mlt[:100])


In [None]:
# removing duplicates
def remove_duplicates(data_array):
    """Keep only the first row for each distinct value found at index 1.

    Args:
        data_array: iterable of indexable rows with at least two elements.

    Returns:
        A new list holding the retained rows in their original order.
    """
    seen_keys = set()
    kept_rows = []
    for row in data_array:
        key = row[1]
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept_rows.append(row)
    return kept_rows

# Drop rows whose second field was already seen, then preview the survivors.
unique_data_mlt = remove_duplicates(split_data_mlt)
print(unique_data_mlt[:100])

# getting the longest word for padding
# (length of the longest second field; 0 when there are no rows)
len_longest = max((len(entry[1]) for entry in unique_data_mlt), default=0)

In [None]:
# splitting into words and grammar
def split_word_and_grammar(unique_data_array):
    """Separate each row into its first two fields and its last field.

    Args:
        unique_data_array: iterable of indexable rows with at least two elements.

    Returns:
        A pair `(word_list, last_elements)`: `word_list` holds a
        `(row[0], row[1])` tuple per row and `last_elements` holds `row[-1]`
        per row, both in input order.
    """
    word_list = []
    last_elements = []
    for row in unique_data_array:
        last_elements.append(row[-1])
        word_list.append((row[0], row[1]))
    return word_list, last_elements

# Separate the (first field, second field) pairs from the trailing grammar field.
word_list_mlt, last_elements_mlt = split_word_and_grammar(unique_data_array=unique_data_mlt)
print(last_elements_mlt[:100])

# splitting grammar
def splitting_grammar(last_elements):
    """Split each ';'-separated grammar string into a list of labels.

    Labels made up only of decimal digits are converted to `int`; every other
    label is kept as a string.

    Args:
        last_elements: iterable of ';'-separated strings.

    Returns:
        A list with one list of labels per input string.
    """
    list_of_lists = []
    for info in last_elements:
        parts = info.split(';')
        # `str.isdigit()` is also true for characters such as '²' that
        # `int()` rejects with ValueError; `isdecimal()` matches exactly the
        # characters `int()` can parse.
        num_parts = [int(part) if part.isdecimal() else part for part in parts]
        list_of_lists.append(num_parts)
    return list_of_lists

list_of_lists_mlt = splitting_grammar(last_elements_mlt)


# adding to lists in alt order
# One entry per row of `word_list_mlt`: the word tuple is rebuilt from its
# first element plus the remainder, and the grammar list with the same index
# is carried over unchanged.
split_data = [(pair[0],) + pair[1:] for pair in word_list_mlt]
list_of_lists = [list_of_lists_mlt[idx] for idx in range(len(word_list_mlt))]

print(split_data[:100])
print(list_of_lists[:100])

In [None]:
# splitting data
# NOTE(review): `char_level=True` is forwarded to the tokenizer constructor;
# confirm the mBERTu tokenizer actually honours it.
tokeniser = AutoTokenizer.from_pretrained("MLRS/mBERTu",char_level=True) #,char_level=True
# NOTE(review): `model` is reassigned further down when the classifier is
# built, so this instance appears to be unused — confirm before removing.
model = AutoModelForMaskedLM.from_pretrained("MLRS/mBERTu")

#tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-cased",char_level=True)
#model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")

from sklearn.model_selection import train_test_split
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Fixed seed so the train/validation/test partition is identical on every
# Restart & Run All; without it each run shuffles differently.
SEED = 42

X = split_data
y = list_of_lists

# 80% of the rows go to training. The remaining 20% is divided again with
# test_size=0.2, i.e. 16% of all rows for validation and 4% for testing.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.8, random_state=SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.2, random_state=SEED)
print(X_train[0:100])
print(y_train[0:100])
print(len(X_train))

In [None]:
# tokenize the training set
# Each element of X_train is a 2-tuple, which the tokenizer encodes as a text pair.
token_indexes = tokeniser(X_train, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False, max_length=512)
# No `.squeeze()`: the tensors are already (examples, tokens). Squeezing is a
# no-op for two or more rows and would drop the batch axis for a single row,
# which breaks the batch slicing done later.
indexed_train_x = token_indexes['input_ids'].to(device)
mask_train_x = token_indexes['attention_mask'].to(device)

# tokenize the test set
# The first element of every test tuple is replaced with an empty string.
# NOTE(review): training pairs keep their first element, so train and test
# inputs are built differently — confirm this is intended.
new_tuples = [(("",) + t[1:]) for t in X_test]
token_indexes_test = tokeniser(new_tuples, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False, max_length=512)
indexed_test_x = token_indexes_test['input_ids'].to(device)
mask_test_x = token_indexes_test['attention_mask'].to(device)

# padding with '<PAD>' so gramm info is all the same length
def padding_grammar(Y, max_len=None):
    """Right-pad every label list in `Y` with '<PAD>' to a common length.

    Args:
        Y: sequence of label lists.
        max_len: target length. When None (the default) the length of the
            longest list in `Y` is used. Pass an explicit value to give
            several data splits the same width.

    Returns:
        A new list of new lists, each of length `max_len`. An empty `Y`
        yields an empty list.

    Raises:
        ValueError: if `max_len` is given and some list is longer than it.
    """
    if max_len is None:
        # `default=0` keeps an empty input from raising inside max().
        max_len = max((len(info) for info in Y), default=0)
    padded_array = []
    for info in Y:
        missing = max_len - len(info)
        if missing < 0:
            raise ValueError(
                "label list of length %d exceeds max_len=%d" % (len(info), max_len)
            )
        padded_array.append(list(info) + ['<PAD>'] * missing)
    return padded_array
# Pad train and test in one call so both splits end up with the same width;
# padding them separately uses each split's own maximum, which can differ.
_padded_all = padding_grammar(y_train + y_test)
padded_morph_info_train = _padded_all[:len(y_train)]
padded_morph_info_test = _padded_all[len(y_train):]
print(padded_morph_info_train[0:10])

all_morph_info = padded_morph_info_train + padded_morph_info_test

# Creating a dictionary for mapping labels to integer indexes.
# A plain dict is used so that looking up an unknown label raises KeyError
# instead of silently adding a new index.
morph_vocab = {'<PAD>': 0}  # Assigning index 0 to the padding label

for info in all_morph_info:
    # Walk the labels in their stored order. Going through a set would make
    # the assigned indexes depend on string hashing, which changes between
    # interpreter sessions.
    for value in info:
        if value not in morph_vocab:
            morph_vocab[value] = len(morph_vocab)

print(morph_vocab)


In [None]:
# map the morphological information to indexes using the vocabulary
def get_targets_one_hot(padded_info):
  """Encode padded label lists as a float32 one-hot tensor on `device`.

  Args:
    padded_info: list of equal-length label lists whose labels are all keys
      of `morph_vocab`.

  Returns:
    A float32 tensor with one extra trailing axis of size `len(morph_vocab)`
    holding the one-hot encoding of each label index.

  Raises:
    KeyError: if a label is missing from `morph_vocab`. The check is explicit
      because a defaultdict-based vocabulary would otherwise grow silently.
  """
  unknown = {value for info in padded_info for value in info if value not in morph_vocab}
  if unknown:
    raise KeyError("labels missing from morph_vocab: %r" % sorted(unknown, key=repr))

  indexes = [[morph_vocab[value] for value in info] for info in padded_info]
  index_tensor = torch.tensor(indexes, dtype=torch.long, device=device)

  # one-hot encode the targets
  # Targets are constants of the optimisation, so they are not marked as
  # requiring gradients.
  num_classes = len(morph_vocab)
  return torch.nn.functional.one_hot(index_tensor, num_classes=num_classes).to(torch.float32)

# One-hot targets for both splits, followed by a preview of targets and inputs.
targets_train, targets_test = (
    get_targets_one_hot(padded)
    for padded in (padded_morph_info_train, padded_morph_info_test)
)

print(targets_train[:100])
print(indexed_train_x[:100])

In [None]:
class Model(torch.nn.Module):
    """Feed-forward tagging head on top of a BERT encoder.

    The vector at sequence position 0 of one encoder hidden state is passed
    through two Linear -> BatchNorm -> ReLU -> Dropout stages and projected to
    `num_slots * num_labels` scores, reshaped to (batch, num_slots, num_labels).

    Args:
        bert: encoder module. It is called as
            `bert(x, attention_mask=mask, output_hidden_states=True)` and must
            return an object with a `hidden_states` sequence.
        num_labels: number of label classes per slot. Defaults to
            `len(morph_vocab)` read once at construction time.
        num_slots: number of label positions predicted per example (default 10).
        hidden_layer: index into `hidden_states` used as the feature source
            (default 8).
    """

    def __init__(self, bert, num_labels=None, num_slots=10, hidden_layer=8):
        super().__init__()
        self.bert = bert
        # Freeze the output geometry now: re-reading len(morph_vocab) on each
        # forward pass would break `.view` if the vocabulary ever changed size.
        self.num_labels = len(morph_vocab) if num_labels is None else num_labels
        self.num_slots = num_slots
        self.hidden_layer = hidden_layer
        # Take the encoder width from its config when available; 768 is the
        # previously hard-coded value and remains the fallback.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.output_layer1 = torch.nn.Linear(hidden_size, 512)
        self.bn1 = torch.nn.BatchNorm1d(512)  # Add batch normalization for faster training and better generalization
        self.drop1 = torch.nn.Dropout(p=0.3)
        self.output_layer2 = torch.nn.Linear(512, 256)  # Add another fully connected layer for improved feature extraction
        self.bn2 = torch.nn.BatchNorm1d(256)
        self.drop2 = torch.nn.Dropout(p=0.1)
        self.output_layer3 = torch.nn.Linear(256, self.num_slots * self.num_labels)

    def forward(self, x, mask):
        """Return raw scores of shape (batch, num_slots, num_labels).

        Args:
            x: token-id tensor passed to the encoder.
            mask: attention mask passed to the encoder.

        The scores are unnormalised logits: the notebook trains with
        `torch.nn.CrossEntropyLoss`, which applies log-softmax itself, so a
        sigmoid here would squash its input into (0, 1). `argmax` over the
        last axis is unaffected because sigmoid is monotonic.
        """
        # Mixed precision only when the input is on a CUDA device; enabling it
        # unconditionally emits a warning on CPU.
        with torch.cuda.amp.autocast(enabled=x.is_cuda):
            encoded = self.bert(x, attention_mask=mask, output_hidden_states=True)
            vecs = encoded.hidden_states[self.hidden_layer][:, 0, :]
            output = self.output_layer1(vecs)
            output = self.bn1(output)
            output = torch.relu(output)
            output = self.drop1(output)
            output = self.output_layer2(output)
            output = self.bn2(output)
            output = torch.relu(output)
            output = self.drop2(output)
            output = self.output_layer3(output).view(-1, self.num_slots, self.num_labels)
        return output

# Load the pretrained encoder, wrap it in the tagging head and move the whole
# module to the selected device. The bare `model` at the end keeps the module
# summary as the cell output.
bert = transformers.BertForMaskedLM.from_pretrained('MLRS/mBERTu')
#bert = transformers.BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model = Model(bert)
model = model.to(device)
model


In [None]:
def grammatical_accuracy(output, target):
    """Percentage of positions whose highest-scoring class matches the target.

    Args:
        output: tensor of scores whose last axis is the class axis.
        target: tensor of the same shape holding one-hot (or score) targets.

    Returns:
        The share of matching positions, multiplied by 100.
    """
    output = output.detach().cpu().numpy()
    target = target.detach().cpu().numpy()

    # Reduce over the LAST axis. For 2-D input this equals axis=1; for
    # (batch, slot, class) input axis=1 would pick the best slot per class
    # instead of the best class per slot.
    output = np.argmax(output, axis=-1)
    target = np.argmax(target, axis=-1)
    accuracy = np.mean(output == target) * 100
    return accuracy

# History of every accuracy value returned by `get_acc` for non-empty input.
accuracy_count=[]
def get_acc(matches):
    """Return the mean of `matches` and record it in `accuracy_count`.

    Args:
        matches: sequence of numeric flags (1 for a hit, 0 for a miss).

    Returns:
        `sum(matches) / len(matches)`, or 0 for an empty sequence. Nothing is
        recorded in the empty case.
    """
    total = len(matches)
    if total == 0:
        return 0
    ratio = sum(matches)/total
    accuracy_count.append(ratio)
    return ratio


In [None]:
batch_size=64
accumulation_steps = 4   # micro-batches whose gradients are summed per optimiser step
num_steps = 2000
log_every = 10

#bert 32,1e-5,2,150
#bertu 62,1e-5,8,300
# torch's own AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)
# The loss has no per-step state, so it is built once outside the loop.
loss_fn = torch.nn.CrossEntropyLoss()

# Number of full micro-batches in the training set. The trailing partial
# batch is skipped so BatchNorm never receives a one-row batch in train mode.
num_train = indexed_train_x.shape[0]
num_batches = max(1, num_train // batch_size)
pad_index = morph_vocab['<PAD>']

print('step', 'error', 'accuracy')
train_errors = []
all_true=[]
all_pred=[]
for step in range(1, num_steps + 1):
    optimizer.zero_grad()
    model.train(True)
    step_error = 0.0
    for i in range(accumulation_steps):
        # Walk through the training set, wrapping around at the end. Indexing
        # by `i` alone would reuse the same first
        # accumulation_steps * batch_size rows on every step.
        batch_index = ((step - 1) * accumulation_steps + i) % num_batches
        start = batch_index * batch_size
        batch_x = indexed_train_x[start:start + batch_size]
        batch_mask = mask_train_x[start:start + batch_size]
        batch_targets = targets_train[start:start + batch_size].detach()

        output = model(batch_x, batch_mask)

        # CrossEntropyLoss takes the class axis to be axis 1. Output and
        # targets are (batch, slot, label), so both are flattened to
        # (batch * slot, label) to make the label axis the class axis.
        # `.float()` keeps the loss in float32 when autocast yields half precision.
        num_classes = output.shape[-1]
        error = loss_fn(
            output.float().reshape(-1, num_classes),
            batch_targets.reshape(-1, num_classes),
        )
        error = error/accumulation_steps
        error.backward()
        step_error += error.item()

    optimizer.step()
    model.train(False)

    # Sum of the scaled micro-batch losses = mean loss over this step.
    train_errors.append(step_error)

    # Accuracy is measured on the last micro-batch of the step.
    predicted_batch = output.detach().argmax(dim=-1).tolist()
    target_batch = batch_targets.argmax(dim=-1).tolist()

    train_accs = []
    for predicted_labels, targets in zip(predicted_batch, target_batch):
        all_true.append(targets)
        all_pred.append(predicted_labels)

        # An example is correct when the predicted label set equals the target
        # set, ignoring padding and slot order. Counting every equal
        # (predicted, target) pair would let repeated padding labels alone
        # reach the threshold.
        predicted_set = set(predicted_labels) - {pad_index}
        target_set = set(targets) - {pad_index}
        train_accs.append(1 if predicted_set == target_set else 0)
    accuracy=get_acc(train_accs)

    if step % log_every == 0:
        print(step, train_errors[-1], accuracy)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
!pip install editdistance
import editdistance

# Fit the binarizer on true AND predicted label sets. Fitting on the true
# labels alone makes `transform` drop predicted labels it has never seen.
m = MultiLabelBinarizer().fit(all_true + all_pred)

# Binarise once and reuse the matrices for every score below.
true_bin = m.transform(all_true)
pred_bin = m.transform(all_pred)

# zero_division=0 scores labels with no true or no predicted samples as 0
# without emitting an undefined-metric warning.
f1_weighted=f1_score(true_bin, pred_bin, average='weighted', zero_division=0)
f1_macro=f1_score(true_bin, pred_bin, average='macro', zero_division=0)

print("F1 score weighted:", f1_weighted)
print("F1 score macro:", f1_macro)

precision = precision_score(true_bin, pred_bin, average='micro', zero_division=0)
recall = recall_score(true_bin, pred_bin, average='micro', zero_division=0)
print("Precision:", precision)
print("Recall:", recall)


f1_micro = f1_score(true_bin, pred_bin, average='micro', zero_division=0)
print("F1 score micro:", f1_micro)

all_true_flat = [label for sublist in all_true for label in sublist]
all_pred_flat = [label for sublist in all_pred for label in sublist]


# problem with lev distance, uses order, order doesnt matter a lot here
reverse_dict = {v: k for k, v in morph_vocab.items()}
distances = []
for pred_indexes, true_indexes in zip(all_pred, all_true):
    predicted_labels = [reverse_dict[j] for j in pred_indexes]
    true_labels = [reverse_dict[j] for j in true_indexes]
    distances.append(editdistance.eval(predicted_labels, true_labels))

# Guard the averages so an empty history reports 0.0 instead of raising
# ZeroDivisionError.
average_distance = sum(distances) / len(distances) if distances else 0.0
print("Levenshtein distances:", average_distance)

avg = sum(accuracy_count)/len(accuracy_count) if accuracy_count else 0.0
print("the average is: ", avg)

In [None]:
# Every index stored in the vocabulary, in insertion order.
second_values = list(morph_vocab.values())
print(second_values)

In [None]:
m = MultiLabelBinarizer().fit(all_true)

# get the list of all possible morphosyntactic tags in the dataset
labels =second_values
print(labels)

# initialize dictionaries to store the scores for each label
precision_dict = {}
recall_dict = {}
f1_dict = {}

# Convert each sample to a set once so the membership tests below are
# constant-time instead of rescanning every list for every label.
true_sets = [set(sublist) for sublist in all_true]
pred_sets = [set(sublist) for sublist in all_pred]

for label in labels:
    # transform the true and predicted labels to binary format for the current label
    true_labels_label = [1 if label in sample else 0 for sample in true_sets]
    pred_labels_label = [1 if label in sample else 0 for sample in pred_sets]

    # calculate precision, recall, and F1 score for the current label
    # zero_division=0 scores a label that is never predicted (or never
    # present) as 0 without emitting an undefined-metric warning.
    precision_dict[label] = precision_score(true_labels_label, pred_labels_label, zero_division=0)
    recall_dict[label] = recall_score(true_labels_label, pred_labels_label, zero_division=0)
    f1_dict[label] = f1_score(true_labels_label, pred_labels_label, zero_division=0)

reverse_dict = {v: k for k, v in morph_vocab.items()}
# print the scores for each label
for label in labels:
    print("Label:", reverse_dict[label])
    print("Precision:", precision_dict[label])
    print("Recall:", recall_dict[label])
    print("F1 score:", f1_dict[label])

In [None]:
from prettytable import PrettyTable

# Tabulate the per-label scores: one row per label index, shown under its
# label name.
table = PrettyTable(['Label', 'Precision', 'Recall', 'F1 Score'])

for label_index in labels:
    row = [
        reverse_dict[label_index],
        precision_dict[label_index],
        recall_dict[label_index],
        f1_dict[label_index],
    ]
    table.add_row(row)

# render the finished table
print(table)

In [None]:
# Inference on the test split: print each input word, the predicted labels and
# the reference labels.
# Evaluation mode is set explicitly so dropout is off and BatchNorm uses its
# running statistics even when this cell is run on its own.
model.eval()

# reversing the dict to get the grammar from the index (built once, not per batch)
reverse_dict = {v: k for k, v in morph_vocab.items()}

with torch.no_grad():
    print('sent', 'prediction')
    # Step by batch_size up to the end so the final, possibly shorter, batch
    # is evaluated too; `len // batch_size` batches would drop it.
    for start in range(0, len(indexed_test_x), batch_size):
        batch_ids = indexed_test_x[start:start + batch_size]
        batch_mask = mask_test_x[start:start + batch_size]
        outputs = model(batch_ids, batch_mask)

        for j, x in enumerate(outputs):
            predicted_labels = [reverse_dict[idx] for idx in x.argmax(dim=1).tolist()]

            # changing from char to word
            # Special tokens are skipped so markers such as [CLS]/[SEP]/[PAD]
            # are not glued onto the printed word.
            input_word = ''.join(tokeniser.decode(batch_ids[j], skip_special_tokens=True).split())

            print(input_word, predicted_labels)
            print("actual")
            print(y_test[start + j])

In [None]:
# Two line charts against the step number: the recorded training error ($E$)
# and the recorded accuracy ($A$). A blank-ish line is printed between them.
for position, (series, axis_label) in enumerate(((train_errors, '$E$'), (accuracy_count, '$A$'))):
    if position > 0:
        print('   ')
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('step')
    ax.set_ylabel(axis_label)
    steps = range(1, len(series) + 1)
    ax.plot(steps, series, color='blue', linestyle='-', linewidth=3)
    ax.grid()