NLP Homework 2

In [None]:
# Download the data and resource files used by the rest of the notebook.
# -nc (no-clobber) keeps an already-downloaded file, so re-running this cell
# does not pile up "<name>.1" duplicates next to the files that are actually read.
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/dev.tsv
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/ekman_mapping.json
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/emotions.txt
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/stoplist.txt
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/test.tsv
!wget -nc https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/train.tsv

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install nltk scikit-learn seaborn

In [None]:
import nltk
# Tokenizer models needed by sent_tokenize / word_tokenize.
# Newer NLTK releases (3.9+) load them from 'punkt_tab'; older ones use 'punkt'.
# Requesting both keeps the notebook runnable with either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import numpy

import seaborn
import matplotlib.pyplot as mplot

import re
import os
import sys
import pickle
import datetime
import json
import csv
from nltk.tokenize import sent_tokenize, word_tokenize

Script to create emotion mappings and new data files

In [None]:
# Fine-grained emotion names: the line position in emotions.txt is the numeric
# label id used in the raw *.tsv files.
with open("emotions.txt", encoding="utf8") as f:
    emotions = f.read().splitlines()

targets = ["dev.tsv", "test.tsv", "train.tsv"]
targetsout = ["dev2.tsv", "test2.tsv", "train2.tsv"]

# Coarse grouping according to ekman_mapping.json: {coarse emotion: [fine-grained emotions]}
mappingfile = "ekman_mapping.json"
with open(mappingfile, encoding="utf8") as f:
    mappingwords = json.load(f)

# Raw label id that is treated as "neutral"; it is handled separately from the
# JSON grouping and receives the last coarse class id.
NEUTRAL_LABEL = 27

# Coarse classes are numbered in the order in which they appear in the JSON file.
classes = list(mappingwords) + ["neutral"]

# mapping[raw label id] -> coarse class id. None marks "no mapping defined", so an
# unexpected label is reported instead of being silently counted as class 0.
mapping = [None] * max(len(emotions), NEUTRAL_LABEL + 1)
mapping[NEUTRAL_LABEL] = len(classes) - 1
for class_id, coarse_name in enumerate(mappingwords):
    for fine_name in mappingwords[coarse_name]:
        mapping[emotions.index(fine_name)] = class_id

for class_id, class_name in enumerate(classes):
    print(f"{class_id}: {class_name}")


# The word-frequency lexicon must be built from the TRAINING split only:
# collecting it from test.tsv would leak test vocabulary into feature selection.
LEXICON_SOURCE = "train.tsv"
lexicon_texts = []

# Create new data files according to the mapping and format ("<class id>\t<text>")
for source, target in zip(targets, targetsout):
    with open(source, encoding="utf8", newline="") as fin, \
            open(target, "w", encoding="utf8", newline="") as fout:
        reader = csv.reader(fin, delimiter="\t")
        writer = csv.writer(fout, delimiter="\t", lineterminator="\n")
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            txt = row[0]
            if source == LEXICON_SOURCE:
                lexicon_texts.append(txt)

            # A text may carry several tags; several of them can fall into the
            # same coarse class, so collect the distinct class ids first.
            class_ids = set()
            for tag in row[1].split(","):
                class_id = mapping[int(tag)]
                if class_id is None:
                    raise ValueError(f"label {tag} in {source} has no entry in {mappingfile}")
                class_ids.add(class_id)

            # Write one row per distinct class, in a deterministic order
            for class_id in sorted(class_ids):
                writer.writerow([str(class_id), txt])

# Single join instead of repeated string concatenation; every text is followed by a space
alltxt = "".join(txt + " " for txt in lexicon_texts)


Code for text preprocessing, taken from https://github.com/LUMII-AILab/NLP_Course/blob/main/notebooks/NaiveBayes.ipynb

In [None]:
# Patterns are compiled once instead of on every call.
# The TLD class is [A-Za-z]: a '|' inside a character class is a literal pipe, not "or".
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_URL_RE = re.compile(r'https?://[A-Za-z0-9./-]+|www\.[A-Za-z0-9./-]+')
_NUMBER_RE = re.compile(r'\d+')


def normalize_text(text):
    """Lower-case `text` and neutralise tokens that carry no lexical signal.

    Steps, in order: lower-casing, removal of e-mail addresses, removal of
    URLs, replacement of every run of digits by the placeholder "100", and
    stripping of leading/trailing whitespace.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string (inner whitespace is left untouched).
    """
    text = text.lower()
    text = _EMAIL_RE.sub('', text)      # e-mail addresses
    text = _URL_RE.sub('', text)        # URLs
    text = _NUMBER_RE.sub("100", text)  # numbers -> one shared placeholder

    return text.strip()


def normalize_vector(vector):
    """Prune a feature dict in place and return it.

    A key survives only if it is not in the global STOPLIST, is longer than
    one character, and is present in the global WHITELIST.

    Args:
        vector: dict of features keyed by token; modified in place.

    Returns:
        The same dict object, with the rejected keys removed.
    """
    # Iterate over a snapshot of the keys, because entries are deleted on the go
    for token in list(vector):
        keep = token not in STOPLIST and len(token) != 1 and token in WHITELIST
        if not keep:
            del vector[token]

    return vector


def vectorize_text(text):
    """Convert raw text into a pruned bag-of-words feature dict.

    The text is normalised, tokenised with nltk.word_tokenize, every distinct
    token becomes a key mapped to True, and the dict is then filtered by
    normalize_vector.

    Args:
        text: the raw input string.

    Returns:
        dict mapping each surviving token to True.
    """
    tokens = nltk.word_tokenize(normalize_text(text))
    features = dict.fromkeys(tokens, True)
    return normalize_vector(features)

Script for word frequency list creation

In [None]:
# Count how often each token occurs in the text accumulated in `alltxt`
# and store the counts in a TSV file.
alltxt = normalize_text(alltxt)

allsents = sent_tokenize(alltxt)
freqlist = {}
for sentence in allsents:
    # Collapse every run of whitespace to a single space before tokenising
    collapsed = re.sub(r'\s+', ' ', sentence.strip())
    for token in word_tokenize(collapsed):
        key = token.lower()
        freqlist[key] = freqlist.get(key, 0) + 1


freqlistfile = "freqlist.tsv"


# One "<count>\t<word>" row per distinct token, in first-seen order
with open(freqlistfile, "w", encoding="utf8") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")
    writer.writerows([str(count), token] for token, count in freqlist.items())



More code for preprocessing from the example

In [None]:
def initialise(stop_txt, freq_tsv, min_freq=5):
    """Load the stop-word list and the frequency-pruned whitelist.

    Sets the module-level globals STOPLIST and WHITELIST, which
    normalize_vector uses to filter features.

    Args:
        stop_txt: path to a UTF-8 text file with one stop word per line.
        freq_tsv: path to a UTF-8 TSV file with "<count>\t<word>" rows, as
            produced with csv.writer (fields may be CSV-quoted).
        min_freq: a word is whitelisted only if its count is at least this
            value. The default of 5 keeps the previous behaviour.

    Raises:
        ValueError: if a non-empty row does not have exactly two fields or
            its count is not an integer.
    """
    global STOPLIST, WHITELIST

    # Explicit encoding: the platform default is not necessarily UTF-8
    with open(stop_txt, encoding="utf8") as txt:
        STOPLIST = {normalize_text(line) for line in txt if line.strip()}

    print("[I] Word stoplist is read:", len(STOPLIST))

    WHITELIST = set()

    # Read with csv.reader so that the quoting applied by csv.writer is undone
    with open(freq_tsv, encoding="utf8", newline="") as tsv:
        for row in csv.reader(tsv, delimiter="\t"):
            if not row:  # blank line
                continue
            freq, word = row

            if int(freq) < min_freq:
                continue

            WHITELIST.add(normalize_text(word))

    print("[I] Word whitelist is read:", len(WHITELIST))

A minimum word-repetition threshold of 5-7 seems to give the best accuracy.

In [None]:
def read_data(file):
    """Read a "<topic>\t<text>" TSV file into per-topic lists of samples.

    The file is parsed with csv.reader, matching the csv.writer that creates
    the data files, so CSV-quoted texts are restored to their original form.

    Args:
        file: path to a UTF-8 encoded, tab-separated file with two columns.

    Returns:
        dict mapping each topic to a list of (feature_dict, topic) tuples,
        where feature_dict comes from vectorize_text.

    Raises:
        ValueError: if a non-empty row does not have exactly two fields.
    """
    data_set = {}  # topic => samples

    with open(file, encoding="utf8", newline="") as data:
        for row in csv.reader(data, delimiter="\t"):
            if not row:  # blank line
                continue
            topic, text = row
            data_set.setdefault(topic, []).append((vectorize_text(text), topic))

    return data_set


def join_data(data_set):
    """Flatten a {category: samples} dict into one list of samples.

    Args:
        data_set: dict mapping each category to a sequence of samples.

    Returns:
        A new list with the samples of all categories, in dict order.
    """
    return [sample for samples in data_set.values() for sample in samples]

Code for evaluating the machine learning model. The base is taken from the example and adapted to use separate train and test data sets.

In [None]:
def cross_validate2(train_data_set, test_data_set):
    """Train a Naive Bayes classifier on one data set and evaluate it on another.

    Despite its name this is a single hold-out evaluation: no folds are created.

    Side effect: sets the global LABELS to the sorted category names of the
    training set. Sorting matches the label order that
    sklearn.metrics.confusion_matrix uses by default, so LABELS can be used
    directly as tick labels.

    Args:
        train_data_set: dict {category: [(feature_dict, category), ...]}.
        test_data_set: dict with the same structure, used for evaluation.

    Returns:
        Tuple (validations, gold_result, silver_result): a one-element list
        with the accuracy, the true labels of the test samples, and the
        predicted labels in the same order.
    """
    global LABELS
    LABELS = sorted(train_data_set)

    # Plain lists are all NLTK needs; no numpy object arrays or repeated copying
    train_data = join_data(train_data_set)
    test_data = join_data(test_data_set)

    # Naive Bayes classifier: training and evaluation
    nb = nltk.NaiveBayesClassifier.train(train_data)
    validation_accuracy = nltk.classify.accuracy(nb, test_data)

    validations = [validation_accuracy]
    gold_result = [label for _, label in test_data]
    silver_result = [nb.classify(features) for features, _ in test_data]

    return (validations, gold_result, silver_result)

In [None]:
def run_validation2(train_data_path, test_data_path):
    """Evaluate a classifier trained on one file against another file and report.

    Prints the accuracy, the elapsed time and a classification report, then
    shows the confusion matrix as a heatmap.

    Args:
        train_data_path: path of the "<topic>\t<text>" file to train on.
        test_data_path: path of the "<topic>\t<text>" file to evaluate on.
    """
    print("Cross-validation:\n")

    start_time = datetime.datetime.now().replace(microsecond=0)

    # Run the evaluation (a single train/test split)
    validations, gold, silver = cross_validate2(read_data(train_data_path), read_data(test_data_path))

    # Print the accuracy: for each evaluation step, and overall
    for step in validations:
        print("{0:.2f}  ".format(step), end='')
    print("{0:.0%}".format(numpy.mean(validations)))

    end_time = datetime.datetime.now().replace(microsecond=0)
    print("\nTotal validation time:", end_time - start_time, "\n")

    # Print an evaluation report
    print(classification_report(gold, silver))

    # Fix the label order explicitly and reuse it for the tick labels, so every
    # row/column of the heatmap is guaranteed to carry the right class name.
    labels = sorted(set(gold) | set(silver))
    matrix = confusion_matrix(gold, silver, labels=labels)
    axes = seaborn.heatmap(matrix, xticklabels=labels, yticklabels=labels)
    axes.set_xlabel("Predicted class")  # columns of the matrix
    axes.set_ylabel("Gold class")       # rows of the matrix
    mplot.xticks(rotation=90)
    mplot.show()
    # cf. print(nltk.ConfusionMatrix(gold, silver)) for a plain-text matrix

The rest of the functions from the example: model training and the inference part

In [None]:
def run_training(data_path, verbose):
    """Train a Naive Bayes classifier on one data file and pickle it.

    The model is trained on every sample found in `data_path` (and on nothing
    else) and written to "nb_classifier.pickle" in the working directory.

    Args:
        data_path: path of the "<topic>\t<text>" file to train on.
        verbose: if true, print the 10 most informative features.
    """
    def _now():
        # Whole-second timestamps keep the printed duration short
        return datetime.datetime.now().replace(microsecond=0)

    print("[I] Training an NB classifier...")
    started = _now()

    # TRAINING
    samples = join_data(read_data(data_path))
    nb = nltk.NaiveBayesClassifier.train(samples)

    finished = _now()
    print("[I] Training time:", finished - started)

    if verbose:
        nb.show_most_informative_features(n=10)  # Try with n=100

    # Persist the model so that it can be reloaded without retraining
    with open("nb_classifier.pickle", "wb") as model_file:
        pickle.dump(nb, model_file)
        print("[I] NB classifier stored in a file")

In [None]:
def run_inference():
    """Interactively classify texts with the pickled classifier.

    Loads "nb_classifier.pickle", then repeatedly prompts for a text, prints
    its features, the probability of every class and the most probable class.
    Entering an empty line ends the loop. Class names are looked up in the
    global `classes` list via the integer value of each label.
    """
    # NOTE: unpickling can execute arbitrary code - only load a model file
    # that was created by this notebook.
    with open("nb_classifier.pickle", "rb") as model_file:
        nb = pickle.load(model_file)
        print("[I] NB classifier loaded from a file")

    prompt = "\nEnter a text to classify: "

    # iter(callable, sentinel): keep prompting until an empty string is entered
    for text in iter(lambda: input(prompt), ""):
        # Extract text features for classification
        text_feat = vectorize_text(text)
        print("\nFeatures:", text_feat.keys(), "\n")

        # INFERENCE: probability distribution over the classes
        prob_dist = nb.prob_classify(text_feat)

        for label in prob_dist.samples():
            class_name = classes[int(label)]
            print("{0}: {1:.3f}".format(class_name, prob_dist.prob(label)))

        # The most probable class
        best_label = prob_dist.max()
        print("\nPrediction:", classes[int(best_label)])

Execution of the functions defined above. The base is taken from the example and modified slightly to fit the changed functions and to make the model output emotion names instead of their numbers.

In [None]:
# Initialise the stopword and word frequency lists:
# fills the STOPLIST and WHITELIST globals that normalize_vector filters with.
# freqlist.tsv is the file written by the word frequency cell above.
initialise('stoplist.txt', 'freqlist.tsv')

In [None]:
# Train on train2.tsv and evaluate on test2.tsv: prints the accuracy,
# a classification report and a confusion-matrix heatmap.
run_validation2("train2.tsv", "test2.tsv")

These are the results with a 5-repetition threshold. Decreasing the threshold leads to lower precision; increasing it leads to lower recall.

In [None]:
# Train the final model on train2.tsv and save it to nb_classifier.pickle;
# with verbose=True the 10 most informative features are printed as well.
run_training("train2.tsv", True) # True=verbose

In [None]:
# Run the pre-trained model: loads nb_classifier.pickle and classifies texts
# typed at the prompt. Submit an empty line to stop.
run_inference()

In conclusion, the created model performs worse than the model presented in https://arxiv.org/pdf/2005.00547 (its f1-score is about 20% lower). The model struggles to recognize the neutral class, because no words are strongly associated with it. However, lexicon pruning improves the results: accuracy is 0.42 without pruning and 0.51 when words that appear fewer than 5 times are pruned. Increasing the threshold leads to higher precision and lower recall, while decreasing it leads to higher recall and lower precision (to an extent). Overall accuracy and f1-score are maximized at a threshold of 5-7 repetitions.