In [10]:
import json
import gzip
import numpy as np

# Positions inside the (features, labels) tuples stored on Data.train / Data.test.
X, Y = 0, 1

from sklearn import metrics, feature_extraction, naive_bayes, neural_network, tree
from sklearn.model_selection import GridSearchCV, train_test_split

In [11]:
from typing import List

class Data:
	"""One labelled train/test split.

	``train`` and ``test`` are ``(features, labels)`` tuples; ``name`` is the
	label-type name passed in as ``type_name``.
	"""

	def __init__(self, type_name, train_x, test_x, train_y, test_y):
		# Arguments arrive in train_x, test_x, train_y, test_y order, so they are
		# regrouped here into per-split (x, y) pairs.
		train_pair = (train_x, train_y)
		test_pair = (test_x, test_y)
		self.name, self.train, self.test = type_name, train_pair, test_pair

class WordsAsFeatures:
	"""Fit classifiers on registered train/test splits and log their metrics to a text file.

	Each instance owns its own list of ``Data`` splits and its own output file.
	Call ``close_file()`` when finished, or use the instance as a context manager.
	"""

	# Annotation only (kept as a string so nothing is evaluated here). The list is
	# created per instance in __init__: a class-level ``[]`` would be shared by every
	# instance, and ``clear()`` on one instance would not empty it for the others.
	data: "List[Data]"

	def __init__(self, file_name="performance.txt"):
		"""Open ``file_name`` in write mode (truncating it) and write the report banner."""
		self.data = []
		self.file = open(file_name, "w")
		self.file.write(
			"############################\n"
			"##  2. Words as Features  ##\n"
			"############################\n\n"
		)
		self.file.flush()

	def __enter__(self):
		"""Allow ``with WordsAsFeatures(...) as wf:`` so the file is always closed."""
		return self

	def __exit__(self, exc_type, exc_value, traceback):
		"""Close the report file; exceptions raised inside the ``with`` body propagate."""
		self.close_file()
		return False

	def addTrainAndTestData(self, type_name, train_x, test_x, train_y, test_y):
		"""Register one split; the argument order matches ``train_test_split``'s return order."""
		self.data.append(Data(type_name, train_x, test_x, train_y, test_y))

	def writePreformanceFile(self, title, type, predict_y, data, accuracy, params=None):
		"""Append one result section (header, confusion matrix, report, accuracy) to the file.

		The method name (misspelling included) and the ``type`` parameter name are
		kept unchanged because callers rely on them.

		title     -- classifier description written in the header line
		type      -- name of the label set (e.g. the ``Data.name`` of the split)
		predict_y -- predictions for ``data.test[X]``
		data      -- the ``Data`` split whose test labels are compared against
		accuracy  -- score written on the final line
		params    -- hyper-parameters to show; any falsy value is printed as ``None``
		"""
		# 2.4
		sections = [
			"--- " + title + " | Hyperparams: " + str(params or None) + " | " + type + " ---",
			str(metrics.confusion_matrix(data.test[Y], predict_y)),
			str(metrics.classification_report(data.test[Y], predict_y, zero_division=0)),
			# Trailing newline here plus the one appended below leaves a blank
			# line between consecutive sections.
			"Accuracy: " + str(accuracy) + "\n",
		]
		self.file.write("\n".join(sections) + "\n")

	def trainAndPredictEmotionAndSentiment(self, function, title, hyper_params=False):
		"""Fit ``function`` on every registered split and log its test-set performance.

		function     -- estimator exposing ``fit``, ``predict`` and ``score``; it is
		                re-fitted for each split in turn
		title        -- classifier description passed on to the report
		hyper_params -- when true, ``function.best_params_`` is read after fitting
		                and included in the report
		"""
		for d in self.data:
			function.fit(d.train[X], d.train[Y])

			predict_y = function.predict(d.test[X])
			accuracy = function.score(d.test[X], d.test[Y])
			params = function.best_params_ if hyper_params else None
			self.writePreformanceFile(title, d.name, predict_y, d, accuracy, params)
		# Flush once per classifier so results reach disk even if a later cell fails.
		self.file.flush()

	def clear(self):
		"""Forget all registered splits (the report file is left untouched)."""
		self.data = []

	def createSeparator(self, title):
		"""Write a ``=== title ===`` divider to the report file and flush it."""
		self.file.write("\n=== " + title + " ===\n\n")
		self.file.flush()

	def close_file(self):
		"""Close the report file."""
		self.file.close()

# Report writer used by the cells below; results are written to performance.txt.
wf = WordsAsFeatures(file_name="performance.txt")


In [12]:
# Load the gzip-compressed JSON dataset. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the whole session.
with gzip.open('goemotions.json.gz', 'rb') as file:
	j = json.load(file)

# Every entry unpacks into exactly three fields; split them into parallel lists.
text = [post for post, _, _ in j]
emotion = [emotion_label for _, emotion_label, _ in j]
mood = [mood_label for _, _, mood_label in j]

# 2.1 Process Data set and Display Size of Vocabulary

In [13]:
# Learn the vocabulary from the posts and build the word-count feature matrix.
vectorizer = feature_extraction.text.CountVectorizer()
x_count_vectorizer = vectorizer.fit_transform(text)
# Vocabulary size = number of feature names the vectoriser produced.
print(vectorizer.get_feature_names_out().size)

30449


# 2.2 Split the Dataset into 80% for Training and 20% for Testing

In [5]:
# 80/20 split on the word-count features. A fixed random_state makes the split,
# and therefore every metric written to the report, reproducible across runs.
wf.addTrainAndTestData("Sentiment", *train_test_split(x_count_vectorizer, mood, test_size=0.2, random_state=42))
wf.addTrainAndTestData("Emotion", *train_test_split(x_count_vectorizer, emotion, test_size=0.2, random_state=42))

# 2.3 Train and Test the Following Classifiers
## 2.3.1 Base-MNB

In [6]:
# Multinomial Naive Bayes with default hyper-parameters, fitted and evaluated
# on each split currently held by `wf`.
base_mnb = naive_bayes.MultinomialNB()
wf.trainAndPredictEmotionAndSentiment(
	function=base_mnb,
	title="Base Multinomial Naive Bayes",
)

## 2.3.2 Base-DT

In [None]:
# Decision tree with default hyper-parameters. random_state is fixed so the
# fitted tree, and the metrics written to the report, are the same on every run.
base_dt = tree.DecisionTreeClassifier(random_state=42)
wf.trainAndPredictEmotionAndSentiment(base_dt, "Base Decision Tree")

## 2.3.3 Base-MLP

In [None]:
# Multi-layer perceptron with default hyper-parameters. random_state fixes the
# weight initialisation and batch shuffling so results are reproducible.
# verbose=True prints the training loss for every iteration.
base_mlp = neural_network.MLPClassifier(verbose=True, random_state=42)
wf.trainAndPredictEmotionAndSentiment(base_mlp, "Base Multi-Layered Perceptron")

## 2.3.4 Top-MNB

In [None]:
# Grid search over the smoothing parameter alpha of Multinomial Naive Bayes;
# the best combination found is written to the report (hyper_params=True).
top_mnb = GridSearchCV(
	estimator=naive_bayes.MultinomialNB(),
	param_grid={'alpha': [0, 0.5, 0.7, 0.8]},
)
wf.trainAndPredictEmotionAndSentiment(top_mnb, "Top Multinomial Naive Bayes", hyper_params=True)

## 2.3.5 Top-DT

In [None]:
# Grid search over decision-tree hyper-parameters. The estimator's random_state
# is fixed so the winning combination and its metrics are reproducible.
top_dt = GridSearchCV(
	tree.DecisionTreeClassifier(random_state=42),
	{
		'criterion': ['gini', 'entropy'],
		'max_depth': [30, 80],
		'min_samples_split': [2, 3, 4],
	},
)
wf.trainAndPredictEmotionAndSentiment(top_dt, "Top Decision Tree", True)

## 2.3.6 Top-MLP

In [None]:
# Grid search over MLP activation, architecture and solver. random_state on the
# estimator fixes weight initialisation and shuffling so results are reproducible.
# verbose=True prints the training loss for every iteration of every candidate fit.
top_mlp = GridSearchCV(
	neural_network.MLPClassifier(verbose=True, random_state=42),
	{
		'activation': ['logistic', 'tanh', 'relu', 'identity'],
		'hidden_layer_sizes': [(30, 50,), (10, 10, 10,)],
		'solver': ['adam', 'sgd'],
	},
)
wf.trainAndPredictEmotionAndSentiment(top_mlp, "Top Multi-Layered Perceptron", True)

# 2.5: Use tf-idf instead of word frequencies and redo all substeps of 2.3

In [7]:
# Re-weight the raw word counts with tf-idf.
idf = feature_extraction.text.TfidfTransformer()
x_idf = idf.fit_transform(x_count_vectorizer)

# Drop the splits held by `wf` and mark the new section in the report file.
wf.clear()
wf.createSeparator(title="tf-idf")

In [8]:
# 80/20 split on the tf-idf features. A fixed random_state makes the split,
# and therefore every metric written to the report, reproducible across runs.
wf.addTrainAndTestData("Sentiment", *train_test_split(x_idf, mood, test_size=0.2, random_state=42))
wf.addTrainAndTestData("Emotion", *train_test_split(x_idf, emotion, test_size=0.2, random_state=42))

In [9]:
# Multinomial Naive Bayes with default hyper-parameters, fitted and evaluated
# on each split currently held by `wf`.
base_mnb = naive_bayes.MultinomialNB()
wf.trainAndPredictEmotionAndSentiment(
	function=base_mnb,
	title="Base Multinomial Naive Bayes",
)

In [9]:
# Decision tree with default hyper-parameters. random_state is fixed so the
# fitted tree, and the metrics written to the report, are the same on every run.
base_dt = tree.DecisionTreeClassifier(random_state=42)
wf.trainAndPredictEmotionAndSentiment(base_dt, "Base Decision Tree")

In [10]:
# Multi-layer perceptron with default hyper-parameters. random_state fixes the
# weight initialisation and batch shuffling so results are reproducible.
# verbose=True prints the training loss for every iteration.
base_mlp = neural_network.MLPClassifier(verbose=True, random_state=42)
wf.trainAndPredictEmotionAndSentiment(base_mlp, "Base Multi-Layered Perceptron")

Iteration 1, loss = 1.10624451
Iteration 2, loss = 0.93396357
Iteration 3, loss = 0.86639840
Iteration 4, loss = 0.82175971
Iteration 5, loss = 0.78789797
Iteration 6, loss = 0.75933955
Iteration 7, loss = 0.73416646
Iteration 8, loss = 0.71318528
Iteration 9, loss = 0.69495771
Iteration 10, loss = 0.67887336
Iteration 11, loss = 0.66538942
Iteration 12, loss = 0.65330648
Iteration 13, loss = 0.64321013
Iteration 14, loss = 0.63373427
Iteration 15, loss = 0.62594597
Iteration 16, loss = 0.61879669
Iteration 17, loss = 0.61213181
Iteration 18, loss = 0.60701403
Iteration 19, loss = 0.60152059
Iteration 20, loss = 0.59708922
Iteration 21, loss = 0.59260714
Iteration 22, loss = 0.58866359
Iteration 23, loss = 0.58464012
Iteration 24, loss = 0.58172824
Iteration 25, loss = 0.57859353
Iteration 26, loss = 0.57579104
Iteration 27, loss = 0.57292209
Iteration 28, loss = 0.57065715
Iteration 29, loss = 0.56818716
Iteration 30, loss = 0.56611532
Iteration 31, loss = 0.56387104
Iteration 32, los



In [11]:
# Grid search over the smoothing parameter alpha of Multinomial Naive Bayes;
# the best combination found is written to the report (hyper_params=True).
top_mnb = GridSearchCV(
	estimator=naive_bayes.MultinomialNB(),
	param_grid={'alpha': [0, 0.5, 0.7, 0.8]},
)
wf.trainAndPredictEmotionAndSentiment(top_mnb, "Top Multinomial Naive Bayes", hyper_params=True)



In [12]:
# Grid search over decision-tree hyper-parameters. The estimator's random_state
# is fixed so the winning combination and its metrics are reproducible.
top_dt = GridSearchCV(
	tree.DecisionTreeClassifier(random_state=42),
	{
		'criterion': ['gini', 'entropy'],
		'max_depth': [30, 80],
		'min_samples_split': [2, 3, 4],
	},
)
wf.trainAndPredictEmotionAndSentiment(top_dt, "Top Decision Tree", True)

In [13]:
# Grid search over MLP activation, architecture and solver. random_state on the
# estimator fixes weight initialisation and shuffling so results are reproducible.
# verbose=True prints the training loss for every iteration of every candidate fit.
top_mlp = GridSearchCV(
	neural_network.MLPClassifier(verbose=True, random_state=42),
	{
		'activation': ['logistic', 'tanh', 'relu', 'identity'],
		'hidden_layer_sizes': [(30, 50,), (10, 10, 10,)],
		'solver': ['adam', 'sgd'],
	},
)
wf.trainAndPredictEmotionAndSentiment(top_mlp, "Top Multi-Layered Perceptron", True)

In [14]:
# All experiments are done: close the report file held by `wf`.
wf.close_file()