In [None]:
import json
import gzip
import numpy as np

# Positional index constants; presumably meant for (features, labels) pairs such
# as Data.train / Data.test — TODO confirm, they are not referenced in the
# visible cells.
X, Y = 0, 1

from sklearn import metrics, feature_extraction, naive_bayes, neural_network, tree
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder;

In [None]:
# Read the gzip-compressed JSON dataset. The context manager closes the handle
# as soon as parsing finishes (the original left it open for the kernel's lifetime).
with gzip.open('goemotions.json.gz', 'rb') as file:
	j = json.load(file)

# Each record is unpacked as a 3-item sequence; split the records column-wise
# into three parallel lists (first item -> text, second -> emotion, third -> mood).
text = [post for post, _, _ in j]
emotion = [label for _, label, _ in j]
mood = [sentiment for _, _, sentiment in j]

In [None]:
vectorizer = feature_extraction.text.CountVectorizer()
x = vectorizer.fit_transform(text)
print(len(vectorizer.get_feature_names_out()))

In [None]:
from io import FileIO
from typing import List

class Data:
	"""A named train/test split.

	Attributes:
		name: label given to the split (``type_name``).
		train: ``(train_x, train_y)`` tuple.
		test: ``(test_x, test_y)`` tuple.
	"""

	def __init__(self, type_name, train_x, test_x, train_y, test_y):
		"""Store the split name and pair each feature set with its labels."""
		self.name = type_name
		self.train, self.test = (train_x, train_y), (test_x, test_y)

class WordsAsFeatures:
	"""Fit estimators on registered train/test splits and log their results to a file.

	Splits are registered with ``addTrainAndTestData``; every call to
	``trainAndPredictEmotionAndSentiment`` fits the given estimator on each
	registered split in turn and appends a report to the performance file.
	"""

	def __init__(self, file_name="performance.txt"):
		"""Open ``file_name`` in write mode (truncating it) and start with no splits.

		Args:
			file_name: path of the report file.
		"""
		# Per-instance list. A class-level ``data = []`` is shared by every
		# instance, so splits registered on one object leak into all others.
		self.data: List[Data] = []
		self.file = open(file_name, "w")

	def addTrainAndTestData(self, type_name, train_x, test_x, train_y, test_y):
		"""Register one train/test split under the name ``type_name``."""
		self.data.append(Data(type_name, train_x, test_x, train_y, test_y))

	def writePreformanceFile(self, title, type, predict_y, data, accuracy, params=None):
		"""Append one report section to the performance file.

		The section holds a header line, the confusion matrix, the
		classification report and the accuracy.

		Args:
			title: model description shown in the header.
			type: name of the split shown in the header.
			predict_y: predictions for the test features of ``data``.
			data: split whose ``test[1]`` holds the true test labels.
			accuracy: score to report.
			params: hyper-parameters to show; any falsy value prints as ``None``.
		"""
		true_y = data.test[1]
		out = "--- " + title + " | Hyperparams: " + str(params or None) + " | " + type + " ---\n"
		out += str(metrics.confusion_matrix(true_y, predict_y)) + "\n"
		# zero_division=0: classes that are never predicted score 0 without a warning.
		out += str(classification_report(true_y, predict_y, zero_division=0)) + "\n"
		out += "Accuracy: " + str(accuracy) + "\n\n"
		self.file.write(out)

	def trainAndPredictEmotionAndSentiment(self, function, title, hyper_params=False):
		"""Fit ``function`` on every registered split and log its test performance.

		Args:
			function: estimator exposing ``fit``, ``predict`` and ``score``.
			title: model description used in the report header.
			hyper_params: when true, report ``function.best_params_`` if the
				estimator has that attribute (e.g. a fitted search object);
				estimators without it are reported with ``None`` instead of
				raising ``AttributeError``.
		"""
		try:
			for d in self.data:
				function.fit(d.train[0], d.train[1])

				predict_y = function.predict(d.test[0])
				accuracy = function.score(d.test[0], d.test[1])
				params = getattr(function, "best_params_", None) if hyper_params else None
				self.writePreformanceFile(title, d.name, predict_y, d, accuracy, params)
		finally:
			# Flush even when a later fit fails or is interrupted, so the
			# sections already written for earlier splits reach the disk.
			self.file.flush()

	def close_file(self):
		"""Close the performance file; no further reports can be written afterwards."""
		self.file.close()

# Shared experiment runner used by the cells below. Constructing it opens the
# report file in write mode, so re-running this cell truncates earlier results.
wf = WordsAsFeatures(file_name="performance.txt")


In [None]:
# Fixed seed so the 80/20 partition, and every score computed from it, is
# reproducible across kernel restarts. Using the same seed for both calls also
# places the same rows in the test set for both label sets.
SPLIT_SEED = 42

train_x, test_x, train_y, test_y = train_test_split(x, mood, test_size=0.2, random_state=SPLIT_SEED)
wf.addTrainAndTestData("Sentiment", train_x, test_x, train_y, test_y)

# NOTE: after this second split the train_*/test_* names hold the "Emotion"
# data; a later cell reads test_x / test_y directly.
train_x, test_x, train_y, test_y = train_test_split(x, emotion, test_size=0.2, random_state=SPLIT_SEED)
wf.addTrainAndTestData("Emotion", train_x, test_x, train_y, test_y)

In [None]:
# Baseline 1: Multinomial Naive Bayes with the library's default settings.
base_mnb = naive_bayes.MultinomialNB()
wf.trainAndPredictEmotionAndSentiment(
	function=base_mnb,
	title="Base Multinomial Naive Bayes",
)

In [None]:
# Baseline 2: decision tree with the library's default settings.
base_dt = tree.DecisionTreeClassifier()
wf.trainAndPredictEmotionAndSentiment(
	function=base_dt,
	title="Base Decision Tree",
)

In [None]:
# Baseline 3: multi-layer perceptron with default settings; verbose=True prints
# training progress while fitting.
base_mlp = neural_network.MLPClassifier(verbose=True)
wf.trainAndPredictEmotionAndSentiment(
	function=base_mlp,
	title="Base Multi-Layered Perceptron",
)

In [None]:
# Console report for the baseline MLP. When the cells are run top to bottom,
# test_x / test_y hold the "Emotion" split (the last one created), which is
# also the last split base_mlp was fitted on.
print(
	classification_report(
		y_true=test_y,
		y_pred=base_mlp.predict(test_x),
	)
)

In [None]:
# Grid search over the Naive Bayes smoothing parameter; the best setting found
# is written to the report (hyper_params=True).
top_mnb = GridSearchCV(
	estimator=naive_bayes.MultinomialNB(),
	param_grid={'alpha': [0, 0.5, 0.7, 0.8]},
)
wf.trainAndPredictEmotionAndSentiment(top_mnb, "Top Multinomial Naive Bayes", hyper_params=True)

In [None]:
# Grid search over split criterion, depth limit and minimum split size for the
# decision tree; the best combination is written to the report.
top_dt = GridSearchCV(
	estimator=tree.DecisionTreeClassifier(),
	param_grid={
		'criterion': ['gini', 'entropy'],
		'max_depth': [30, 80],
		'min_samples_split': [2, 3, 4],
	},
)
wf.trainAndPredictEmotionAndSentiment(top_dt, "Top Decision Tree", hyper_params=True)

In [None]:
# Grid search kept for reference only (disabled; presumably too slow to run — TODO confirm):
# top_mlp = GridSearchCV(neural_network.MLPClassifier(verbose=True), {'activation': ['logistic', 'tanh', 'relu', 'identity'], 'hidden_layer_sizes': [(30, 50,), (10, 10, 10,)], 'solver': ['adam', 'sgd']})
top_mlp = neural_network.MLPClassifier(verbose=True, activation='logistic', hidden_layer_sizes=(15, 30, 25,), solver='adam')

# top_mlp is a plain MLPClassifier, not a GridSearchCV, so it has no
# best_params_ attribute. Requesting hyper-parameters here raised
# AttributeError after the first fit and nothing was written to the report,
# so the flag is left at its default.
wf.trainAndPredictEmotionAndSentiment(top_mlp, "Top Multi-Layered Perceptron")