# COMP 472 - Mini Project 1

### Import all necessary libraries

- gzip, json - Initial file parsing
- nltk - Word tokenization
- pandas - Data exploration
- numpy - Math
- matplotlib - Data exploration
- gensim - Load in pre-trained word2vec models
- sklearn - ML algorithms

In [None]:
# All imports 
import gzip
import json
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim.downloader as api

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

## 1. Dataset Preparation & Analysis

### 1.2 - Load dataset

Here we will be extracting and reading the JSON file into a Pandas DataFrame. We decided to use `pd.read_json()` rather than `json.load()` since DataFrames give us a more pleasant data type to work with.

In [None]:
# Read the gzipped JSON file into a Pandas DataFrame.
# The context manager closes the file handle even if parsing raises.
with gzip.open('goemotions.json.gz', 'rb') as f:
    df = pd.read_json(f)

# Give the three columns descriptive names
df.columns = ['comment', 'emotion', 'sentiment']

Calling `DataFrame.head()` gives us a nice visual of our data.

In [None]:
# Display the first 10 rows of the DataFrame
df.head(10)

### 1.3 - Plotting data

Here we will be using matplotlib to visualize and explore our data. 

In [None]:
# Number of comments per sentiment value
sentiment = df.groupby('sentiment')['sentiment'].count()

# Pie chart with one slice per sentiment, labelled with its percentage share
plt.title('GoEmotions by Sentiment')
plt.pie(
    sentiment,
    labels=sentiment.index,
    autopct='%1.2f%%',
)

# Write the figure to disk, then display it
plt.savefig('sentiments.pdf')
plt.show()

In [None]:
# Count comments per emotion, most frequent first
emotion = df.groupby(['emotion'])['emotion'].count().sort_values(ascending=False)

# Bar graph with one bar per emotion; x tick labels are rotated 90 degrees
plt.bar(emotion.index, emotion)
plt.xticks(
    rotation=90,
    fontweight='light',
)

# Title uses the dataset name "GoEmotions", consistent with the sentiment pie chart
plt.title('GoEmotions by Emotion')
plt.xlabel('Emotion')
plt.ylabel('Number of Comments')

# Write the figure to disk with a tight bounding box, then display it
plt.savefig('emotions.pdf', bbox_inches='tight')
plt.show()

## 2. Words as Features

In [None]:
# Bag-of-words featurizer with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from every comment and build the count matrix in one step
comment_vector = vectorizer.fit_transform(df['comment'])

In [None]:
# The second dimension of the count matrix is taken as the vocabulary size
vocab_size = comment_vector.shape[1]
print(f"Vocabulary size: {vocab_size}")

In [None]:
# Hold out 20% of the rows as a test set. The whole DataFrame is passed as the
# target so both the 'emotion' and 'sentiment' columns are available afterwards.
# NOTE(review): no random_state is given, so the split is presumably different on
# every run — confirm whether reproducible results are required.
X_train, X_test, y_train, y_test = train_test_split(comment_vector, df, test_size=0.2)

In [None]:
# Multinomial Naive Bayes with default parameters, trained on the 'emotion' labels
nb_classifier = MultinomialNB()
nb_model = nb_classifier.fit(X_train, y_train['emotion'])

In [None]:
# Sanity check: vectorize one sample comment with the fitted vectorizer
# and predict its emotion with the Naive Bayes model
v = vectorizer.transform(['Thank you!'])
nb_model.predict(v)

## 3. Embeddings as Features

### 3.1 - Load word2vec model

We will be downloading the `word2vec-google-news-300` model using Gensim.

In [None]:
# Load the pre-trained word2vec vectors through gensim's downloader API
word2vec_model = api.load("word2vec-google-news-300")

### 3.2 - Tokenize data

Install required NLTK data and tokenize sentences into individual words.

In [None]:
# Uncomment when running for the first time
# nltk.download('popular')

# Split every comment into a list of word tokens
tokenized_comments = df['comment'].apply(nltk.word_tokenize)

# Flatten the per-comment token lists into one array and count the distinct tokens
tokens = np.concatenate(tokenized_comments.to_numpy())
unique_tokens_count = np.unique(tokens).size
print(f'Number of unique tokens in dataset: {unique_tokens_count}')

### 3.3 - Calculate sentence vector (by taking average of word vectors)

Here I have created two functions. The first function calculates the average vector of a sentence by averaging its individual word vectors. The second function calls the first function for a collection of sentences and returns an array containing each average vector.

In [None]:
# Get average of single sentence
def get_avg_vector(sentence, model, vector_size):
    """Return the mean embedding of the tokens in `sentence` known to `model`.

    Tokens for which `token in model` is false are ignored. When no token is
    known to the model, a zero vector with `vector_size` entries is returned.
    """
    known = [token for token in sentence if token in model]
    if not known:
        # Nothing to average: fall back to an all-zero vector
        return np.zeros(vector_size)
    return np.mean(model[known], axis=0)

# Get averages of collection of sentences
def get_avg_vectors(tokenized, model, vector_size):
    """Return a list with one averaged vector per entry of `tokenized`.

    Every entry is passed to `get_avg_vector` together with `model` and
    `vector_size`; the results keep the order of the input.
    """
    return [get_avg_vector(sentence, model, vector_size) for sentence in tokenized]

# Compute one averaged sentence vector per tokenized comment; 300 is the
# vector_size used for the zero-vector fallback with the word2vec model
avg_word2vec = get_avg_vectors(tokenized_comments, word2vec_model, 300)

### 3.4 - Compute and display overall hit rate

The overall hit rate is the fraction of tokens in the dataset that are present in the embedding model's vocabulary: the number of tokens found in the model divided by the total number of tokens.

In [None]:
def count_hit_rate(tokens, model):
    """Return the fraction of `tokens` that are present in `model`.

    Args:
        tokens: Sized iterable of tokens; repeated tokens are counted each time.
        model: Any object supporting `token in model`.

    Returns:
        Number of tokens found in `model` divided by `len(tokens)`, or 0.0
        when `tokens` is empty.
    """
    total = len(tokens)
    if total == 0:
        # An empty collection has no hits; avoid dividing by zero
        return 0.0
    hits = sum(1 for token in tokens if token in model)
    return hits / total

# Hit rate of the word2vec model over every token in the dataset
hit_rate = count_hit_rate(tokens, word2vec_model)
print(f"Hit rate for dataset: {hit_rate}")

### 3.5 - Train base MLP models

In [None]:
# 80/20 train/test split of the averaged word2vec vectors; the whole DataFrame is
# the target so both label columns can be selected from y_train_w2v / y_test_w2v.
# NOTE(review): no random_state is given — the split is presumably not reproducible.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(avg_word2vec, df, test_size=0.2)

In [None]:
# Base MLP for emotions: default hyper-parameters apart from early_stopping=True
mlp_emotion = MLPClassifier(early_stopping=True)
mlp_emotion.fit(X_train_w2v, y_train_w2v['emotion'])

In [None]:
# Base MLP for sentiments: default hyper-parameters apart from early_stopping=True
mlp_sentiment = MLPClassifier(early_stopping=True)
mlp_sentiment.fit(X_train_w2v, y_train_w2v['sentiment'])

### 3.6 - Train "Top" MLP models

For this question, we are allowed to choose whichever hyper-parameters we would like. I chose to try adding another hidden layer to see how it would affect our results compared to the base MLP models.

In [None]:
# "Top" MLP for emotions: same as the base model but with two hidden layers of 100 units
top_mlp_emotion = MLPClassifier(early_stopping=True, hidden_layer_sizes=(100, 100))
top_mlp_emotion.fit(X_train_w2v, y_train_w2v['emotion'])

In [None]:
# "Top" MLP for sentiments: same as the base model but with two hidden layers of 100 units
top_mlp_sentiment = MLPClassifier(early_stopping=True, hidden_layer_sizes=(100, 100))
top_mlp_sentiment.fit(X_train_w2v, y_train_w2v['sentiment'])

### 3.7 - Classification report

Here we will call the `classification_report()` function for each model and write the output to a file called `performance.txt`.

In [None]:
# Predict on the held-out word2vec test vectors with each of the four models
y_pred_emotion = mlp_emotion.predict(X_test_w2v)
y_pred_sentiment = mlp_sentiment.predict(X_test_w2v)
y_pred_top_emotion = top_mlp_emotion.predict(X_test_w2v)
y_pred_top_sentiment = top_mlp_sentiment.predict(X_test_w2v)

# Write one report per model, overwriting any existing performance.txt.
# classification_report expects (y_true, y_pred); keyword arguments are used so
# the ground-truth labels and the predictions cannot be swapped, which would
# exchange precision with recall and report support for the wrong labels.
with open('performance.txt', 'w') as f:
    f.write('Base-MLP emotion classifier\n\n')
    f.write(classification_report(y_true=y_test_w2v['emotion'], y_pred=y_pred_emotion))
    f.write('\n\n')

    f.write('Base-MLP sentiment classifier\n\n')
    f.write(classification_report(y_true=y_test_w2v['sentiment'], y_pred=y_pred_sentiment))
    f.write('\n\n')

    f.write('Top-MLP emotion classifier\n\n')
    f.write(classification_report(y_true=y_test_w2v['emotion'], y_pred=y_pred_top_emotion))
    f.write('\n\n')

    f.write('Top-MLP sentiment classifier\n\n')
    f.write(classification_report(y_true=y_test_w2v['sentiment'], y_pred=y_pred_top_sentiment))
    f.write('\n\n')

### 3.8 - Two other pretrained embedding models

Load two pretrained models using gensim.

In [None]:
# Load two more pre-trained embedding models through gensim's downloader API
glove_twitter_model = api.load('glove-twitter-200')
glove_wiki_model = api.load('glove-wiki-gigaword-300')

Get average vectors with new model and split data.

In [None]:
# Twitter: averaged sentence vectors (vector_size 200), then an 80/20 split
avg_tw = get_avg_vectors(tokenized_comments, glove_twitter_model, 200)
X_train_tw, X_test_tw, y_train_tw, y_test_tw = train_test_split(avg_tw, df, test_size=0.2)

# Wiki: averaged sentence vectors (vector_size 300), then an 80/20 split
# NOTE(review): each model is split by a separate train_test_split call without a
# random_state, so the test sets presumably differ between models — confirm this is intended.
avg_wiki = get_avg_vectors(tokenized_comments, glove_wiki_model, 300)
X_train_wiki, X_test_wiki, y_train_wiki, y_test_wiki = train_test_split(avg_wiki, df, test_size=0.2)

We will now train our models using the new embedding models.

In [None]:
# Base MLP (early stopping enabled) for emotions on the glove-twitter-200 features
mlp_twitter_emotion = MLPClassifier(early_stopping=True)
mlp_twitter_emotion.fit(X_train_tw, y_train_tw['emotion'])

In [None]:
# Base MLP (early stopping enabled) for sentiments on the glove-twitter-200 features
mlp_twitter_sentiment = MLPClassifier(early_stopping=True)
mlp_twitter_sentiment.fit(X_train_tw, y_train_tw['sentiment'])

In [None]:
# Base MLP (early stopping enabled) for emotions on the glove-wiki-gigaword-300 features
mlp_wiki_emotion = MLPClassifier(early_stopping=True)
mlp_wiki_emotion.fit(X_train_wiki, y_train_wiki['emotion'])

In [None]:
# Base MLP (early stopping enabled) for sentiments on the glove-wiki-gigaword-300 features
mlp_wiki_sentiment = MLPClassifier(early_stopping=True)
mlp_wiki_sentiment.fit(X_train_wiki, y_train_wiki['sentiment'])

Predict and send results to performance.txt.

In [None]:
# Predict on each model's own held-out test vectors
y_pred_tw_emotion = mlp_twitter_emotion.predict(X_test_tw)
y_pred_tw_sentiment = mlp_twitter_sentiment.predict(X_test_tw)
y_pred_wiki_emotion = mlp_wiki_emotion.predict(X_test_wiki)
y_pred_wiki_sentiment = mlp_wiki_sentiment.predict(X_test_wiki)

# Append one report per model to performance.txt.
# classification_report expects (y_true, y_pred); keyword arguments are used so
# the ground-truth labels and the predictions cannot be swapped, which would
# exchange precision with recall and report support for the wrong labels.
with open('performance.txt', 'a') as f:
    f.write('MLP emotion classifier: glove-twitter-200 embedding model\n\n')
    f.write(classification_report(y_true=y_test_tw['emotion'], y_pred=y_pred_tw_emotion))
    f.write('\n\n')

    f.write('MLP sentiment classifier: glove-twitter-200 embedding model\n\n')
    f.write(classification_report(y_true=y_test_tw['sentiment'], y_pred=y_pred_tw_sentiment))
    f.write('\n\n')

    f.write('MLP emotion classifier: glove-wiki-gigaword-300 embedding model\n\n')
    f.write(classification_report(y_true=y_test_wiki['emotion'], y_pred=y_pred_wiki_emotion))
    f.write('\n\n')

    f.write('MLP sentiment classifier: glove-wiki-gigaword-300 embedding model\n\n')
    f.write(classification_report(y_true=y_test_wiki['sentiment'], y_pred=y_pred_wiki_sentiment))
    f.write('\n\n')