In [1]:
import numpy as np

## Setup 

#### This class exists purely for organisation and ease of access: instead of messy nested array indexing, we create a `Posts` object for each entry with three attributes (the post itself, its emotion, and its sentiment).
                

In [2]:
class Posts:
    """Simple record bundling one post with its emotion and sentiment labels."""

    def __init__(self, post, emotion, sentiment):
        # Store the three values as plain attributes, in constructor order.
        self.post, self.emotion, self.sentiment = post, emotion, sentiment

    def print(self):
        """Write the post and both of its labels to stdout on a single line."""
        summary = f"post: {self.post} emotion: {self.emotion} sentiment: {self.sentiment}"
        print(summary)

## 1.2 Loading file 

In [3]:
import json
import gzip

# The dataset ships as a gzip archive wrapping a single JSON document. Opening it in
# text mode ('rt') with an explicit encoding lets json.load parse the stream directly.
# (json.loads() no longer accepts an `encoding` keyword; it was removed in Python 3.9.)
with gzip.open('goemotions.json.gz', 'rt', encoding='utf-8') as f:
    data = json.load(f)

# Each row is read positionally as [post, emotion, sentiment] and wrapped in a Posts
# object, so later cells can use attribute access instead of nested indexing.
# `data` is kept as-is because the plotting cells below iterate over the raw rows.
posts = [Posts(row[0], row[1], row[2]) for row in data]

FileNotFoundError: [Errno 2] No such file or directory: 'goemotions.json.gz'

## 1.3 Plotting Pie Charts

In [None]:
import matplotlib.pyplot as plt

emotion_count = {}

# Tally the emotion label (column 1 of each row) in a single pass.
# Counting with row.count(label) would search the whole row, so a label that also
# occurs in the post or sentiment column of a row would be counted more than once.
for row in data:
    emotion = row[1]
    emotion_count[emotion] = emotion_count.get(emotion, 0) + 1

# Dict order is first-seen order, so labels and sizes stay aligned.
labels = list(emotion_count.keys())
sizes = list(emotion_count.values())

# The wedges carry no text labels; the legend to the left of the pie names them.
patches, texts = plt.pie(sizes, startangle=90)

plt.legend(patches, labels, loc='center left', bbox_to_anchor=(-0.35, .5), fontsize=8)
plt.title("Distribution of emotions")
plt.show()

In [None]:
sent_count = {}

# Tally the sentiment label (column 2 of each row, the same column Posts.sentiment
# is built from) in a single pass. Counting with row.count(label) would search the
# whole row, so a label that also occurs in the post or emotion column of a row
# would be counted more than once.
for row in data:
    sentiment = row[2]
    sent_count[sentiment] = sent_count.get(sentiment, 0) + 1

# Dict order is first-seen order, so labels and sizes stay aligned.
labels = list(sent_count.keys())
sizes = list(sent_count.values())

# Plot
plt.pie(sizes, labels=labels)

plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("Distribution of sentiments")
plt.show()

## 2.1 Processing Dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Build word-frequency features: fit a CountVectorizer on the text of every post
# and keep the resulting document-term matrix in X.
vectorizer = CountVectorizer()
post_texts = [entry.post for entry in posts]
X = vectorizer.fit_transform(post_texts)

# Show the fitted vocabulary_ mapping.
print("Vocabulary: ", vectorizer.vocabulary_)


In [None]:
# Number of distinct tokens = number of entries in the fitted vocabulary.
vocabulary_size = len(vectorizer.vocabulary_)
print(f" The number of tokens is {vocabulary_size}")

## 2.2 Train/Test split  

### Think of it this way: we feed the model a Reddit post (our x) and it should predict the post's emotion and sentiment (our y's).

### Note: we are predicting both sentiment and emotion, so we have two y's in this case, but they are predicted independently.
### 

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the Posts objects for testing (80/20 split); the fixed
# random_state makes the split reproducible.
training_set, testing_set = train_test_split(posts, random_state=42, test_size=0.2)

# Inputs (x): the raw post text of each split.
train_x = [sample.post for sample in training_set]
test_x = [sample.post for sample in testing_set]

# Targets (y): emotion and sentiment are kept as two separate label lists,
# one pair per split, so each can be fed to its own model.
train_emotion_y = [sample.emotion for sample in training_set]
test_emotion_y = [sample.emotion for sample in testing_set]
train_sentiment_y = [sample.sentiment for sample in training_set]
test_sentiment_y = [sample.sentiment for sample in testing_set]

# Re-fit the vectorizer on the training text only, then apply that same fitted
# vectorizer to the test text (transform only, no fitting on test data).
vectorized_train_x = vectorizer.fit_transform(train_x)
vectorized_test_x = vectorizer.transform(test_x)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree trained to predict the emotion label from the vectorized posts.
# random_state is fixed so the fitted tree is reproducible across runs; 42 matches
# the seed passed to train_test_split in this notebook.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(vectorized_train_x, train_emotion_y)
