In [57]:
import json
import re

import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# external implementation of the LDA model
from lda_model import LDA

In [9]:
# Load the emotion dataset. The file carries a leftover index column (it is read
# as "Unnamed: 0"), so only the two columns the analysis uses are loaded; the
# default RangeIndex is kept so positional lookups like df.text[0] keep working.
df = pd.read_csv("./data/emotion_dataset.csv", usecols=["text", "label"])
df.head()

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,optimism
1,My roommate: it's okay that we can't spell bec...,anger
2,No but that's so cute. Atsu was probably shy a...,joy
3,Rooneys fucking untouchable isn't he? Been fuc...,anger
4,it's pretty depressing when u hit pan on ur fa...,sadness


In [41]:
# Scratch check on the first document: drop everything except ASCII letters and spaces,
text = re.sub(r"[^A-Za-z ]", "", df.text[0]).strip()
# then collapse runs of spaces and tokenize on whitespace.
re.sub(r"[ ]{2,}", " ", text).split()

['Worry',
 'is',
 'a',
 'down',
 'payment',
 'on',
 'a',
 'problem',
 'you',
 'may',
 'never',
 'have',
 'Joyce',
 'Meyer',
 'motivation',
 'leadership',
 'worry']

## Text preprocessing
The following preprocessing steps are applied for this analysis:
- remove special characters
- convert to lower case and strip leading/trailing whitespace
- collapse repeated whitespace into a single space
- tokenize
- remove stopwords
- remove meaningless words
- lemmatize

In [48]:
def preprocess(text, stop_words=None, noise_words=("user", "im", ""), lemmatizer=None):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: drop every character that is neither an ASCII letter nor
    whitespace, lower-case, split on whitespace, discard stop words and noise
    words, lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to discard. Defaults to NLTK's English stop-word list.
    noise_words : iterable of str, optional
        Extra corpus-specific tokens to discard. Defaults to
        ``("user", "im", "")``.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a new ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        Tokens in their original order; empty if nothing survives filtering.
    """
    if stop_words is None:
        # Load the list once per call; looking it up again for every token is wasteful.
        stop_words = stopwords.words("english")
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # One set gives constant-time membership tests for both filters.
    discard = set(stop_words) | set(noise_words)

    # Keep every kind of whitespace (tabs, newlines) as a separator so that
    # words on different lines are not glued into a single token.
    # NOTE(review): apostrophes are stripped here, so a contraction such as
    # "don't" reaches the filter as "dont" and is only removed if the
    # stop-word list contains that spelling.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text)

    # split() without arguments collapses repeated whitespace and never
    # yields empty tokens.
    tokens = letters_only.lower().split()

    # Filtering happens before lemmatization, i.e. on the surface forms.
    return [lemmatizer.lemmatize(token) for token in tokens if token not in discard]

In [49]:
# Smoke test: run the preprocessing on the first document and inspect the resulting tokens
preprocess(df.text[0])

['worry',
 'payment',
 'problem',
 'may',
 'never',
 'joyce',
 'meyer',
 'motivation',
 'leadership',
 'worry']

In [50]:
# Preprocess every document. Only the "text" column is needed, so apply the
# function to that Series directly instead of iterating over whole rows.
df["text_preprocessed"] = df["text"].apply(preprocess)

### Task 2
Fitting five standard (unseeded) LDA models with $K=4$, `n_features = 1000`, and a high `n_iter`.  

In [51]:
# Fit five LDA models on the same bag-of-words corpus.
ldas = []
k = 4  # number of topics

# Map every token to an integer id, then encode each document as (id, count) pairs.
vocabulary = Dictionary(df.text_preprocessed.to_list())
corpus = [vocabulary.doc2bow(text) for text in df.text_preprocessed]

for i in range(5):
    # Seed each run with its index: the five models still differ from one
    # another, but re-running the cell reproduces the same topics.
    lda = LdaModel(corpus, num_topics=k,
                   iterations=1000,
                   id2word=vocabulary,
                   random_state=i)
    ldas.append(lda)

In [52]:
# Inspect the highest-weighted words per topic of the first fitted model
ldas[0].show_topics()

[(0,
  '0.009*"like" + 0.009*"get" + 0.006*"amp" + 0.006*"one" + 0.005*"sad" + 0.005*"back" + 0.004*"people" + 0.004*"even" + 0.004*"day" + 0.004*"much"'),
 (1,
  '0.006*"like" + 0.005*"dont" + 0.005*"lively" + 0.004*"day" + 0.004*"watch" + 0.004*"fuming" + 0.004*"get" + 0.004*"lost" + 0.003*"amazing" + 0.003*"think"'),
 (2,
  '0.009*"amp" + 0.006*"people" + 0.006*"love" + 0.005*"fear" + 0.004*"depression" + 0.004*"make" + 0.004*"angry" + 0.004*"never" + 0.004*"time" + 0.004*"go"'),
 (3,
  '0.014*"dont" + 0.008*"like" + 0.007*"u" + 0.006*"know" + 0.006*"get" + 0.004*"feel" + 0.004*"life" + 0.004*"cant" + 0.004*"think" + 0.004*"would"')]

## LDA with seeding
We will first read in the seed words and then apply the external implementation of LDA to the data.

In [64]:
# Seed words per emotion: a JSON object mapping each label to a list of words.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open("./data/seed_words.json", encoding="utf-8") as f:
    seeds = json.load(f)
seeds

{'anger': ['outrageous',
  'infuriating',
  'ridiculous',
  'absurd',
  'exasperating',
  'disgusting',
  'insulting',
  'offensive',
  'intolerable',
  'unacceptable',
  'outrage',
  'insane',
  'angry',
  'upset',
  'boiling',
  'seething',
  'frustrated',
  'mad',
  'irritated',
  'livid',
  'indignant',
  'agitated',
  'annoyed',
  'pissed',
  'irate',
  'aggravated',
  'enraged',
  'bitter',
  'displeased',
  'disgruntled',
  'vexed',
  'temperamental',
  'cross',
  'testy',
  'impatient',
  'belligerent',
  'furious',
  'hostile',
  'offended',
  'exasperated',
  'resentful'],
 'sadness': ['sad',
  'unhappy',
  'melancholy',
  'dejected',
  'mournful',
  'downcast',
  'despondent',
  'blue',
  'dismal',
  'gloomy',
  'forlorn',
  'heartbroken',
  'woeful',
  'crestfallen',
  'disheartened',
  'grief',
  'sorrowful',
  'tearful',
  'somber',
  'bereaved',
  'lamenting',
  'doleful',
  'mournful',
  'lugubrious',
  'pensive',
  'heavyhearted',
  'woebegone',
  'troubled',
  'depres

In [96]:
# Build the document-term count matrix intended as input for the external LDA
# implementation (see the commented-out fit below).
# CountVectorizer expects strings, so the token lists are re-joined with spaces.
# The fitted vectorizer is kept so that columns of X can be mapped back to
# words (vectorizer.vocabulary_), e.g. to locate the seed words.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so one-letter tokens such as "u" that exist in the
# gensim vocabulary are dropped here — confirm this is intended.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text_preprocessed.str.join(" "))

# TODO: fit the external LDA once its expected input format is settled.
# lda = LDA(k, 1000)
# # needs an "arraylike" with a shape
# # [[test[1] for test in row] for row in corpus]
# lda.fit(X)

In [95]:
# Sanity check: select the entries of row 1 (the second document) whose count is positive
X[1,X[1,] > 0]

array([1, 1, 1, 1, 1, 1, 1], dtype=int64)