# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [5]:
!pip install happiestfuntokenizing

Collecting happiestfuntokenizing
  Using cached happiestfuntokenizing-0.0.7-py3-none-any.whl
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from google.colab import drive
import pickle

FILEPATH = 'drive/My Drive/'
drive.mount('/content/drive',  force_remount=True)


Mounted at /content/drive


## Preprocessing

In [6]:
def load():
  """Load pickles"""
  with open(f"{FILEPATH}Copy of student.pkl", 'rb') as f:
    df = pickle.load(f)
    f.close()
  return df

df = load()

In [82]:
# A map from symptom name to included subreddits
subreddits = {
    "Anger": ["Anger"],
    "Anhedonia": ["anhedonia", "DeadBedrooms",],
    "Anxiety": ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    "Disordered Eating": ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    "Loneliness": ["ForeverAlone", "lonely"],
    "Sad mood": ["cry", "grief", "sad", "Sadness"],
    "Self-loathing": ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    "Sleep problem": ["insomnia", "sleep"],
    "Somatic complaint": ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    "Worthlessness": ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"]
}

# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

def dataset_generation():
  """Build control and symptom datasets"""
  # remove posts with deleted authors
  cleaned = df[df["author"] != '[deleted]']
  output_map = {}
  for symptom in subreddits:
    output_map[symptom] = cleaned[cleaned['subreddit'].isin(subreddits[symptom])]["text"]

  all_depressed = cleaned[cleaned['subreddit'].isin(depression_subreddits)]
  print(all_depressed.head())
  # This groups all depressed posts by author and takes the minimum post timestamp
  users = all_depressed[["author", "created_utc"]].groupby(["author"]).min()
  print(users.head())

  not_depressed = cleaned[~cleaned['subreddit'].isin(depression_subreddits)]
  # This join will combine our minimum depressed post timestamp with our data for posts in non-depressed subreddits
  joined = not_depressed.join(users, on="author", how="inner", rsuffix="_depressed", validate="m:1")
  print(joined)

  # We can then filter by posts that are before this date
  control = joined[joined["created_utc"] < joined["created_utc_depressed"] - (180*86400)]["text"]
  return (output_map, control)

depressed, control = dataset_generation()

                                                 text           author  \
20  i'm trying hi, i'm sorry if my writing is bad,...        n90300118   
39  Only friend has been blanking me for what feel...  Throwaway34qwas   
67  Study hall social anxiety bruh We had a study ...         Shwin280   
72  Positive Thoughts For You - We Are Happy To Pu...       pthinkimag   
79  Starting from a blowup mattress Today was a ve...      MyCrazyLove   

       subreddit  created_utc     date  
20  SuicideWatch   1510374743  ression  
39        lonely   1505308711  ression  
67       Anxiety   1515634258  ression  
72       Anxiety   1515944819  ression  
79  SuicideWatch   1516594948  ression  
                  created_utc
author                       
---annon---        1505320579
---michelle---     1505417481
--Solus            1500350545
--broken_wings--   1500326111
--closer2thesun    1508050449
                                                      text  \
0        does your life feel like a w

In [33]:
for d in depressed:
  print(d, depressed[d].shape)
print(control.shape)

Anger (552,)
Anhedonia (5911,)
Anxiety (24428,)
Disordered Eating (1789,)
Loneliness (11485,)
Sad mood (2215,)
Self-loathing (9831,)
Sleep problem (3174,)
Somatic complaint (8322,)
Worthlessness (1804,)
(4369,)


In [34]:
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

tokenizer = Tokenizer()
def tokenize():
  """Tokenize"""
  tokenized = {}
  # apply tokenizer to each document
  for s in depressed:
    tokenized[s] = depressed[s].apply(tokenizer.tokenize)
    print(s)
  return tokenized, control.apply(tokenizer.tokenize)

depressed_tok, control_tok = tokenize()

Anger
Anhedonia
Anxiety
Disordered Eating
Loneliness
Sad mood
Self-loathing
Sleep problem
Somatic complaint
Worthlessness


In [35]:
from gensim.corpora import Dictionary

# Create a gensim dictionary with all control documents and add
# all the documents in the depressed subreddits
dct = Dictionary(control_tok)
for s in depressed_tok:
  dct.add_documents(depressed_tok[s])

# filter out the 100 most frequent stop words
dct.filter_n_most_frequent(100)

In [76]:
# Apply the gensim document to bag of words model to each document
depressed_bow = {}
for s in depressed:
  depressed_bow[s] = depressed_tok[s].apply(dct.doc2bow)
  print(s)
control_bow = control_tok.apply(dct.doc2bow)


Anger
Anhedonia
Anxiety
Disordered Eating
Loneliness
Sad mood
Self-loathing
Sleep problem
Somatic complaint
Worthlessness


## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [37]:
# TODO: Your LDA code!
from gensim.models import LdaMulticore
# Create a corpus with all documents
corpus = pd.concat([control_bow, pd.concat([depressed_bow[s] for s in depressed_bow])])
# Train our LDA on the corpus
lda = LdaMulticore(corpus, num_topics=200, id2word=dct)

In [63]:
import torch
torch.cuda.empty_cache()

## RoBERTa Embeddings

In [27]:
# TODO: Your RoBERTa code!
from transformers import RobertaModel, RobertaTokenizerFast

# Initialize our roberta model and tokenizer
model = RobertaModel.from_pretrained("roberta-base")
rob_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
device = torch.device("cuda")
model.to(device)
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [71]:
from os.path import exists
import math
batch_size = 64

def get_roberta_embeddings(dataset, name):
  filename = f"{FILEPATH}roberta_embeddings_{name}.csv"
  # cache embeddings (total generation takes about 40 minutes)
  if exists(filename):
    out = np.loadtxt(filename)
    return out
  out = []
  with torch.no_grad():
    # batch our documents
    for i in range(math.ceil(len(dataset) / batch_size)):
      batch = dataset[batch_size*i:batch_size*(i+1)]
      # tokenize our batch
      dataset_rob_tok = rob_tokenizer(batch.to_list(), return_tensors='pt', padding=True, truncation=True, max_length=512)
      dataset_rob_tok.to(device)
      # apply the roberta model to our tokens
      embedding = model(**dataset_rob_tok, output_hidden_states=True)
      # take the average of our 10th hidden state among all words as per the paper
      avg_10th_layer = torch.mean(embedding.hidden_states[9], 1).cpu().numpy()
      if i % 10 == 0:
        print(i, " batches out of ", len(dataset) // batch_size)
      out.extend(avg_10th_layer)
    np.savetxt(filename, out)
  return out

In [58]:
roberta_control = get_roberta_embeddings(control, "Control")

In [77]:
roberta_depressed = {}
for s in depressed:
  print(s)
  roberta_depressed[s] = get_roberta_embeddings(depressed[s], s)

Anger
Anhedonia
Anxiety
Disordered Eating
Loneliness
Sad mood
Self-loathing
Sleep problem
Somatic complaint
Worthlessness


In [39]:
def get_lda_topics(docs, name):
  filename = f"{FILEPATH}lda_topics_{name}.csv"
  # cache lda topics
  if exists(filename):
    out = np.loadtxt(filename)
    return out
  control_lda_topics = np.zeros([len(docs), 200])
  topics = [lda.get_document_topics(doc, minimum_probability=0.01) for doc in docs]
  # get_document_topics returns a list of (index, value) pairs,
  # so we need to convert into a topic-document matrix
  for doc_ind in range(len(topics)):
    for ind, data in topics[doc_ind]:
      control_lda_topics[doc_ind, ind] = data
  np.savetxt(filename, control_lda_topics)
  return control_lda_topics

In [40]:
lda_control = get_lda_topics(control_bow, "Control")

In [42]:
lda_depressed = {}
for s in depressed:
  print(s)
  lda_depressed[s] = get_lda_topics(depressed_bow[s], s)

Anger
Anhedonia
Anxiety
Disordered Eating
Loneliness
Sad mood
Self-loathing
Sleep problem
Somatic complaint
Worthlessness


## Main

In [75]:
def main(X, y):
  """
  Here's the basic structure of the main block! It should run
  5-fold cross validation with random forest to evaluate your RoBERTa and LDA
  performance.
  """
  rf_classifier = RandomForestClassifier()
  cv = KFold(n_splits=5, shuffle=True)
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

  # TODO: Print your training and testing scores!
  print("test: ", np.average(results["test_score"]), " train: ",  np.average(results["train_score"]))
  pass

print("LDA")
for s in depressed:
  print(s)
  data = np.concatenate((lda_control, lda_depressed[s]))
  labels = [0 if i < len(lda_control) else 1 for i in range(len(data))]
  main(data, labels)

print("Roberta")
for s in roberta_depressed:
  print(s)
  data = np.concatenate((roberta_control, roberta_depressed[s]))
  labels = [0 if i < len(roberta_control) else 1 for i in range(len(data))]
  main(data, labels)


LDA
Anger
test:  0.9191300690107067  train:  0.9998989838594664
Anhedonia
test:  0.9558534749898884  train:  0.9994775185704938
Anxiety
test:  0.9073781050631629  train:  0.9993114521603328
Disordered Eating
test:  0.9510997453716102  train:  0.9990392950382831
Loneliness
test:  0.8592465088976875  train:  0.9996150062374669
Sad mood
test:  0.8606586881379963  train:  0.9986228766661593
Self-loathing
test:  0.8643708048882084  train:  0.9988977570094761
Sleep problem
test:  0.977634153624553  train:  0.9995266904378066
Somatic complaint
test:  0.9116715946316155  train:  0.9990556186613964
Worthlessness
test:  0.7645791837921697  train:  0.9973185372336524
Roberta
Anger
test:  0.9055782908088947  train:  1.0
Anhedonia
test:  0.9400506805251119  train:  1.0
Anxiety
test:  0.93563502223511  train:  1.0
Disordered Eating
test:  0.9242114433202809  train:  1.0
Loneliness
test:  0.8893823862304593  train:  0.9999623761421944
Sad mood
test:  0.915475828980411  train:  1.0
Self-loathing
test: