# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [3]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = 'drive/MyDrive/Colab Notebooks/CS1460 final project/student.pkl'

Mounted at /content/drive


## Preprocessing

In [4]:
def load(filepath: str):
  """Load pickles"""
  """Args:
      filepath (str): path to the pickle file
      Returns:
      data (pd dataframe): loaded pickle file
  """

  data = pd.read_pickle(FILEPATH)

  return data

  #pass

In [5]:
data = load(FILEPATH)
print(data.head())
print(data.info())

                                                text            author  \
0  does your life feel like a waste mines not a c...        trademeple   
1  Just relapsed again. Any advice I just got to ...          kenny818   
2  Audio and mic not working? So I have a HyperX ...          psyjinks   
3  PG&amp;E: Mylar balloon causes outage in centr...            Majnum   
4                                    Um... Forward?   OldManoftheNorth   

     subreddit  created_utc     date  
0   depression   1504920055  2017-09  
1        NoFap   1507890053  2017-10  
2  techsupport   1513558467  2017-12  
3  nottheonion   1499573023  2017-07  
4        memes   1516842851  2018-01  
<class 'pandas.core.frame.DataFrame'>
Index: 1958158 entries, 0 to 1969753
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   text         object
 1   author       object
 2   subreddit    object
 3   created_utc  int64 
 4   date         object
dtypes: int64(1), object(4)
memory usag

In [6]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

In [7]:
loneliness_subreddits = ["ForeverAlone", "lonely"]

In [8]:
def dataset_generation(data, depression_subreddits, time_gap = 180):
  """Build control and symptom datasets"""
  """Args:
      data (pd dataframe): full dataset
      depression_subreddits (list[str]): subreddits with posts and themes representing depression symptoms
      time_gap (int): min number of days between control and symptom posts by some given user
      Returns:
      symptom_data (pd dataframe): symptom dataset
      control_data (pd dataframe): control dataset
  """

  time_gap_seconds = time_gap * 24 * 60 * 60
  symptom_data = data[data['subreddit'].isin(depression_subreddits)]
  print(f"Symptom dataset size: {symptom_data.shape[0]}")

  earliest_symptom_times = symptom_data.groupby('author')['created_utc'].min()

  control_data = data[
      (data['subreddit'].isin(depression_subreddits) == False) &
      (data['author'].isin(earliest_symptom_times.index)) &
      (data['created_utc'] <= data['author'].map(earliest_symptom_times) - time_gap_seconds)
  ]
  print(f"Control dataset size: {control_data.shape[0]}")

  return symptom_data, control_data



  #pass

In [9]:
symptoms_to_subreddits = {
    "anger": ["Anger"],
    "anhedonia": ["anhedonia", "DeadBedrooms"],
    "anxiety": ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    "concentration_deficit": ["DecisionMaking", "shouldi"],
    "disordered_eating": ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    "fatigue": ["chronicfatigue", "Fatigue"],
    "loneliness": ["ForeverAlone", "lonely"],
    "sad_mood": ["cry", "grief", "sad", "Sadness"],
    "self_loathing": ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    "sleep_problem": ["insomnia", "sleep"],
    "somatic_complaint": ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    "suicidal_thoughts_and_attempts": ["AdultSelfHarm", "selfharm", "SuicideWatch"],
    "worthlessness": ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"]

}

In [10]:
def dataset_generation_specified(data, depression_subreddits, symptoms_to_subreddits, time_gap = 180):
  """Build control and symptom datasets--13 smaller datasets for each symptom instead of one umbrella set as in the other version of the function"""
  """Args:
      data (pd dataframe): full dataset
      depression_subreddits (list[str]): subreddits with posts and themes representing depression symptoms
      symptoms_to_subreddits (dict): key: symptom name, value: list of subreddits pertaining to that symptom
      time_gap (int): min number of days between control and symptom posts by some given user
      Returns:
      symptom_data (pd dataframe): symptom dataset
      control_data (pd dataframe): control dataset
  """

  time_gap_seconds = time_gap * 24 * 60 * 60
  all_symptom_data = data[data['subreddit'].isin(depression_subreddits)]
  print(f"Symptom dataset size: {all_symptom_data.shape[0]}")

  earliest_symptom_times = all_symptom_data.groupby('author')['created_utc'].min()

  control_data = data[
      (data['subreddit'].isin(depression_subreddits) == False) &
      (data['author'].isin(earliest_symptom_times.index)) &
      (data['created_utc'] <= data['author'].map(earliest_symptom_times) - time_gap_seconds)
  ]
  print(f"Control dataset size: {control_data.shape[0]}")

  symptom_datasets = {}
  for symptom, subreddits in symptoms_to_subreddits.items():
    symptom_data = all_symptom_data[all_symptom_data['subreddit'].isin(subreddits)]
    symptom_datasets[symptom] = symptom_data
    print(f"{symptom} dataset size: {symptom_data.shape[0]}")

  return symptom_datasets, control_data

In [11]:
symptom_data, control_data = dataset_generation(data, depression_subreddits)
# save them to files in drive
symptom_data.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/symptom_data.pkl')
control_data.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/control_data.pkl')

Symptom dataset size: 94514
Control dataset size: 4369


In [12]:
# same as above but for specific symptoms
symptom_datasets, control_data = dataset_generation_specified(data, depression_subreddits, symptoms_to_subreddits)
for symptom, symptom_data in symptom_datasets.items():
  symptom_data.to_pickle(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom}_data.pkl')
control_data.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/control_data1.pkl')

Symptom dataset size: 94514
Control dataset size: 4369
anger dataset size: 555
anhedonia dataset size: 5934
anxiety dataset size: 24514
concentration_deficit dataset size: 10
disordered_eating dataset size: 1789
fatigue dataset size: 1
loneliness dataset size: 11535
sad_mood dataset size: 2222
self_loathing dataset size: 9865
sleep_problem dataset size: 3184
somatic_complaint dataset size: 8330
suicidal_thoughts_and_attempts dataset size: 26520
worthlessness dataset size: 1805


In [13]:
!pip install happiestfuntokenizing
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6711 sha256=476757eb702f1667a3775b80dbe7afeb5ef667d2d60a7920eb8de2827fd13a35
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7


In [23]:
def tokenize(data):
  """Tokenize"""
  """Args:
      data (pd dataframe): dataset
      Returns:
      tokenized_data (pd dataframe): dataset with tokenized text
  """

  tokenizer = Tokenizer()

  tokenized_data = data.copy()
  tokenized_data['tokenized_text'] = tokenized_data['text'].apply(tokenizer.tokenize)

  return tokenized_data


  #pass

def further_cleaning(data):
  """Further cleaning"""
  """Args:
      data (pd dataframe): dataset
      Returns:
      cleaned_data (pd dataframe): dataset with further cleaning
  """

  cleaned_data = data.copy()

  cleaned_data['tokenized_text'] = cleaned_data['tokenized_text'].apply(lambda x: [word.lower() for word in x])
  cleaned_data['tokenized_text'] = cleaned_data['tokenized_text'].apply(lambda x: [word for word in x if word.isalnum()])

  return cleaned_data

In [24]:
#data_tokenized = tokenize(data)
#data_tokenized.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/data_tokenized.pkl')

control_data_tokenized = tokenize(control_data)
control_data_tokenized = further_cleaning(control_data_tokenized)
control_data_tokenized.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/control_data_tokenized.pkl')

symptom_data_tokenized = tokenize(symptom_data)
symptom_data_tokenized = further_cleaning(symptom_data_tokenized)
symptom_data_tokenized.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/symptom_data_tokenized.pkl')

In [25]:
print(control_data_tokenized.head())

                                                   text                author  \
315            Man, I do love me some Bandicoot crash.    BuddermanTheAmazing   
651   How good is this PC for my 700-750$ budget? Wa...   WildernessExploring   
730   When is the price of gpus going down? I know t...  NeighborhoodPizzaGuy   
1354  Our service is not available in your area. Hey...               xDEDANx   
1598                                               Wow            baby_kicked   

                 subreddit  created_utc     date  \
315        crappyoffbrands   1499236239  2017-07   
651             buildmeapc   1501296261  2017-07   
730           pcmasterrace   1500082729  2017-07   
1354                  njpw   1499941432  2017-07   
1598  indianpeoplefacebook   1500924182  2017-07   

                                         tokenized_text  
315      [man, i, do, love, me, some, bandicoot, crash]  
651   [how, good, is, this, pc, for, my, budget, wan...  
730   [when, is, the, pric

In [59]:
tokenized_symptom_data = dict()

for symptom, symptom_data in symptom_datasets.items():
  symptom_data_tokenized = tokenize(symptom_data)
  symptom_data_tokenized = further_cleaning(symptom_data_tokenized)
  symptom_data_tokenized.to_pickle(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom}_data_tokenized.pkl')
  tokenized_symptom_data[symptom] = symptom_data_tokenized

In [28]:
print(symptom_data_tokenized.head())

                                                   text         author  \
1640  Would you Rather Be Superman Or Have a Massive...        torero5   
3187  I have basically everything I’ve really wanted...    Wandertramp   
3842  how do i cope with defeat and failure? so i ju...  that_one_kid9   
4273  How could I be bored with so much work to do? ...        Sraktai   
5346  How to eat decent meals I have been struggling...      MrAfr1can   

              subreddit  created_utc     date  \
1640           selfhelp   1514386325  ression   
3187           selfhelp   1508588646  ression   
3842           selfhelp   1503614431  ression   
4273  whatsbotheringyou   1507628776  ression   
5346           selfhelp   1500861393  ression   

                                         tokenized_text  
1640  [would, you, rather, be, superman, or, have, a...  
3187  [i, have, basically, everything, i, ve, really...  
3842  [how, do, i, cope, with, defeat, and, failure,...  
4273  [how, could, i, be, bo

In [29]:
from collections import Counter

def stop_words(data):
  """Find top 100 words from Reddit dataset to use as stop words"""
  """Args:
      data (pd dataframe): dataset
      Returns:
      stop_words (list[str]): list of stop words
  """

  stop_words = []

  all_tokens = [token for token in data['tokenized_text']]
  all_tokens = [token for sublist in all_tokens for token in sublist]

  token_counts = Counter(all_tokens)
  output_stop_words = [token for token, count in token_counts.most_common(100)]

  return output_stop_words

  #pass

In [30]:
control_stop_words = stop_words(control_data_tokenized)
print(control_stop_words)

['i', 'the', 'to', 'and', 'a', 'of', 'my', 'in', 'it', 'is', 'for', 'that', 'this', 'but', 'on', 'you', 'with', 'was', 'have', 'me', 'so', 'be', 'or', 'just', 'if', 'not', 'what', 'like', 'are', 'as', 'at', 'do', 'about', 'up', 'out', 'can', 'all', 'he', 'from', 'we', 'they', 'her', 'how', 'would', 'she', 'get', 'when', 'one', 'an', 'know', 'had', 'there', 'some', 'been', 'will', 'time', 'any', 'because', 'no', 'more', 'am', 'want', 'your', 'has', 'really', 'people', 'now', 'them', 'amp', 'who', 'other', 'only', 'think', 'by', 'even', 'his', 'back', 'much', 'good', 'then', 'him', 'after', 'also', 'feel', 'go', 'going', 'removed', 'new', 'anyone', 'into', 'make', 'got', 'first', 'could', 'their', 'day', 'than', 'were', 'way', 'which']


In [31]:
def remove_stop_words(data, stop_words):
  """Remove stop words from dataset"""
  """Args:
      data (pd dataframe): dataset
      stop_words (list[str]): list of stop words
      Returns:
      data_without_stop_words (pd dataframe): dataset without stop words
  """
  data_without_stop_words = data.copy()
  data_without_stop_words['tokenized_text'] = data_without_stop_words['tokenized_text'].apply(lambda x: [word for word in x if word not in stop_words])

  return data_without_stop_words

In [32]:
control_data_without_stop_words = remove_stop_words(control_data_tokenized, control_stop_words)
symptom_data_without_stop_words = remove_stop_words(symptom_data_tokenized, control_stop_words)

In [33]:
print(control_data_without_stop_words.head())
print(symptom_data_without_stop_words.head())

                                                   text                author  \
315            Man, I do love me some Bandicoot crash.    BuddermanTheAmazing   
651   How good is this PC for my 700-750$ budget? Wa...   WildernessExploring   
730   When is the price of gpus going down? I know t...  NeighborhoodPizzaGuy   
1354  Our service is not available in your area. Hey...               xDEDANx   
1598                                               Wow            baby_kicked   

                 subreddit  created_utc     date  \
315        crappyoffbrands   1499236239  2017-07   
651             buildmeapc   1501296261  2017-07   
730           pcmasterrace   1500082729  2017-07   
1354                  njpw   1499941432  2017-07   
1598  indianpeoplefacebook   1500924182  2017-07   

                                         tokenized_text  
315                       [man, love, bandicoot, crash]  
651   [pc, budget, gaming, high, ultra, settings, th...  
730   [price, gpus, down, 

In [39]:
# split data up into several pieces and run methods
# I just realized I don't think I actually need this

def split_data(data, n):
  """Split data into n pieces"""
  """Args:
      data (pd dataframe): dataset
      n (int): number of pieces to split data into
      Returns:
      data_pieces (list[pd dataframe]): list of n pieces of data
  """

  return np.array_split(data, n)

def process_chunks(data_chunks):
  """Process chunks of data using the preprocessing methods I wrote above"""
  """Args:
      data_chunks (list[pd dataframe]): list of n pieces of data
      Returns:
      data_without_stop_words (list[pd dataframe]): list of n pieces of data without stop words
  """

  processed_chunks = []
  for chunk in data_chunks:
    chunk = tokenize(chunk)
    chunk = further_cleaning(chunk)
    chunk = remove_stop_words(chunk, control_stop_words)
    processed_chunks.append(chunk)

  return processed_chunks

def combine_chunks(data_chunks):
  """Combine chunks of data into one dataset"""
  """Args:
      data_chunks (list[pd dataframe]): list of n pieces of data
      Returns:
      combined_data (pd dataframe): combined dataset
  """

  combined_data = pd.concat(data_chunks)
  return combined_data

In [None]:
split_original_data = split_data(data, 10)
cleaned_original_data = process_chunks(split_original_data)
combined_original_data = combine_chunks(cleaned_original_data)

# I don't think I actually need this, nvm!

In [55]:
combined_data_without_stop_words = pd.concat([control_data_without_stop_words, symptom_data_without_stop_words])
print(combined_data_without_stop_words.head())

# I spent an embarrassingly long time trying to figure out how to combine two
# gensim Dictionary objects together into another Dictionary before I realized
# I could just do it up here beforehand

                                                   text                author  \
315            Man, I do love me some Bandicoot crash.    BuddermanTheAmazing   
651   How good is this PC for my 700-750$ budget? Wa...   WildernessExploring   
730   When is the price of gpus going down? I know t...  NeighborhoodPizzaGuy   
1354  Our service is not available in your area. Hey...               xDEDANx   
1598                                               Wow            baby_kicked   

                 subreddit  created_utc     date  \
315        crappyoffbrands   1499236239  2017-07   
651             buildmeapc   1501296261  2017-07   
730           pcmasterrace   1500082729  2017-07   
1354                  njpw   1499941432  2017-07   
1598  indianpeoplefacebook   1500924182  2017-07   

                                         tokenized_text  
315                       [man, love, bandicoot, crash]  
651   [pc, budget, gaming, high, ultra, settings, th...  
730   [price, gpus, down, 

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [56]:
# We highly recommend you using the LdaMulticore interface, but feel free to use any other implementations if you prefer.
# from gensim.models import LdaMulticore

# TODO: Your LDA code!

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

dictionary = Dictionary(combined_data_without_stop_words['tokenized_text'])
print(f"Number of unique tokens: {len(dictionary)}")

corpus = [dictionary.doc2bow(text) for text in control_data_without_stop_words['tokenized_text']]
print(f"Corpus size: {len(dictionary)}")

dictionary.save('control_dict.gensim')
with open('control_corpus.pkl', 'wb') as f:
  pickle.dump(corpus, f)

def train_lda_model(dictionary, corpus, num_topics=200, alpha=5, passes=10, workers=4):
    """Train LDA model"""
    """Args:
        dictionary: gensim dictionary that maps words to IDs
        corpus: BoW corpus
        num_topics: number of topics
        alpha: hyperparameter for sparsity of topics per doc
        passes: number of passes
        workers: number of workers
        Returns:
        lda_model: trained LDA model
        topics: list of topics
    """

    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha/num_topics, passes=passes, workers=workers)
    topic_features = []
    for doc in corpus:
      topics = lda_model.get_document_topics(doc)
      topic_features.append([prob for topic, prob in topics])

    topic_features = pd.DataFrame(topic_features)

    return lda_model, topics


Number of unique tokens: 23551
Corpus size: 23551


In [60]:
lda_model, topics = train_lda_model(dictionary, corpus)

lda_model.save('lda_model.gensim')

In [61]:
print(lda_model.print_topics())

# not too sure how to interpret this but ok

[(137, '0.013*"off" + 0.012*"10" + 0.011*"kits" + 0.011*"testing" + 0.011*"drug" + 0.011*"400" + 0.008*"owner" + 0.007*"use" + 0.007*"moments" + 0.007*"needed"'), (126, '0.010*"does" + 0.010*"need" + 0.009*"effect" + 0.009*"winter" + 0.007*"fireworks" + 0.006*"realize" + 0.006*"found" + 0.006*"genius" + 0.005*"two" + 0.005*"anything"'), (31, '0.011*"2meirl4meirl" + 0.009*"myself" + 0.008*"each" + 0.008*"why" + 0.007*"friendship" + 0.007*"rings" + 0.006*"person" + 0.006*"guys" + 0.006*"smart" + 0.006*"these"'), (192, '0.013*"police" + 0.012*"parents" + 0.011*"few" + 0.010*"nmum" + 0.010*"years" + 0.008*"never" + 0.008*"family" + 0.008*"life" + 0.008*"why" + 0.008*"school"'), (54, '0.024*"cheese" + 0.021*"month" + 0.015*"chocolate" + 0.015*"consuming" + 0.015*"describe" + 0.012*"mostly" + 0.009*"ingram" + 0.009*"sg" + 0.009*"craving" + 0.009*"damn"'), (144, '0.044*"watch" + 0.019*"episode" + 0.012*"question" + 0.012*"morning" + 0.012*"questions" + 0.011*"use" + 0.011*"friends" + 0.010*"b

In [63]:
for symptom in tokenized_symptom_data:
  symptom_corpus = [dictionary.doc2bow(text) for text in tokenized_symptom_data[symptom]['tokenized_text']]
  symptom_topic_features = []
  for doc in symptom_corpus:
    topics = lda_model.get_document_topics(doc, minimum_probability=0.0000)
    symptom_topic_features.append([prob for topic, prob in topics])

  symptom_topic_features = pd.DataFrame(symptom_topic_features, columns=[f'topic_{i}' for i in range(200)])
  symptom_topic_features.to_pickle(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom}_topic_features.pkl')

control_corpus = [dictionary.doc2bow(text) for text in control_data_without_stop_words['tokenized_text']]
control_topic_features = []
for doc in control_corpus:
  topics = lda_model.get_document_topics(doc, minimum_probability=0.0000)
  control_topic_features.append([prob for topic, prob in topics])

control_topic_features = pd.DataFrame(control_topic_features, columns=[f'topic_{i}' for i in range(200)])
control_topic_features.to_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/control_topic_features.pkl')

In [64]:
symptom_topic_features_df = pd.read_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/anxiety_topic_features.pkl')
print(symptom_topic_features_df.head())
print(symptom_topic_features.shape)
print(control_topic_features.head())
print(control_topic_features.shape)

    topic_0   topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
0  0.044271  0.000313  0.000313  0.000313  0.000313  0.000313  0.000313   
1  0.002778  0.002778  0.002778  0.002778  0.002778  0.002778  0.002778   
2  0.002273  0.081088  0.002273  0.002273  0.002273  0.002273  0.002273   
3  0.000099  0.000099  0.000099  0.000099  0.000099  0.000099  0.000099   
4  0.001191  0.001191  0.001191  0.001191  0.001191  0.135397  0.001191   

    topic_7   topic_8   topic_9  ...  topic_190  topic_191  topic_192  \
0  0.000313  0.000313  0.000313  ...   0.000313   0.000313   0.134790   
1  0.002778  0.002778  0.002778  ...   0.002778   0.002778   0.002778   
2  0.155012  0.002273  0.002273  ...   0.002273   0.002273   0.002273   
3  0.000099  0.000099  0.000099  ...   0.000099   0.000099   0.137941   
4  0.001191  0.001191  0.001191  ...   0.001191   0.001191   0.001191   

   topic_193  topic_194  topic_195  topic_196  topic_197  topic_198  topic_199  
0   0.000313   0.000313   0.0

## RoBERTa Embeddings

In [74]:
# TODO: Your RoBERTa code!

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = AutoModel.from_pretrained("distilroberta-base", output_hidden_states=True)

model.eval()

def get_roberta_embeddings(text_list, tokenizer, model, layer_index=5, max_length=512, device='cuda' if torch.cuda.is_available() else 'cpu'):
  """Get RoBERTa embeddings"""
  """Args:
      text_list: list of strings
      tokenizer: tokenizer
      model: model
      layer_index: index of layer to extract embeddings from
      max_length: max length of tokens
      device: device
      Returns:
      embeddings: list of embeddings
  """
  model.to(device)

  embeddings = []

  for text in text_list:
    inputs = tokenizer(text, return_tensors="pt", padding=True, max_length=max_length, truncation=True).to(device)
    with torch.no_grad():
      outputs = model(**inputs)

    hidden_states = outputs.hidden_states
    layer_hidden_state = hidden_states[layer_index]

    batch_embeddings = layer_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    embeddings.append(batch_embeddings)

  return embeddings



In [75]:
control_texts = control_data_without_stop_words['tokenized_text'].apply(lambda x: ' '.join(x)).tolist()
for symptom in tokenized_symptom_data:
  symptom_texts = tokenized_symptom_data[symptom]['tokenized_text'].apply(lambda x: ' '.join(x)).tolist()
  symptom_embeddings = get_roberta_embeddings(symptom_texts, tokenizer, model)
  np.save(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom}_roberta_embeddings.npy', symptom_embeddings)

control_embeddings = get_roberta_embeddings(control_texts, tokenizer, model)


np.save('control_embeddings.npy', control_embeddings)


KeyboardInterrupt: 

## Main

In [67]:
def main(X, y, symptom_name):
  """
  Here's the basic structure of the main block! It should run
  5-fold cross validation with random forest to evaluate your RoBERTa and LDA
  performance.
  """
  """Args:
      X (np array): features
      y (np array): labels
      symptom_name (str): name of the symptom
      Returns:
      None
  """

  rf_classifier = RandomForestClassifier()
  cv = KFold(n_splits=5, shuffle=True)
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

  # TODO: Print your training and testing scores!

  print(f"Results for symptom: {symptom_name}")
  print(f"Training AUC Scores: {results['train_score']}")
  print(f"Mean Training AUC: {results['train_score'].mean():.4f}")
  print(f"Testing AUC Scores: {results['test_score']}")
  print(f"Mean Testing AUC: {results['test_score'].mean():.4f}\n")


  #pass

In [69]:
for symptom_name in symptoms_to_subreddits:
  symptom_topic_features_df = pd.read_pickle(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom_name}_topic_features.pkl')
  control_features_df = pd.read_pickle('drive/MyDrive/Colab Notebooks/CS1460 final project/control_topic_features.pkl')

  control_features_df['label'] = 0
  symptom_topic_features_df['label'] = 1

  combined_features_df = pd.concat([control_features_df, symptom_topic_features_df])

  X = combined_features_df.drop(columns=['label']).values
  y = combined_features_df['label'].values

  print(f"Evaluating symptom: {symptom_name}")
  main(X, y, symptom_name)

Evaluating symptom: anger
Results for symptom: anger
Training AUC Scores: [0.9986838  0.99899571 0.99878624 0.99875659 0.99904192]
Mean Training AUC: 0.9989
Testing AUC Scores: [0.94184957 0.93227013 0.94753973 0.94392562 0.93500393]
Mean Testing AUC: 0.9401

Evaluating symptom: anhedonia
Results for symptom: anhedonia
Training AUC Scores: [0.99951475 0.99948813 0.99953167 0.99957816 0.99954374]
Mean Training AUC: 0.9995
Testing AUC Scores: [0.97204433 0.97082255 0.96928778 0.97403482 0.97245053]
Mean Testing AUC: 0.9717

Evaluating symptom: anxiety
Results for symptom: anxiety
Training AUC Scores: [0.99981409 0.99986978 0.99984216 0.99984042 0.99982361]
Mean Training AUC: 0.9998
Testing AUC Scores: [0.97142096 0.971807   0.97149301 0.97121464 0.96634676]
Mean Testing AUC: 0.9705

Evaluating symptom: concentration_deficit
Results for symptom: concentration_deficit
Training AUC Scores: [1. 1. 1. 1. 1.]
Mean Training AUC: 1.0000
Testing AUC Scores: [0.45995423 0.7076659  0.45714286 0.810

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py", line 640, in roc_auc_score
    return _average_binary_score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_base.py", line 76, in _average_binary_score
    return binary_metric(y_true, y_score, sample_w

Results for symptom: fatigue
Training AUC Scores: [ 1.  1.  1.  1. nan]
Mean Training AUC: nan
Testing AUC Scores: [nan nan nan nan nan]
Mean Testing AUC: nan

Evaluating symptom: loneliness
Results for symptom: loneliness
Training AUC Scores: [0.99957846 0.99952324 0.99939676 0.99948642 0.9994633 ]
Mean Training AUC: 0.9995
Testing AUC Scores: [0.9177197  0.91086898 0.90765183 0.92050245 0.90432474]
Mean Testing AUC: 0.9122

Evaluating symptom: sad_mood
Results for symptom: sad_mood
Training AUC Scores: [0.99898535 0.99928234 0.99899907 0.99929781 0.99934273]
Mean Training AUC: 0.9992
Testing AUC Scores: [0.91433293 0.90557874 0.89420079 0.89631076 0.90149045]
Mean Testing AUC: 0.9024

Evaluating symptom: self_loathing
Results for symptom: self_loathing
Training AUC Scores: [0.99947343 0.99940909 0.99946443 0.99926076 0.99950263]
Mean Training AUC: 0.9994
Testing AUC Scores: [0.93389802 0.93086735 0.94054409 0.9376834  0.93757002]
Mean Testing AUC: 0.9361

Evaluating symptom: sleep_pr

In [None]:
for symptom_name in symptoms_to_subreddits:
  symptom_roberta_embeddings = np.load(f'drive/MyDrive/Colab Notebooks/CS1460 final project/{symptom_name}_roberta_embeddings.npy')
  control_roberta_embeddings = np.load(f'drive/MyDrive/Colab Notebooks/CS1460 final project/control_embeddings.npy')

  X = np.concatenate([symptom_roberta_embeddings, control_roberta_embeddings])
  y = np.concatenate([np.ones(symptom_roberta_embeddings.shape[0]), np.zeros(control_roberta_embeddings.shape[0])])

  main(X, y, symptom_name)