In [56]:
import zstandard as zstd
import io
import orjson as json
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import words
import regex as re

# Reading Data

In [60]:
def read_zst_lines(path: str, max_lines: int = None, subreddit_whitelist: set[str] = None):
    """
    Streams a .zst file line by line, yielding only JSON rows from whitelisted subreddits.
    Shows a tqdm progress bar for lines read.
    """
    dctx = zstd.ZstdDecompressor()
    with open(path, 'rb') as fh:
        with dctx.stream_reader(fh) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            sub_count = 0
            for i, line in enumerate(tqdm(text_stream, desc=f"Reading {path}", unit="lines")):
                if not subreddit_whitelist and max_lines and i >= max_lines:
                    break
                try:
                    if subreddit_whitelist and max_lines and sub_count >= max_lines:
                        break
                    post = json.loads(line)
                    sub = post.get("subreddit")
                    if subreddit_whitelist and sub not in subreddit_whitelist:
                        continue
                    yield post
                    sub_count += 1
                except (json.JSONDecodeError, UnicodeDecodeError):
                    continue

In [12]:
subs = {'teenagers'}
data = list(read_zst_lines("RS_2023-01.zst", max_lines=10000, subreddit_whitelist=subs))
df = pd.DataFrame(data)

Reading RS_2023-01.zst: 3038352lines [00:39, 76074.91lines/s]


# Data Preprocessing

In [284]:
df = pd.read_csv('depression_casualuk.csv')

In [18]:
def data_stats(df):
    # Basic statistics per subreddit
    subs = df['subreddit'].unique()

    for subreddit in subs:
        subset = df[df['subreddit'] == subreddit]
        print(f"Statistics for subreddit '{subreddit}':")
        print(f"Total posts: {len(subset)}")
        print(f"Average title length: {subset['title'].str.len().mean()}")
        print(f"Average selftext length: {subset['selftext'].str.len().mean()}")
        print(f"Maximum title length: {subset['title'].str.len().max()}")
        print(f"Maximum selftext length: {subset['selftext'].str.len().max()}")
        print()
data_stats(df)

Statistics for subreddit 'teenagers':
Total posts: 10000
Average title length: 46.7616
Average selftext length: 59.3895
Maximum title length: 300
Maximum selftext length: 13534



In [76]:
def preprocess_text(text: str) -> str:
    try:
        if text is None:
            return pd.NA
        if not isinstance(text, str):
            return pd.NA
        if '[removed]' in text or '[deleted]' in text:
            return pd.NA
        # text = re.sub(r'[^\x00-\x7F]+', '', text) # Remove non-ASCII
        text = re.sub(r'\p{Emoji}', '', text) # Remove emojis
        text = text.replace('\uFE0F', '') # Remove variation selector for emojis
        text = re.sub(r'http[s]?://\S+', '', text) # Remove URLs
        text = text.strip()
        return text
    except Exception as e:
        print(f"Error processing text: {text!r}\nException: {e}")

def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['subreddit', 'title', 'selftext']].copy()
    for col in ['title', 'selftext']:
        for idx, text in df[col].items():
            try:
                df.at[idx, col] = preprocess_text(text)
            except Exception as e:
                print(f"Error at row {idx}, column '{col}': {text!r}")
    df = df.dropna(subset=['title', 'selftext'], how='all')
    df = df.replace(pd.NA, '')
    return df

# df = filter_dataframe(df)

In [None]:
def detect_non_english(text):
    if not isinstance(text, str):
        return False
    text = re.sub(r'[^\w\s]', '', text)
    non_ascii = [c for c in text if ord(c) > 127]
    # if non_ascii:
    #     print(f"Non-ASCII characters: {non_ascii}")
    return bool(non_ascii)

def remove_non_english_posts(df):
    mask = df.apply(lambda row: detect_non_english(row['title']) or detect_non_english(row['selftext']), axis=1)
    return df[~mask]

text = 'happy new year'
# Remove the posts containing 'happy new year'
def remove_happy_new_year_posts(df: pd.DataFrame) -> pd.DataFrame:
    mask = df['title'].str.contains(text, case=True, na=False) & df['selftext'].isna()
    return df[~mask]

# df.to_csv('depression_casualuk_teenagers_cleaned_no_happy_new_year.csv', index=False)

# print("Detecting non-English content in the DataFrame...")
# for index, row in df.iterrows():
#     if detect_non_english(row['title']) or detect_non_english(row['selftext']):
#         print(f"Non-English content detected in row {index}:")
#         print(f"Title: {row['title']}")
#         print(f"Selftext: {row['selftext']}")
#         print("-" * 80)

# df = remove_non_english_posts(df)
# print("After removing non-English posts, checking again...")
# for index, row in df.iterrows():
#     if detect_non_english(row['title']) or detect_non_english(row['selftext']):
#         print(f"Non-English content detected in row {index}:")
#         print(f"Title: {row['title']}")
#         print(f"Selftext: {row['selftext']}")
#         print("-" * 80)

In [25]:
# Basic statistics per subreddit
subs = df['subreddit'].unique()

for subreddit in subs:
    subset = df[df['subreddit'] == subreddit]
    print(f"Statistics for subreddit '{subreddit}':")
    print(f"Total posts: {len(subset)}")
    print(f"Average title length: {subset['title'].str.len().mean()}")
    print(f"Average selftext length: {subset['selftext'].str.len().mean()}")
    print(f"Maximum title length: {subset['title'].str.len().max()}")
    print(f"Maximum selftext length: {subset['selftext'].str.len().max()}")
    print()

Statistics for subreddit 'teenagers':
Total posts: 9909
Average title length: 45.987687960440006
Average selftext length: 52.24260773034615
Maximum title length: 300
Maximum selftext length: 13508



In [26]:
df.to_csv('teenagers_cleaned.csv', index=False)

# Obtaining Non-Depression Subreddits

In [133]:
subs = {'Anxiety'}
data = list(read_zst_lines("RS_2023-01.zst", subreddit_whitelist=subs))
df = pd.DataFrame(data)

Reading RS_2023-01.zst: 36090941lines [26:38, 22578.60lines/s]


In [134]:
df = filter_dataframe(df)
df = remove_non_english_posts(df)
df = remove_happy_new_year_posts(df)

In [135]:
df

Unnamed: 0,subreddit,title,selftext
0,Anxiety,"When you are at the end, keep pushing.",
1,Anxiety,Anxiety so bad I want to kms,I have had panic attacks on and off for so man...
2,Anxiety,Autophobia,
3,Anxiety,Anxiety meds,Hey guys I’m an f and I want to get started on...
4,Anxiety,Tingling only in feet from Anxiety?,I had a foot issue around months ago (stabbin...
...,...,...,...
8258,Anxiety,Do you guys get butterflies in your chest/sens...,Especially in anticipation of something? Or ju...
8259,Anxiety,Anxiety just before sleeping,Is it common? To me this is kind of usual and ...
8260,Anxiety,Feeling overwhelmed while trying to take care ...,
8261,Anxiety,Wisdom teeth and mental health ?,i just got my wisdom teeth out yesterday and h...


In [139]:
len(df)

8216

# Further Preprocessing

In [128]:
df = pd.read_csv('anxiety.csv')

In [130]:
# For each .csv in src/data
# Remove happy new year posts
# Code below

import os

for filename in os.listdir():
    if filename == 'test.csv':
        continue
    if filename.endswith('.csv'):
        file_path = filename
        df = pd.read_csv(file_path)
        print(f"Before processing {filename}: {len(df)} posts")
        df = remove_happy_new_year_posts(df)
        print(f"After processing {filename}: {len(df)} posts")
        df.to_csv(file_path, index=False)

Before processing casualuk.csv: 8240 posts
After processing casualuk.csv: 8158 posts
Before processing suicidewatch.csv: 14418 posts
After processing suicidewatch.csv: 14382 posts
Before processing depression_casualuk_teenagers_no_happy_new_year.csv: 21024 posts
After processing depression_casualuk_teenagers_no_happy_new_year.csv: 20865 posts
Before processing adhd.csv: 13951 posts
After processing adhd.csv: 13935 posts


  df = pd.read_csv(file_path)


Before processing askreddit.csv: 287138 posts
After processing askreddit.csv: 285783 posts
Before processing anxiety.csv: 828 posts
After processing anxiety.csv: 815 posts
Before processing teenagers.csv: 89529 posts
After processing teenagers.csv: 89252 posts


# Data Analysis

In [28]:
# temp = pd.read_csv('depression_casualuk_cleaned.csv')
df = pd.read_csv('depression_casualuk_teenagers_cleaned.csv')

In [55]:
# Get size of depression and casualuk subreddits
depression = df[df['subreddit'] == 'depression']
casualuk = df[df['subreddit'] == 'CasualUK']
teenagers = df[df['subreddit'] == 'teenagers']
print(f"Size of 'depression' subreddit: {len(depression)}")
print(f"Size of 'CasualUK' subreddit: {len(casualuk)}")
print(f"Size of 'teenagers' subreddit: {len(teenagers)}")

Size of 'depression' subreddit: 6046
Size of 'CasualUK' subreddit: 5154
Size of 'teenagers' subreddit: 9824


In [46]:
# Perform text analysis on data

df = pd.read_csv("depression_casualuk_teenagers_cleaned_no_happy_new_year.csv")
# Replace np.nan with '' in 'selftext' column
df['selftext'] = df['selftext'].fillna('')

# Perform stopword removal, tokenization, and frequency counting
depression = df[df['subreddit'] == 'depression']
casualuk = df[df['subreddit'] == 'CasualUK']
teenagers = df[df['subreddit'] == 'teenagers']
def count_word_frequency(texts):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    word_freq = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    return word_freq.sum().sort_values(ascending=False)
# Drop rows with NaN values in 'title'
depression = depression.dropna(subset=['title'])
casualuk = casualuk.dropna(subset=['title'])
teenagers = teenagers.dropna(subset=['title'])

depression_title_freq = count_word_frequency(depression['title'])
casualuk_title_freq = count_word_frequency(casualuk['title'])
teenagers_title_freq = count_word_frequency(teenagers['title'])
depression_selftext_freq = count_word_frequency(depression['selftext'])
casualuk_selftext_freq = count_word_frequency(casualuk['selftext'])
teenagers_selftext_freq = count_word_frequency(teenagers['selftext'])
print("Top 10 words in depression titles:")
print(depression_title_freq.head(10))
print("Top 10 words in CasualUK titles:")
print(casualuk_title_freq.head(10))
print("Top 10 words in teenagers titles:")
print(teenagers_title_freq.head(10))
print("Top 10 words in depression selftexts:")
print(depression_selftext_freq.head(10))
print("Top 10 words in CasualUK selftexts:")
print(casualuk_selftext_freq.head(10))
print("Top 10 words in teenagers selftexts:")
print(teenagers_selftext_freq.head(10))

Top 10 words in depression titles:
depression    503
feel          502
life          440
don           409
just          405
want          327
like          316
depressed     281
know          266
help          222
dtype: int64
Top 10 words in CasualUK titles:
just     300
uk       233
like     231
new      184
does     165
got      150
know     145
today    141
ve       123
day      121
dtype: int64
Top 10 words in teenagers titles:
like      598
just      548
new       432
year      426
guys      354
people    352
im        271
don       264
want      264
got       245
dtype: int64
Top 10 words in depression selftexts:
just      9995
like      8143
don       7340
feel      7328
life      4980
know      4920
want      4826
ve        4575
time      3686
really    3152
dtype: int64
Top 10 words in CasualUK selftexts:
just      630
ve        493
like      461
know      325
don       313
people    304
time      284
got       240
going     238
think     210
dtype: int64
Top 10 words in tee

In [47]:
# Perform LDA

def perform_lda(texts, n_topics=3, n_top_words=10):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    
    feature_names = vectorizer.get_feature_names_out()
    
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[-n_top_words:][::-1]
        top_features = [feature_names[i] for i in top_features_ind]
        topics.append((topic_idx, top_features))
    
    return topics

# Perform LDA on depression titles and selftexts
depression_title_topics = perform_lda(depression['title'], n_topics=3, n_top_words=10)
depression_selftext_topics = perform_lda(depression['selftext'], n_topics=3, n_top_words=10)

# Perform LDA on CasualUK titles and selftexts
casualuk_title_topics = perform_lda(casualuk['title'], n_topics=3, n_top_words=10)
casualuk_selftext_topics = perform_lda(casualuk['selftext'], n_topics=3, n_top_words=10)

# Perform LDA on teenagers titles and selftexts
teenagers_title_topics = perform_lda(teenagers['title'], n_topics=3, n_top_words=10)
teenagers_selftext_topics = perform_lda(teenagers['selftext'], n_topics=3, n_top_words=10)

# Print LDA topics
def print_lda_topics(topics, subreddit, text_type):
    print(f"LDA Topics for {subreddit} {text_type}:")
    for topic_idx, top_features in topics:
        print(f"Topic {topic_idx}: {', '.join(top_features)}")
    print()

print_lda_topics(depression_title_topics, 'depression', 'titles')
print_lda_topics(depression_selftext_topics, 'depression', 'selftexts')
print_lda_topics(casualuk_title_topics, 'CasualUK', 'titles')
print_lda_topics(casualuk_selftext_topics, 'CasualUK', 'selftexts')
print_lda_topics(teenagers_title_topics, 'teenagers', 'titles')
print_lda_topics(teenagers_selftext_topics, 'teenagers', 'selftexts')

LDA Topics for depression titles:
Topic 0: feel, like, life, going, lost, better, depression, point, time, day
Topic 1: don, depression, know, want, anymore, just, need, advice, does, thoughts
Topic 2: depressed, life, help, just, tired, hate, im, feeling, people, depression

LDA Topics for depression selftexts:
Topic 0: did, like, depression, said, life, went, took, people, pain, day
Topic 1: just, like, don, feel, want, know, life, ve, people, time
Topic 2: just, ve, depression, like, feel, don, time, help, work, life

LDA Topics for CasualUK titles:
Topic 0: uk, new, got, time, january, morning, like, day, just, good
Topic 1: just, new, best, way, uk, tea, british, like, seen, late
Topic 2: like, just, know, does, people, did, today, uk, don, ve

LDA Topics for CasualUK selftexts:
Topic 0: like, just, night, ve, going, people, think, know, gt, don
Topic 1: just, thread, ve, like, people, time, know, don, welcome, really
Topic 2: just, ve, like, know, don, got, time, people, work, wa

# Check Depressive Posts in non-r/Depression Subreddits

In [None]:
# Check whether there are depressive posts in r/Teenagers and/or r/CasualUK

def check_depressive_posts(df):
    """
    Check for depressive posts in a given subreddit DataFrame.
    Returns a DataFrame of posts that may indicate depression.
    """
    keywords = ['depressed', 'depression', 'sad', 'unhappy', 'lonely', 'hopeless']
    mask = df['title'].str.contains('|'.join(keywords), case=False, na=False) | \
           df['selftext'].str.contains('|'.join(keywords), case=False, na=False)
    return df[mask]
teenagers_depressive_posts = check_depressive_posts(teenagers)
casualuk_depressive_posts = check_depressive_posts(casualuk)
print(f"Number of potentially depressive posts in r/teenagers: {len(teenagers_depressive_posts)}")
print(f"Number of potentially depressive posts in r/CasualUK: {len(casualuk_depressive_posts)}")
print(f"Number of potentially depressive posts in all subreddits: {len(teenagers_depressive_posts) + len(casualuk_depressive_posts)}")

Number of potentially depressive posts in r/teenagers: 185
Number of potentially depressive posts in r/CasualUK: 47
Number of potentially depressive posts in all subreddits: 232
