In [2]:
import zstandard as zstd
import io
import orjson as json
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import words
import regex as re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Reading Data

In [11]:
def read_zst_lines(path: str, max_lines: int = None, subreddit_whitelist: set[str] = None):
    """
    Streams a .zst file line by line, yielding only JSON rows from whitelisted subreddits.
    Shows a tqdm progress bar for lines read.
    """
    dctx = zstd.ZstdDecompressor()
    with open(path, 'rb') as fh:
        with dctx.stream_reader(fh) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            sub_count = 0
            for i, line in enumerate(tqdm(text_stream, desc=f"Reading {path}", unit="lines")):
                if not subreddit_whitelist and max_lines and i >= max_lines:
                    break
                try:
                    if subreddit_whitelist and sub_count >= max_lines:
                        break
                    post = json.loads(line)
                    sub = post.get("subreddit")
                    if subreddit_whitelist and sub not in subreddit_whitelist:
                        continue
                    yield post
                    sub_count += 1
                except (json.JSONDecodeError, UnicodeDecodeError):
                    continue

In [12]:
subs = {'teenagers'}
data = list(read_zst_lines("RS_2023-01.zst", max_lines=10000, subreddit_whitelist=subs))
df = pd.DataFrame(data)

Reading RS_2023-01.zst: 3038352lines [00:39, 76074.91lines/s]


# Data Preprocessing

In [284]:
df = pd.read_csv('depression_casualuk.csv')

In [18]:
def data_stats(df):
    # Basic statistics per subreddit
    subs = df['subreddit'].unique()

    for subreddit in subs:
        subset = df[df['subreddit'] == subreddit]
        print(f"Statistics for subreddit '{subreddit}':")
        print(f"Total posts: {len(subset)}")
        print(f"Average title length: {subset['title'].str.len().mean()}")
        print(f"Average selftext length: {subset['selftext'].str.len().mean()}")
        print(f"Maximum title length: {subset['title'].str.len().max()}")
        print(f"Maximum selftext length: {subset['selftext'].str.len().max()}")
        print()
data_stats(df)

Statistics for subreddit 'teenagers':
Total posts: 10000
Average title length: 46.7616
Average selftext length: 59.3895
Maximum title length: 300
Maximum selftext length: 13534



In [23]:
def preprocess_text(text: str) -> str:
    try:
        if text is None:
            return pd.NA
        if not isinstance(text, str):
            return pd.NA
        if '[removed]' in text or '[deleted]' in text:
            return pd.NA
        # text = re.sub(r'[^\x00-\x7F]+', '', text) # Remove non-ASCII
        text = re.sub(r'\p{Emoji}', '', text) # Remove emojis
        text = text.replace('\uFE0F', '') # Remove variation selector for emojis
        text = re.sub(r'http[s]?://\S+', '', text) # Remove URLs
        text = text.strip()
        return text
    except Exception as e:
        print(f"Error processing text: {text!r}\nException: {e}")

def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['subreddit', 'title', 'selftext']].copy()
    for col in ['title', 'selftext']:
        for idx, text in df[col].items():
            try:
                df.at[idx, col] = preprocess_text(text)
            except Exception as e:
                print(f"Error at row {idx}, column '{col}': {text!r}")
    df = df.dropna(subset=['title', 'selftext'], how='all')
    df = df.replace(pd.NA, '')
    return df

df = filter_dataframe(df)

In [24]:
def detect_non_english(text):
    if not isinstance(text, str):
        return False
    text = re.sub(r'[^\w\s]', '', text)
    non_ascii = [c for c in text if ord(c) > 127]
    # if non_ascii:
    #     print(f"Non-ASCII characters: {non_ascii}")
    return bool(non_ascii)

def remove_non_english_posts(df):
    mask = df.apply(lambda row: detect_non_english(row['title']) or detect_non_english(row['selftext']), axis=1)
    return df[~mask]

# for index, row in df.iterrows():
#     if detect_non_english(row['title']) or detect_non_english(row['selftext']):
#         print(f"Non-English content detected in row {index}:")
#         print(f"Title: {row['title']}")
#         print(f"Selftext: {row['selftext']}")
#         print("-" * 80)

df = remove_non_english_posts(df)

for index, row in df.iterrows():
    if detect_non_english(row['title']) or detect_non_english(row['selftext']):
        print(f"Non-English content detected in row {index}:")
        print(f"Title: {row['title']}")
        print(f"Selftext: {row['selftext']}")
        print("-" * 80)

In [25]:
# Basic statistics per subreddit
subs = df['subreddit'].unique()

for subreddit in subs:
    subset = df[df['subreddit'] == subreddit]
    print(f"Statistics for subreddit '{subreddit}':")
    print(f"Total posts: {len(subset)}")
    print(f"Average title length: {subset['title'].str.len().mean()}")
    print(f"Average selftext length: {subset['selftext'].str.len().mean()}")
    print(f"Maximum title length: {subset['title'].str.len().max()}")
    print(f"Maximum selftext length: {subset['selftext'].str.len().max()}")
    print()

Statistics for subreddit 'teenagers':
Total posts: 9909
Average title length: 45.987687960440006
Average selftext length: 52.24260773034615
Maximum title length: 300
Maximum selftext length: 13508



In [26]:
df.to_csv('teenagers_cleaned.csv', index=False)

# Data Analysis

In [27]:
temp = pd.read_csv('depression_casualuk_cleaned.csv')
df = pd.concat([df, temp], ignore_index=True)

In [29]:
# Get size of depression and casualuk subreddits
depression = df[df['subreddit'] == 'depression']
casualuk = df[df['subreddit'] == 'CasualUK']
teenagers = df[df['subreddit'] == 'teenagers']
print(f"Size of 'depression' subreddit: {len(depression)}")
print(f"Size of 'CasualUK' subreddit: {len(casualuk)}")
print(f"Size of 'teenagers' subreddit: {len(teenagers)}")

Size of 'depression' subreddit: 6046
Size of 'CasualUK' subreddit: 5167
Size of 'teenagers' subreddit: 9909


In [None]:
# Perform text analysis on data

data = pd.read_csv("depression_casualuk_teenagers_cleaned.csv")
# Example text analysis: Count word frequency in titles and selftexts
# Perform stopword removal, tokenization, and frequency counting
depression = data[data['subreddit'] == 'depression']
casualuk = data[data['subreddit'] == 'CasualUK']
def count_word_frequency(texts):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    word_freq = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    return word_freq.sum().sort_values(ascending=False)
# Drop rows with NaN values in 'title' or 'selftext'
depression = depression.dropna(subset=['title', 'selftext'])
casualuk = casualuk.dropna(subset=['title', 'selftext'])

depression_title_freq = count_word_frequency(depression['title'])
casualuk_title_freq = count_word_frequency(casualuk['title'])
depression_selftext_freq = count_word_frequency(depression['selftext'])
casualuk_selftext_freq = count_word_frequency(casualuk['selftext'])
print("Top 10 words in depression titles:")
print(depression_title_freq.head(10))
print("Top 10 words in CasualUK titles:")
print(casualuk_title_freq.head(10))
print("Top 10 words in depression selftexts:")
print(depression_selftext_freq.head(10))
print("Top 10 words in CasualUK selftexts:")
print(casualuk_selftext_freq.head(10))



Top 10 words in depression titles:
feel          502
depression    502
life          440
don           409
just          404
want          327
like          316
depressed     281
know          266
help          222
dtype: int64
Top 10 words in CasualUK titles:
january    89
uk         80
just       75
new        71
does       63
thread     61
today      55
like       47
best       43
got        43
dtype: int64
Top 10 words in depression selftexts:
just      9995
like      8143
don       7340
feel      7328
life      4980
know      4920
want      4826
ve        4575
time      3686
really    3152
dtype: int64
Top 10 words in CasualUK selftexts:
just      630
ve        493
like      461
know      325
don       313
people    304
time      284
got       240
going     238
think     210
dtype: int64


In [299]:
# Perform LDA

def perform_lda(texts, n_topics=5, n_top_words=10):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    
    feature_names = vectorizer.get_feature_names_out()
    
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[-n_top_words:][::-1]
        top_features = [feature_names[i] for i in top_features_ind]
        topics.append((topic_idx, top_features))
    
    return topics

# Perform LDA on depression titles and selftexts
depression_title_topics = perform_lda(depression['title'], n_topics=5, n_top_words=10)
depression_selftext_topics = perform_lda(depression['selftext'], n_topics=5, n_top_words=10)

# Perform LDA on CasualUK titles and selftexts
casualuk_title_topics = perform_lda(casualuk['title'], n_topics=5, n_top_words=10)
casualuk_selftext_topics = perform_lda(casualuk['selftext'], n_topics=5, n_top_words=10)

# Print LDA topics
def print_lda_topics(topics, subreddit, text_type):
    print(f"LDA Topics for {subreddit} {text_type}:")
    for topic_idx, top_features in topics:
        print(f"Topic {topic_idx}: {', '.join(top_features)}")
    print()

print_lda_topics(depression_title_topics, 'depression', 'titles')
print_lda_topics(depression_selftext_topics, 'depression', 'selftexts')
print_lda_topics(casualuk_title_topics, 'CasualUK', 'titles')
print_lda_topics(casualuk_selftext_topics, 'CasualUK', 'selftexts')

LDA Topics for depression titles:
Topic 0: depressed, just, depression, day, im, wish, thoughts, work, lonely, idk
Topic 1: don, know, want, anymore, life, just, lost, time, die, depression
Topic 2: hate, life, going, think, depression, year, ve, love, just, bad
Topic 3: feel, like, depression, help, better, life, feeling, point, does, getting
Topic 4: need, tired, help, life, just, depression, want, really, advice, talk

LDA Topics for depression selftexts:
Topic 0: depression, pain, day, stop, self, add, said, treatment, did, year
Topic 1: just, like, life, time, don, know, got, ve, years, year
Topic 2: ve, depression, just, help, feel, work, like, time, don, need
Topic 3: just, like, feel, don, want, know, life, people, ve, time
Topic 4: ve, just, don, like, know, today, ll, told, room, away

LDA Topics for CasualUK titles:
Topic 0: like, just, uk, thing, does, use, days, old, don, good
Topic 1: uk, does, best, better, english, live, tea, new, help, feel
Topic 2: thread, january, la