The below code snippet was used to find out top 30 topics from each week. At first I loaded the files used stopwords and wordnet to create a custom stopwords list and then wrote a function to clean the text and limitize it. And then loaded the JSON files and created the created text along with weeks. So my final file representation represents the week and the text together. And then after combining posts and comments for each week together, IronTF idea vectorizer with Max df as .9, mean DF as 1 and then Extracted feature names and scores for top 30 topics.

In [1]:
import json
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import nltk
import re
import string
import os

In [41]:
## loading and preprocessing the data

nltk.download('stopwords')
nltk.download('wordnet')

# === PARAMETERS ===
SINGLE_STOPWORDS = set(stopwords.words('english'))
SINGLE_STOPWORDS = set(stopwords.words('english')).union({
    # General fillers
    'get', 'got', 'like', 'know', 'one', 'thing', 'things', 'really', 'would', 'could', 'also',
    'even', 'make', 'much', 'many', 'say', 'said', 'well', 'still', 'dont', 'didnt', 'cant', 'im',
    'us', 'use', 'using', 'way', 'need', 'want', 'think', 'go', 'going', 'take', 'give', 'see', 'may',

    # Contextless personal pronouns
    'i', 'you', 'he', 'she', 'we', 'they', 'me', 'him', 'her', 'them', 'your', 'my', 'our', 'their',

    # Domain-generic words
    'agency', 'federal', 'employee', 'employees', 'government', 'position', 'job', 'jobs', 'office',
    'manager', 'supervisor', 'human', 'resource', 'work', 'working', 'time', 'day', 'year', 'month',
    'week', 'leave', 'pay', 'email', 'contact', 'title', 'context', 'question', 'concern', 'thank',
    'please', 'subreddit', 'moderator', 'bot', 'automatically', 'performed', 'message', 'compose',

    # Reddit/meta-specific terms
    'removed', 'deleted', 'post', 'thread', 'comment', 'reply', 'upvote', 'downvote',

    
})

BIGRAM_STOPWORDS = {
    # Contractions (as bigrams often yield them)
    'you re', 'they re', 'we re', 'it s', 'that s', 'what s', 'who s', 'isn t', 'wasn t', 'aren t',
    'doesn t', 'don t', 'didn t', 'hasn t', 'haven t', 'hadn t', 'can t', 'couldn t', 'shouldn t',

    # Low-value system phrases from earlier outputs
    'provide context', 'context title', 'please contact', 'bot action', 'message compose',
    'fednews question', 'action performed', 'performed automatically', 'compose fednews',
    'moderator subreddit', 'subreddit message', 'contact moderator', 'question concern',
    'automatically please', 'message bot', 'bot message', 'automatically performed', 'gif giphy',
}

lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)


    # Step 1: Tokenize
    words = text.split()

    # Step 2: Remove single-word stopwords
    words = [w for w in words if w not in SINGLE_STOPWORDS and len(w) > 2]

    # Step 3: Remove bigram stopwords
    i = 0
    filtered_words = []
    while i < len(words):
        if i < len(words) - 1:
            bigram = f"{words[i]} {words[i+1]}"
            if bigram in BIGRAM_STOPWORDS:
                # Debug print
                # print(f"Removing bigram: {bigram}")
                i += 2
                continue
        filtered_words.append(words[i])
        i += 1

    # Step 4: Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in filtered_words]
    if len(words) != len(filtered_words): print("Removed bigrams")

    # tokens = text.split()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in CUSTOM_STOPWORDS and len(word) > 2]
    return " ".join(tokens)


def load_jsonl(file_path, text_fields):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
    rows = []
    for item in data:
        if any(item.get(field, '') in ['[removed]', '[deleted]'] for field in text_fields):
            continue
        text = ' '.join(item.get(field, '') for field in text_fields)
        timestamp = item.get('created_utc')
        if text and timestamp:
            dt = datetime.utcfromtimestamp(timestamp)
            week = dt.strftime('%Y-%U')
            rows.append((week, clean_text(text)))
    return rows




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/csgrads/shahr072/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/csgrads/shahr072/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# === LOAD POSTS AND COMMENTS ===
posts = load_jsonl('../2_months_data/filtered_posts.jsonl', ['title', 'selftext'])
comments = load_jsonl('../2_months_data/filtered_comments.jsonl', ['body'])

  dt = datetime.utcfromtimestamp(timestamp)


Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed bigrams
Removed 

In [43]:
# === COMBINE POSTS AND COMMENTS BY WEEK ===
combined_by_week = defaultdict(list)
for week, text in posts + comments:
    combined_by_week[week].append(text)

print(f"Number of weeks: {len(combined_by_week)}")
# sample of combined_by_week
for week, texts in list(combined_by_week.items())[:3]:
    print(f"Week: {week}, Number of documents: {len(texts)}")
    print("Sample text:", texts[0][:100], "...")
    print()


Number of weeks: 9
Week: 2025-00, Number of documents: 5798
Sample text: connecticut fed health insurance pick remote new health insurance pick old wife asthma looking high  ...

Week: 2025-01, Number of documents: 14665
Sample text: significance accruing hour last ppd noticed accrued hour instead ppd wondering accrue additional hou ...

Week: 2025-02, Number of documents: 18272
Sample text: significance coming probation period nonbargaining appeal right additional protection completing pro ...



In [44]:
# extracting Top 30 topics for each week.

# === PER-WEEK TOPIC EXTRACTION ===
weekly_topics = {}

for week, docs in combined_by_week.items():
    joined_text = docs  # each doc is a separate document
    # joined_text = [' '.join(docs)]  # all docs in a week are combined into one document
    #vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))
    # max_df=0.95 Ignore terms that appear in more than 95% of documents (here, weeks).
    # min_df=2 Ignore terms that appear in less than 2 documents (here, weeks).
    # ngram_range=(1, 2) Use unigrams and bigrams. captures more specific phrases like "telework policy" or "hiring freeze" that are more informative than single words.
    #vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, ngram_range=(1, 2))
    vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, ngram_range=(2, 2)) # checking bigrams
    
    tfidf_matrix = vectorizer.fit_transform(joined_text)
    feature_names = vectorizer.get_feature_names_out()
    #scores = tfidf_matrix.toarray()[0]
    scores = tfidf_matrix.mean(axis=0).A1  # average tf-idf across all docs in that week
    top_indices = scores.argsort()[::-1][:30]
    top_terms = [(feature_names[i], scores[i]) for i in top_indices]
    weekly_topics[week] = top_terms



In [45]:
import json

json_ready = {
    week: [(term, float(score)) for term, score in topics]
    for week, topics in weekly_topics.items()
}

with open('weekly_30_topics_bigram_4.json', 'w') as f:
    json.dump(json_ready, f, indent=2)


In [46]:
# === PRINT TOP TOPICS PER WEEK ===
for week, topics in sorted(weekly_topics.items()):
    print(f"\nWeek {week}:")
    for term, score in topics[:30]:
        print(f"  {term} ({score:.4f})")



Week 2025-00:
  you re (0.0037)
  moderator fednews (0.0021)
  fednews question (0.0021)
  action moderator (0.0021)
  question concern (0.0021)
  travel comp (0.0018)
  year service (0.0016)
  private sector (0.0016)
  weight loss (0.0015)
  they re (0.0014)
  year ago (0.0013)
  health insurance (0.0013)
  per period (0.0013)
  credit hour (0.0012)
  made error (0.0012)
  fednews team (0.0011)
  post reply (0.0011)
  title provide (0.0011)
  title future (0.0011)
  provide title (0.0011)
  future post (0.0011)
  feel removal (0.0011)
  submission violates (0.0011)
  follow instruction (0.0011)
  team action (0.0011)
  unmonitored follow (0.0011)
  reply unmonitored (0.0011)
  violates rule (0.0011)
  removal made (0.0011)
  error fednews (0.0011)

Week 2025-01:
  you re (0.0033)
  private sector (0.0023)
  hiring freeze (0.0017)
  action moderator (0.0016)
  question concern (0.0016)
  moderator fednews (0.0016)
  fednews question (0.0016)
  they re (0.0014)
  year ago (0.0013)
  go