In [1]:
import pandas as pd
import json
import numpy as np
from pathlib import Path
from wordcloud import WordCloud

In [18]:
def _to_json_safe(value):
    """
    Recursively convert a metadata value into a strict-JSON-compatible object.

    Nested dicts are converted value by value, NumPy scalars become native
    Python scalars, and missing / non-finite numbers (NaN, inf, pd.NA) become
    None so they are written as JSON ``null`` rather than the non-standard
    ``NaN`` / ``Infinity`` tokens.
    """
    if isinstance(value, dict):
        return {key: _to_json_safe(item) for key, item in value.items()}
    if value is None or value is pd.NA:
        return None
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    if isinstance(value, (int, np.integer)):
        return int(value)
    if isinstance(value, (float, np.floating)):
        return float(value) if np.isfinite(value) else None
    return value


def process_subreddit(master_df: pd.DataFrame, name: str) -> dict:
    """
    Process a subreddit's data from the master DataFrame.
    Handles missing values and data inconsistencies.

    Side effects (all written under ``data/processed/<name>/``, which is
    created if needed):
      * ``metadata.json``           - the metadata dict, with missing numbers
                                      stored as ``null``
      * ``wordcloud_left.png`` /
        ``wordcloud_right.png``     - one per predicted label that has text
      * ``sentiment_timeline.csv``  - mean sentiment per calendar date, only
                                      when a ``created_utc`` column exists

    Failures while saving metadata, rendering a word cloud or building the
    timeline are printed and do not abort the function.

    Args:
        master_df: Frame containing at least the columns ``subreddit``,
            ``text_sentiment``, ``toxic_word_count``, ``left_wing``,
            ``righy_wing`` (sic - this is the column name the code reads),
            ``predicted_label``, ``upvotes`` and ``preprocessed_text``.
            It is not modified.
        name: Value of the ``subreddit`` column to select.

    Returns:
        The metadata dict. Statistics that cannot be computed (e.g. for a
        subreddit with no rows) are NaN in the returned dict.

    Raises:
        KeyError: If one of the columns needed for the metadata is missing.
    """
    # Filter and clean data; .copy() keeps the coercions below (and the added
    # "date" column) away from master_df.
    subreddit_comments = master_df[master_df["subreddit"] == name].copy()

    # Coerce to numeric: empty strings and unparseable values become NaN, and
    # numeric strings become numbers, so the comparisons and means below work
    # even when a column was read as text.
    for col in ['text_sentiment', 'toxic_word_count', 'left_wing', 'righy_wing']:
        subreddit_comments[col] = pd.to_numeric(subreddit_comments[col], errors="coerce")

    # Create output directory
    output_dir = Path(f"data/processed/{name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ratios are taken over rows that actually have a value. Comparing NaN is
    # always False, so including missing rows would deflate every share and
    # make positive + neutral + negative sum to less than 1.
    scored_sentiment = subreddit_comments["text_sentiment"].dropna()
    known_toxicity = subreddit_comments["toxic_word_count"].dropna()

    # --- Generate Metadata ---
    metadata = {
        # Sentiment analysis (missing values excluded)
        "sentiment_avg": subreddit_comments["text_sentiment"].mean(skipna=True),
        "sentiment_distribution": {
            "positive": (scored_sentiment > 0.3).mean(),
            "neutral": scored_sentiment.between(-0.3, 0.3).mean(),
            "negative": (scored_sentiment < -0.3).mean()
        },

        # Toxicity analysis (missing toxic_word_count excluded)
        "toxicity_avg": subreddit_comments["toxic_word_count"].mean(skipna=True),
        "toxic_comments_ratio": (known_toxicity > 0).mean(),

        # Political lean. "right" is read from the column spelled 'righy_wing'.
        "political_lean": {
            "left": subreddit_comments["left_wing"].mean(skipna=True),
            "right": subreddit_comments["righy_wing"].mean(skipna=True),
            # Share of all comments carrying each predicted label
            "predicted_left": (subreddit_comments["predicted_label"] == "Left").mean(),
            "predicted_right": (subreddit_comments["predicted_label"] == "Right").mean()
        },

        # Engagement metrics
        "avg_upvotes": subreddit_comments["upvotes"].mean(),
        "total_comments": len(subreddit_comments)
    }

    metadata_path = output_dir / "metadata.json"
    print("Saving to: ", metadata_path)

    # Save metadata. Serialize before opening the file so a serialization
    # failure cannot leave a truncated metadata.json behind.
    try:
        payload = json.dumps(_to_json_safe(metadata), indent=2, allow_nan=False)
        with open(metadata_path, "w") as f:
            f.write(payload)
    except (TypeError, ValueError) as e:
        print("Failed to serialize metadata: ", e)
    except Exception as e:
        print("Error saving metadata: ", e)

    # --- Generate Word Clouds ---
    wc_params = {
        "width": 1200,
        "height": 800,
        "background_color": "white",
        "max_words": 200,
        "collocations": False
    }

    # Generate for left/right based on predicted_label
    for lean in ["Left", "Right"]:
        try:
            lean_comments = subreddit_comments[
                subreddit_comments["predicted_label"] == lean
            ]
            if lean_comments.empty:
                continue
            text = " ".join(lean_comments["preprocessed_text"].dropna().astype(str))
            # Nothing to draw when every comment's text is missing or blank.
            if not text.strip():
                continue
            WordCloud(**wc_params).generate(text).to_file(
                output_dir / f"wordcloud_{lean.lower()}.png"
            )
        except Exception as e:
            print(f"Failed to generate {lean} wordcloud for r/{name}: {str(e)}")

    # --- Generate Sentiment Timeline ---
    if "created_utc" in subreddit_comments.columns:
        try:
            # created_utc is interpreted as seconds since the epoch (unit='s').
            subreddit_comments["date"] = pd.to_datetime(
                subreddit_comments["created_utc"],
                unit='s'
            ).dt.date
            timeline = subreddit_comments.groupby("date")["text_sentiment"].mean()
            timeline.to_csv(output_dir / "sentiment_timeline.csv")
        except Exception as e:
            print(f"Failed to generate timeline for r/{name}: {str(e)}")

    return metadata

In [19]:
# Read the combined comment dataset that every per-subreddit slice is drawn from.
master_df = pd.read_csv(
    filepath_or_buffer="utils/complete_final_toxicity_classifier.csv"
)

# Subreddits to export, in processing order.
subreddit_list = [
    "news",
    "worldnews",
    "politics",
    "democrats",
    "conservative",
    "canada",
]

# Write the processed artefacts for each subreddit in turn; the returned
# metadata is not kept because process_subreddit already saves it to disk.
for subreddit in subreddit_list:
    print(f"Processing r/{subreddit}")
    process_subreddit(master_df, subreddit)

Processing r/news
Saving to:  data/processed/news/metadata.json
Processing r/worldnews
Saving to:  data/processed/worldnews/metadata.json
Processing r/politics
Saving to:  data/processed/politics/metadata.json
Processing r/democrats
Saving to:  data/processed/democrats/metadata.json
Processing r/conservative
Saving to:  data/processed/conservative/metadata.json
Processing r/canada
Saving to:  data/processed/canada/metadata.json


In [17]:
# Show the kernel's working directory; the relative "utils/" and "data/" paths
# used in this notebook resolve against it. `os` is not imported in the
# notebook's import cell, so use the already-imported Path; str() keeps the
# plain-string output that os.getcwd() gave.
str(Path.cwd())

'/Users/cassandra/Documents/GitHub/reddit-dashboard'