# Top 5 Subreddit Word Clouds

This notebook parses the `top_posts.csv.gz` file and creates one word cloud per subreddit, built from the post titles of each of the top 5 subreddits. The resulting images are stored as PNGs in the `images` folder.

In [1]:
import gzip
import json
from collections import defaultdict
from csv import DictReader
from pathlib import Path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
# Container for the parsed posts (one dict per CSV row); starts out empty
data = list()

In [3]:
# Open and store each post as a list of dict elements (keys come from the CSV header).
# `data` is rebuilt from scratch instead of appended to, so re-running this cell
# can never duplicate rows.
# newline='' hands line-ending handling to the csv module, which keeps quoted
# fields that contain embedded newlines intact.
# NOTE(review): the encoding is pinned to UTF-8 so decoding no longer depends on
# the machine's locale -- confirm the file really is UTF-8.
with gzip.open('../data/top_posts.csv.gz', 'rt', encoding='utf-8', newline='') as file:
    data = list(DictReader(file))

In [4]:
# Read the subreddit ranking from disk and keep only the names.
# The JSON object is pre-sorted, so the order of its keys is the ranking.
with open('../data/top_subreddits.json', 'r') as json_file:
    top_subreddits = [*json.load(json_file).keys()]

In [5]:
# Row count with thousands separators
print(f"Number of Data Points: {len(data):,}")

Number of Data Points: 246,472


In [6]:
# Column names, taken from the first parsed row (iterating a dict yields its keys)
print(f"Keys in each Data Point: {list(data[0])}")

Keys in each Data Point: ['unixtime', 'title', 'total_votes', 'reddit_id', 'number_of_upvotes', 'subreddit', 'number_of_downvotes', 'score', 'number_of_comments', 'username']


In [7]:
# Preview the first five entries of the pre-sorted subreddit list
print(f"Top 5 Subreddits: {top_subreddits[0:5]}")

Top 5 Subreddits: ['funny', 'AskReddit', 'gaming', 'aww', 'Music']


In [8]:
# Only the first five entries of the pre-sorted subreddit list are analysed
top_5_subreddits = top_subreddits[:5]
# Maps subreddit name -> all of its post titles as one string (unseen keys start as "")
top_5_wordclouds = defaultdict(str)

In [9]:
# Gather the titles per subreddit in lists and join them once at the end.
# Appending to a growing string for every post re-copies the accumulated text
# each time, which becomes quadratic for large subreddits.
titles_by_subreddit = defaultdict(list)

for datum in data:
    # Edge Case: Skip posts that are not in the top 5 subreddits
    if datum['subreddit'] not in top_5_subreddits:
        continue

    titles_by_subreddit[datum['subreddit']].append(datum['title'])

# Assign (rather than +=) so that re-running this cell does not double the text.
# Every title ends up followed by a single space, the same layout as before.
for subreddit, titles in titles_by_subreddit.items():
    top_5_wordclouds[subreddit] = " ".join(titles) + " "

In [10]:
# Generate word clouds for each of the top 5 subreddits and save them as PNGs
IMAGES_DIR = Path("../images")
WORDCLOUD_SEED = 42  # fixed seed so the word layout is identical on every run

# savefig does not create missing folders, so make sure the target exists first
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

for subreddit, text in top_5_wordclouds.items():
    file_path = IMAGES_DIR / f"wordcloud_{subreddit}.png"

    # Generate a word cloud image: black words on a transparent background
    # (background_color=None together with mode="RGBA")
    wordcloud = WordCloud(
        color_func=lambda *args, **kwargs: "black",
        background_color=None,
        min_word_length=2,
        height=400,
        width=800,
        mode="RGBA",
        scale=2,
        random_state=WORDCLOUD_SEED
    ).generate(text)

    # Draw on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", so each iteration owns exactly one figure
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    fig.savefig(
        file_path,
        transparent=True,
        dpi=300,
        bbox_inches='tight'
    )
    # Release the figure so memory does not pile up across iterations
    plt.close(fig)