# Cleaning the dataset

In [None]:
import csv

def extract_subreddit_names(input_csv, output_csv):
    """Copy the first column of ``input_csv`` into ``output_csv``.

    Every non-blank row of the input (including a header row, if there is
    one) produces one single-cell row in the output, in the same order.
    Blank lines, which ``csv.reader`` yields as empty lists, are skipped
    instead of raising ``IndexError``.

    Args:
        input_csv: Path of the CSV file to read (UTF-8).
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
    """
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile, \
            open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        # Keep only the first column (subreddit names); ``if row`` drops blank lines.
        writer.writerows([row[0]] for row in csv.reader(infile) if row)

# Usage
# Reads SubReddit-time.csv and writes its first column to SubReddit-list.csv.
# Both paths are hard-coded for one machine and must be adjusted before running.
input_csv = '/Users/ElevenyCHEN/Desktop/Mod_Datasets/SubReddit-time.csv'  # Replace with the path to your existing CSV file
output_csv = '/Users/ElevenyCHEN/Desktop/Mod_Datasets/SubReddit-list.csv'   # Replace with the path where you want the new CSV file to be saved
extract_subreddit_names(input_csv, output_csv)


# Counting posting users

In [None]:
import zstandard as zstd
import json
import csv
import os
import threading
from datetime import datetime, timedelta
import time


# Number of bytes requested per read from the decompressed stream in
# process_file (10 * 1024 * 1024 = 10 MiB).
size = 1024 * 1024 * 10

# Generate file paths
def generate_file_paths(start_year, start_month, end_year, end_month):
    """Build the list of monthly dump paths, walking backwards in time.

    The walk starts at ``start_year``/``start_month`` and steps one month
    back at a time until ``end_year``/``end_month`` (inclusive).  The start
    must therefore be the *later* month; if it is earlier than the end, the
    result is an empty list.

    Args:
        start_year: Year of the most recent month to include.
        start_month: Month (1-12) of the most recent month to include.
        end_year: Year of the oldest month to include.
        end_month: Month (1-12) of the oldest month to include.

    Returns:
        A list of paths of the form ``D:\\Eleveny\\RS_YYYY-MM.zst``, newest
        month first.
    """
    current = datetime(start_year, start_month, 1)
    end = datetime(end_year, end_month, 1)
    paths = []
    while current >= end:
        # Raw f-string: the backslashes are literal path separators, not
        # (invalid) escape sequences such as ``\E`` or ``\R``.
        paths.append(rf'D:\Eleveny\RS_{current:%Y-%m}.zst')  # Adjust the path as needed
        # The day before the 1st is in the previous month; snap back to its 1st.
        current = (current - timedelta(days=1)).replace(day=1)
    return paths


# Module-level lock used by process_file to guard its updates to the
# subreddit_authors dict that all worker threads share.
lock = threading.Lock()

# Worker function to process a single .zst file
def process_file(file_path, subreddit_authors, time_mark):
    """Collect the distinct authors per subreddit from one zstd-compressed dump.

    The file is decompressed as a stream and treated as newline-delimited
    JSON.  For every JSON object with truthy ``"subreddit"`` and ``"author"``
    values, the author ends up in ``subreddit_authors[subreddit][time_mark]``
    (a set).  Subreddits that are not yet keys of ``subreddit_authors`` are
    added.  Lines that are not valid JSON, or are not JSON objects, are
    skipped.

    Args:
        file_path: Path of the ``.zst`` file to read.
        subreddit_authors: Shared dict ``{subreddit: {time_mark: set(authors)}}``
            updated in place while holding the module-level ``lock``.
        time_mark: Key under which this file's authors are stored.
    """
    print(f"Starting processing file: {file_path} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    line_count = 0
    update_interval = 1000000  # Update the progress every 1,000,000 lines

    # Authors are gathered locally and merged into the shared dict once at
    # the end, so the lock is taken once per file instead of once per line.
    local_authors = {}

    with open(file_path, 'rb') as compressed:
        # NOTE(review): the 2**31-byte window limit is presumably needed
        # because the dumps were compressed with a large window -- confirm.
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        with dctx.stream_reader(compressed) as reader:
            # Buffer raw bytes and decode per line: decoding each chunk on
            # its own would drop any multi-byte UTF-8 character that happens
            # to straddle a chunk boundary.
            pending = b''
            at_eof = False
            while not at_eof:
                chunk = reader.read(size)
                if chunk:
                    raw_lines = (pending + chunk).split(b'\n')
                    # The last piece may be an incomplete line; keep it for
                    # the next chunk.
                    pending = raw_lines.pop()
                else:
                    # End of stream: whatever is left is the final line,
                    # even if the file does not end with a newline.
                    at_eof = True
                    raw_lines = [pending]

                for raw_line in raw_lines:
                    if not raw_line:
                        continue
                    line_count += 1
                    try:
                        json_obj = json.loads(raw_line.decode('utf-8', errors='ignore'))
                    except json.JSONDecodeError:
                        json_obj = None

                    # Only JSON objects carry the fields we need.
                    if isinstance(json_obj, dict):
                        subreddit = json_obj.get("subreddit")
                        author = json_obj.get("author")
                        if subreddit and author:
                            local_authors.setdefault(subreddit, set()).add(author)

                    # Periodic progress update
                    if line_count % update_interval == 0:
                        print(f"Processing file: {file_path}, lines processed: {line_count}, time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Merge this file's results into the shared structure.
    with lock:
        for subreddit, authors in local_authors.items():
            by_time_mark = subreddit_authors.setdefault(subreddit, {})
            if time_mark in by_time_mark:
                by_time_mark[time_mark].update(authors)
            else:
                by_time_mark[time_mark] = authors

    print(f"Finished processing file: {file_path} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")



# Main function to process subreddits
def _time_mark_from_path(file_path):
    """Return the ``YYYY-MM`` label of a dump path (``.../RS_2016-05.zst`` -> ``2016-05``)."""
    # The generated paths use backslashes, so accept both separators.
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    stem = file_name.split('.')[0]
    return stem[len('RS_'):] if stem.startswith('RS_') else stem


def process_subreddits(subreddits, start_year, start_month, end_year, end_month, output_csv):
    """Count distinct posting users per subreddit per month and write a CSV.

    One thread per monthly dump file runs ``process_file``; all threads fill
    the same ``{subreddit: {month: set(authors)}}`` dict.  When every thread
    has finished, one row per subreddit is written with the number of
    distinct authors for each month (0 when a subreddit has no entry for a
    month).

    Args:
        subreddits: Iterable of subreddit names used to pre-populate the
            result dict, so they appear in the output even with no posts.
        start_year, start_month, end_year, end_month: Month range, passed
            unchanged to ``generate_file_paths``.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
    """
    file_paths = generate_file_paths(start_year, start_month, end_year, end_month)
    time_marks = [_time_mark_from_path(file_path) for file_path in file_paths]  # Format: YYYY-MM
    subreddit_authors = {subreddit: {} for subreddit in subreddits}

    # NOTE: one thread is started per file, with no upper bound.
    threads = []
    for file_path, time_mark in zip(file_paths, time_marks):
        thread = threading.Thread(target=process_file, args=(file_path, subreddit_authors, time_mark))
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    # Write the output CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        # Header: month columns in ascending order.
        months = sorted(time_marks)
        writer.writerow(["subreddit"] + months)

        # One row per subreddit: number of distinct authors in each month.
        for subreddit, data in subreddit_authors.items():
            writer.writerow([subreddit] + [len(data.get(month, ())) for month in months])

# Read the subreddit list and process the subreddits
subreddit_list_csv = r'D:\Eleveny\SubReddit-list.csv'  # Adjust the path as needed
with open(subreddit_list_csv, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header (the default avoids StopIteration on an empty file)
    # csv.reader yields blank lines as empty lists; skip them instead of
    # failing on row[0].
    subreddits = [row[0] for row in reader if row]

# Process the subreddits and write to a new CSV
# Months are walked backwards: from 2016-05 down to 2016-01.
output_csv = r'D:\Eleveny\subreddit_posting_users.csv'  # Adjust the path as needed
process_subreddits(subreddits, 2016, 5, 2016, 1, output_csv)


Version that does not exclude subreddits with 0 posting users

In [None]:
import csv

def read_and_rank_subreddits(input_csv):
    """Rank subreddits by posting users and cut at 95% / 90% cumulative share.

    The first row of ``input_csv`` is treated as a header.  For every other
    non-blank row, column 0 is the subreddit name and column 1 is parsed as
    its integer posting-user count; any further columns are ignored.
    Subreddits are sorted by count, largest first, and a running total is
    kept: a subreddit belongs to the 95% (90%) list while the running total
    divided by the grand total is at most 0.95 (0.90).

    Args:
        input_csv: Path of the CSV file to read (UTF-8).

    Returns:
        ``(top_95_percent, top_90_percent)``, two lists of
        ``(subreddit, posting_users)`` tuples.  Both are empty when the file
        has no data rows or the counts add up to 0.

    Raises:
        ValueError: If a count in column 1 is not an integer.
    """
    subreddits = []
    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header; tolerate a completely empty file

        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            subreddits.append((row[0], int(row[1])))  # Reading the 'posting_users' count

    # Sort subreddits based on 'posting_users' count
    subreddits.sort(key=lambda x: x[1], reverse=True)

    top_95_percent = []
    top_90_percent = []

    total = sum(posting_users for _, posting_users in subreddits)
    if total == 0:
        # No shares can be computed; avoid dividing by zero.
        return top_95_percent, top_90_percent

    # Calculate cumulative distribution
    cumulative = 0
    for subreddit, posting_users in subreddits:
        cumulative += posting_users
        percentage = cumulative / total
        if percentage <= 0.95:
            top_95_percent.append((subreddit, posting_users))
        if percentage <= 0.90:
            top_90_percent.append((subreddit, posting_users))

    return top_95_percent, top_90_percent

def write_to_csv(subreddits, output_csv):
    """Write ``(subreddit, posting_users)`` pairs to a two-column CSV file.

    Args:
        subreddits: Iterable of 2-item ``(subreddit, posting_users)`` entries.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
    """
    with open(output_csv, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        # Header row first, then one row per pair in the order given.
        csv_out.writerow(["subreddit", "posting_users"])
        csv_out.writerows([name, count] for name, count in subreddits)

# Usage
# Ranks the subreddits found in subreddit_posting_users.csv and splits them
# into the 95% and 90% cumulative-share lists.
input_csv = r'D:\Eleveny\subreddit_posting_users.csv'  # Adjust the path as needed
top_95_percent, top_90_percent = read_and_rank_subreddits(input_csv)

# Write to separate CSV files
# NOTE: the "Excluding 0 version" cell writes to these same two paths, so
# whichever cell runs last determines the files' contents.
output_csv_95 = r'D:\Eleveny\top_95_percent_subreddits.csv'
output_csv_90 = r'D:\Eleveny\top_90_percent_subreddits.csv'
write_to_csv(top_95_percent, output_csv_95)
write_to_csv(top_90_percent, output_csv_90)

Version that excludes subreddits with 0 posting users

In [None]:
import csv

def read_and_rank_subreddits(input_csv):
    """Rank subreddits with at least one posting user and cut at 95% / 90%.

    The first row of ``input_csv`` is treated as a header.  For every other
    non-blank row, column 0 is the subreddit name and column 1 is parsed as
    its integer posting-user count; rows whose count is not positive are
    dropped.  The remaining subreddits are sorted by count, largest first,
    and a running total is kept: a subreddit belongs to the 95% (90%) list
    while the running total divided by the grand total is at most 0.95
    (0.90).

    Args:
        input_csv: Path of the CSV file to read (UTF-8).

    Returns:
        ``(top_95_percent, top_90_percent)``, two lists of
        ``(subreddit, posting_users)`` tuples.  Both are empty when no row
        has a positive count.

    Raises:
        ValueError: If a count in column 1 is not an integer.
    """
    subreddits = []
    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header; tolerate a completely empty file

        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            subreddit, posting_users = row[0], int(row[1])
            if posting_users > 0:  # Exclude subreddits with 0 posting users
                subreddits.append((subreddit, posting_users))

    # Sort subreddits based on 'posting_users' count
    subreddits.sort(key=lambda x: x[1], reverse=True)

    top_95_percent = []
    top_90_percent = []
    if not subreddits:
        # Nothing left after filtering: the total would be 0, so return
        # early rather than divide by it.
        return top_95_percent, top_90_percent

    # Every remaining count is positive, so no extra filter is needed here.
    total = sum(posting_users for _, posting_users in subreddits)

    # Calculate cumulative distribution
    cumulative = 0
    for subreddit, posting_users in subreddits:
        cumulative += posting_users
        percentage = cumulative / total
        if percentage <= 0.95:
            top_95_percent.append((subreddit, posting_users))
        if percentage <= 0.90:
            top_90_percent.append((subreddit, posting_users))

    return top_95_percent, top_90_percent

def write_to_csv(subreddits, output_csv):
    """Save ranked subreddits as a CSV with a ``subreddit,posting_users`` header.

    Args:
        subreddits: Iterable of 2-item ``(subreddit, posting_users)`` entries.
        output_csv: Destination path; the file is created or overwritten (UTF-8).
    """
    column_names = ["subreddit", "posting_users"]
    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(column_names)  # Header
        for entry in subreddits:
            name, users = entry
            out.writerow([name, users])

# Usage
# Ranks the subreddits with at least one posting user found in
# subreddit_posting_users.csv and splits them into the 95% and 90% lists.
input_csv = r'D:\Eleveny\subreddit_posting_users.csv'  # Adjust the path as needed
top_95_percent, top_90_percent = read_and_rank_subreddits(input_csv)

# Write to separate CSV files
# NOTE: the "Not excluding 0 version" cell writes to these same two paths, so
# whichever cell runs last determines the files' contents.
output_csv_95 = r'D:\Eleveny\top_95_percent_subreddits.csv'
output_csv_90 = r'D:\Eleveny\top_90_percent_subreddits.csv'
write_to_csv(top_95_percent, output_csv_95)
write_to_csv(top_90_percent, output_csv_90)

Plotting

In [None]:
!pip install matplotlib

In [None]:
import csv
import matplotlib.pyplot as plt

def read_and_rank_subreddits(input_csv):
    """Load subreddits with at least one posting user, largest count first.

    The first row of ``input_csv`` is treated as a header.  For every other
    non-blank row, column 0 is the subreddit name and column 1 is parsed as
    its integer posting-user count; rows whose count is not positive are
    dropped.

    Args:
        input_csv: Path of the CSV file to read (UTF-8).

    Returns:
        A list of ``(subreddit, posting_users)`` tuples sorted by count in
        descending order; ties keep their order from the file.

    Raises:
        ValueError: If a count in column 1 is not an integer.
    """
    subreddits = []
    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header; tolerate a completely empty file

        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            subreddit, posting_users = row[0], int(row[1])
            if posting_users > 0:  # Exclude subreddits with 0 posting users
                subreddits.append((subreddit, posting_users))

    subreddits.sort(key=lambda x: x[1], reverse=True)
    return subreddits

def plot_distribution(subreddits):
    """Display a bar chart with one bar per subreddit, in the order given.

    Args:
        subreddits: Sequence of 2-item ``(subreddit, posting_users)`` entries;
            only the counts are plotted, at x positions 0, 1, 2, ...
    """
    # Bar heights: the posting-user count of each entry.
    heights = [users for _name, users in subreddits]
    positions = range(len(heights))

    # Draw and label the figure, then show it.
    plt.figure(figsize=(10, 6))
    plt.bar(positions, heights, color='blue')
    plt.xlabel('Subreddits')
    plt.ylabel('Posting Users Count')
    plt.title('Distribution of Posting Users per Subreddit')
    plt.show()

# Usage
# Loads the subreddits with at least one posting user, sorted by count
# (largest first), from subreddit_posting_users.csv.
input_csv = r'D:\Eleveny\subreddit_posting_users.csv'  # Adjust the path as needed
subreddits = read_and_rank_subreddits(input_csv)

# Print the number of subreddits and plot distribution
print(f"Total number of subreddits: {len(subreddits)}")
plot_distribution(subreddits)