## Part 0: Setup

In [None]:

# Import Packages
import pyarrow
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
assert np.__version__ == "2.1.0", f"Expected numpy version 2.1.0, but got {np.__version__}"

from autoviz import AutoViz_Class
%matplotlib inline

# Check if running in Google Colab
def is_colab():
    """Report whether the active IPython shell is the Google Colab shell.

    Returns:
        bool: True when the class of the object returned by ``get_ipython()``
        is defined in the ``google.colab._shell`` module, False otherwise.
    """
    from IPython import get_ipython

    shell = get_ipython()
    shell_module = shell.__class__.__module__
    return shell_module == "google.colab._shell"

## Part 1: Read Collected Reddit Data

In [None]:
# Load the collected submission data from its parquet file.
import os

SUBMISSION_PARQUET_PATH = './data/wallstreetbets-collection-wss.parquet'

# Fail fast with a clear message instead of printing and then letting
# pd.read_parquet fail on the missing file a few lines later.
if not os.path.exists(SUBMISSION_PARQUET_PATH):
    raise FileNotFoundError(f"Error: The file {SUBMISSION_PARQUET_PATH} does not exist.")

# Explicit pyarrow schema (column name -> type) handed to the parquet reader.
submission_schema = pyarrow.schema([
    ('title', pyarrow.string()),
    ('created_utc', pyarrow.float64()),  # converted later with unit='s' in visualize_data
    ('id', pyarrow.string()),
    ('is_original_content', pyarrow.bool_()),
    ('link_flair_text', pyarrow.string()),
    ('locked', pyarrow.bool_()),
    ('name', pyarrow.string()),
    ('num_comments', pyarrow.int64()),
    ('over_18', pyarrow.bool_()),
    ('permalink', pyarrow.string()),
    ('selftext', pyarrow.string()),
    ('spoiler', pyarrow.bool_()),
    ('upvote_ratio', pyarrow.float64()),
    ('ss_neg', pyarrow.float64()),
    ('ss_neu', pyarrow.float64()),
    ('ss_pos', pyarrow.float64()),
    ('ss_compound', pyarrow.float64())
])

# Read the parquet file into a pandas DataFrame using the schema.
submission_collection = pd.read_parquet(SUBMISSION_PARQUET_PATH, engine='pyarrow', schema=submission_schema)

# Uncomment to preview the first few rows of the DataFrame.
# display(submission_collection.head())

In [None]:
# Load the collected comment data from its parquet file.
COMMENT_PARQUET_PATH = './data/wallstreetbets-comment-collection-wss.parquet'

# Same fail-fast check as for the submission file, so a missing input is
# reported with an explicit message before the reader is invoked.
# `os` is imported in the submission-loading cell above.
if not os.path.exists(COMMENT_PARQUET_PATH):
    raise FileNotFoundError(f"Error: The file {COMMENT_PARQUET_PATH} does not exist.")

# Explicit pyarrow schema (column name -> type) handed to the parquet reader.
comment_schema = pyarrow.schema([
    ('parent_post_id', pyarrow.string()),
    ('parent_comment_id', pyarrow.string()),
    ('comment_id', pyarrow.string()),
    ('author', pyarrow.string()),
    ('created_utc', pyarrow.float64()),  # converted later with unit='s' in visualize_data
    ('score', pyarrow.int64()),
    ('body', pyarrow.string()),
    ('ss_neg', pyarrow.float64()),
    ('ss_neu', pyarrow.float64()),
    ('ss_pos', pyarrow.float64()),
    ('ss_compound', pyarrow.float64())
])

# Read the parquet file into a pandas DataFrame using the schema.
comment_collection = pd.read_parquet(COMMENT_PARQUET_PATH, engine='pyarrow', schema=comment_schema)

# Uncomment to preview the first few rows of the DataFrame.
# display(comment_collection.head())

## Part 2: Initial Analysis

In [None]:
def visualize_data(collection, save_path=None):
    """Run AutoViz on the complete rows of a DataFrame and return the plotted frame.

    Args:
        collection: pandas DataFrame to visualize. It is not modified.
        save_path: Value forwarded to AutoViz as ``save_plot_dir`` (charts are
            requested in PNG format). Defaults to None.

    Returns:
        The DataFrame handed to AutoViz: ``collection`` without any row that
        contains a missing value and, when a ``created_utc`` column is
        present, with that column converted to calendar dates. Returns None
        when the input is empty, when no complete row remains, or when
        AutoViz raises ``AttributeError``.
    """
    if collection.empty:
        print("Error: The input DataFrame is empty.")
        return None

    # dropna() returns a new frame, so the caller's data is left untouched.
    df_av = collection.dropna()

    if df_av.empty:
        print("Error: The processed DataFrame is empty after dropping rows with NaN values.")
        return None

    # Only build the AutoViz object once there is something to plot.
    AV = AutoViz_Class()

    try:
        if 'created_utc' in df_av.columns:
            # Interpret the values as epoch seconds and keep only the date part.
            df_av = df_av.assign(
                created_utc=pd.to_datetime(df_av['created_utc'], unit='s').dt.date
            )
        AV.AutoViz(filename="",
                   dfte=df_av,
                   verbose=2,
                   lowess=True,
                   chart_format="png",
                   save_plot_dir=save_path)
        plt.show()
        plt.close()
    except AttributeError as e:
        # Best-effort: report the failure and signal it with None instead of
        # aborting the notebook run.
        print(f"AutoViz encountered an error: {e}")
        return None

    return df_av

In [None]:
# Keep the frames that were actually plotted (None when a run produced no plot)
# under explicit names. Binding the results also stops the notebook from
# echoing the last returned DataFrame as incidental cell output.
submission_viz = visualize_data(submission_collection, save_path='./plots/submission')
comment_viz = visualize_data(comment_collection, save_path='./plots/comment')

## Part 3: Custom Analysis