In [1]:
import pandas as pd
from tqdm import tqdm
from CryptoFraudDetection.elasticsearch.data_retrieval import search_data

In [2]:
# Query the "reddit_posts_2" index for up to 6000 documents.
# NOTE(review): q="*" presumably matches every document — confirm against search_data.
# The result is read below via response["hits"]["hits"], one entry per post.
response = search_data(index="reddit_posts_2", q="*", size=6000)

In [3]:
# Depth-first flattening of a nested comment tree into flat row dicts
def flatten_comments(comments, parent_id):
    """Flatten a nested comment tree into a list of row dicts.

    Comments are emitted in pre-order: each comment is followed immediately
    by all of its nested replies before the next sibling is visited.

    Args:
        comments: Iterable of comment mappings. Each must provide the keys
            ``id``, ``author``, ``body``, ``created``, ``depth``, ``downs``,
            ``edited``, ``score``, ``subreddit`` and ``ups``; nested replies
            live under the optional ``comments`` key.
        parent_id: Identifier stored as ``parent_id`` on every comment at
            this level (replies get the id of the comment they hang under).

    Returns:
        A list with one dict per comment, holding ``id``, ``parent_id`` and
        the copied fields listed above. Any other keys are dropped.

    Raises:
        KeyError: If a comment mapping lacks one of the required keys.
    """
    # Fields copied verbatim from each comment, in output column order.
    copied_fields = (
        "author",
        "body",
        "created",
        "depth",
        "downs",
        "edited",
        "score",
        "subreddit",
        "ups",
    )

    flat = []
    for node in comments:
        record = {"id": node["id"], "parent_id": parent_id}
        record.update({field: node[field] for field in copied_fields})
        flat.append(record)

        # A missing or empty "comments" entry means there is nothing to descend into.
        replies = node.get("comments")
        if replies:
            flat += flatten_comments(replies, parent_id=node["id"])
    return flat


# Flatten one submission document (post plus its comment tree) into a DataFrame
def flatten_json(json_data):
    """Turn a submission and its nested comments into one DataFrame.

    Args:
        json_data: Mapping describing a submission. It must provide ``id``,
            ``author``, ``body``, ``created``, ``depth``, ``downs``,
            ``edited``, ``score``, ``subreddit``, ``ups``, the
            submission-only keys ``title``, ``url`` and ``num_comments``,
            and ``comments`` holding the (possibly nested) comment list.

    Returns:
        A ``pandas.DataFrame`` whose first row is the submission (with
        ``parent_id`` set to ``None``), followed by one row per comment as
        produced by ``flatten_comments``. Comment rows carry no value for
        the submission-only columns.

    Raises:
        KeyError: If ``json_data`` lacks one of the required keys.
    """
    # Fields the submission shares with comment rows, in output column order.
    shared_fields = (
        "author",
        "body",
        "created",
        "depth",
        "downs",
        "edited",
        "score",
        "subreddit",
        "ups",
    )
    # Fields that exist on the submission only.
    submission_only_fields = ("title", "url", "num_comments")

    post_row = {"id": json_data["id"], "parent_id": None}
    for field in shared_fields + submission_only_fields:
        post_row[field] = json_data[field]

    # Top-level comments point back at the submission through parent_id.
    comment_rows = flatten_comments(json_data["comments"], parent_id=json_data["id"])

    # Submission first, then every comment, as a single table.
    return pd.DataFrame([post_row, *comment_rows])

In [4]:
# Flatten every hit first and concatenate once at the end. Calling pd.concat
# inside the loop copies all previously accumulated rows on every iteration,
# which makes the total cost grow quadratically with the number of posts.
post_frames = [
    flatten_json(post["_source"]) for post in tqdm(response["hits"]["hits"])
]

# pd.concat raises ValueError when given an empty list, so fall back to an
# empty frame when the search returned no hits.
df = pd.concat(post_frames) if post_frames else pd.DataFrame()

100%|██████████| 5193/5193 [01:53<00:00, 45.90it/s] 


In [5]:
# Drop the vote-count columns, then let pandas pick nullable dtypes for the
# remaining columns. errors="ignore" keeps this cell re-runnable: without it a
# second execution raises KeyError because "ups"/"downs" are already gone.
df = df.drop(columns=["ups", "downs"], errors="ignore").convert_dtypes()

In [6]:
# Sanity check: show (rows, columns) of the combined frame as the cell output.
df.shape

(432655, 12)

In [7]:
# Persist the flattened posts and comments as Parquet; index=False leaves the
# DataFrame index out of the written file.
df.to_parquet("../data/processed/reddit_posts_2.parquet", index=False)