In [2]:
%pip install tqdm
import pandas as pd
from tqdm import tqdm

from CryptoFraudDetection.utils.enums import LoggerMode
from CryptoFraudDetection.utils.logger import Logger
from CryptoFraudDetection.elasticsearch.data_retrieval import search_data
from CryptoFraudDetection.elasticsearch.data_insertion import insert_dataframe

Collecting tqdm
  Using cached tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.0-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Query the `reddit_posts_2` index through the project's search_data helper.
# NOTE(review): size=1 presumably caps the response at a single hit, yet the
# recorded progress bar of the flattening cell further down shows 5193
# iterations -- the saved outputs appear to come from a run with a larger
# `size`. Confirm the intended value before relying on Restart & Run All.
response = search_data(index="reddit_posts_2", q="*", size=1)

In [12]:
# Peek at the top-level keys of the first hit in the response.
response["hits"]["hits"][0].keys()

dict_keys(['_index', '_id', '_score', '_ignored', '_source'])

In [15]:
# Display the `_source` payload of the first hit: the submission document
# with its replies nested under "comments".
response["hits"]["hits"][0]['_source']

{'author': 'writtey',
 'body': '',
 'comments': [{'author': 'Baecchus',
   'body': 'Average gas fee back in 2021',
   'comments': [{'author': 'partymsl',
     'body': 'Ah. The good old forced hodl days due to gas fees...',
     'comments': [{'author': 'itsaBazinga',
       'body': 'I always had the StarCraft we require more vespen gas voice line go through my head when doing a transaction back then.',
       'comments': [{'author': 'Atyzzze',
         'body': 'You must construct additional pylons! (stake at home! Contribute to true decentralization, unlike these large centralized and optimized for profit mining warehouses)',
         'comments': None,
         'created': '2024-10-08 23:12:21',
         'edited': None,
         'depth': 3,
         'downs': 0,
         'id': 'lr0jjil',
         'score': 6,
         'search_query': 'Ethereum ETH',
         'subreddit': 'r/CryptoCurrency',
         'ups': 6},
        {'author': 'chainer3000',
         'body': 'Glad I’m not alone lol',
   

In [3]:
# Recursive function to flatten comments with parent_id
def flatten_comments(comments, parent_id):
    """Flatten a nested comment tree into a flat list of row dicts.

    Rows are emitted in pre-order: each comment is followed immediately by
    its replies, before the next sibling.

    Args:
        comments: Iterable of comment dicts, each optionally holding its
            replies under the "comments" key. ``None`` or an empty list
            yields no rows.
        parent_id: Id recorded as "parent_id" on every comment at this
            level (the submission id for top-level comments, otherwise the
            id of the comment being replied to).

    Returns:
        A list of dicts with the keys id, parent_id, author, body, created,
        depth, edited, score, search_query and subreddit.

    Raises:
        KeyError: If a comment dict lacks one of the copied fields.
    """
    rows = []
    # The stored documents use None (not []) for "no replies", so treat a
    # None level as empty instead of failing on iteration.
    for comment in comments or []:
        # Extract comment details
        rows.append(
            {
                "id": comment["id"],
                "parent_id": parent_id,
                "author": comment["author"],
                "body": comment["body"],
                "created": comment["created"],
                "depth": comment["depth"],
                "edited": comment["edited"],
                "score": comment["score"],
                "search_query": comment["search_query"],
                "subreddit": comment["subreddit"],
            }
        )
        # If the comment has nested replies, process them recursively with
        # this comment as their parent.
        replies = comment.get("comments")
        if replies:
            rows.extend(flatten_comments(replies, parent_id=comment["id"]))
    return rows


# Function to flatten the entire JSON structure
def flatten_json(json_data):
    """Turn one submission document into a DataFrame of submission + comments.

    The first row describes the submission itself (``parent_id`` is None);
    the remaining rows are the comments returned by ``flatten_comments``,
    with top-level comments pointing at the submission id. The title, url
    and num_comments keys are only set on the submission row.

    Args:
        json_data: Submission dict holding the copied fields plus its
            comment tree under "comments".

    Returns:
        pandas.DataFrame with the submission row first, then the comments.
    """
    # Fields copied verbatim; the order here fixes the column order.
    shared_fields = (
        "author",
        "body",
        "created",
        "depth",
        "edited",
        "score",
        "search_query",
        "subreddit",
    )
    submission_only_fields = ("title", "url", "num_comments")

    post_id = json_data["id"]
    post_row = {"id": post_id, "parent_id": None}
    for field in shared_fields + submission_only_fields:
        post_row[field] = json_data[field]

    # Top-level comments hang directly off the submission.
    comment_rows = flatten_comments(json_data["comments"], parent_id=post_id)

    # Submission first, comments after, in a single frame.
    return pd.DataFrame([post_row, *comment_rows])

In [4]:
# Flatten each hit into its own frame first and concatenate once afterwards:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration, which makes the cell quadratic in the number of rows.
post_frames = [
    flatten_json(post["_source"]) for post in tqdm(response["hits"]["hits"])
]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the response has no hits. Row labels are deliberately left as
# produced by flatten_json (they restart at 0 for every post).
df = pd.concat(post_frames) if post_frames else pd.DataFrame()

# Switch columns to pandas' nullable dtypes where possible.
df = df.convert_dtypes()

100%|██████████| 5193/5193 [02:06<00:00, 41.08it/s] 


In [5]:
# Dimensions of the flattened frame as (rows, columns).
df.shape

(432655, 13)

In [6]:
# Preview the first rows of the flattened frame.
df.head()

Unnamed: 0,id,parent_id,author,body,created,depth,edited,score,search_query,subreddit,title,url,num_comments
0,1fz46jo,,writtey,,2024-10-08 16:36:10,-1,,257,Ethereum ETH,r/CryptoCurrency,This User Paid $700K for a Single Ethereum Tra...,https://cointab.com/user-paid-700k-ethereum-tr...,137.0
1,lqyulca,1fz46jo,Baecchus,Average gas fee back in 2021,2024-10-08 17:15:39,0,,208,Ethereum ETH,r/CryptoCurrency,,,
2,lqzv2yy,lqyulca,partymsl,Ah. The good old forced hodl days due to gas f...,2024-10-08 20:38:34,1,,39,Ethereum ETH,r/CryptoCurrency,,,
3,lr09t08,lqzv2yy,itsaBazinga,I always had the StarCraft we require more ves...,2024-10-08 22:06:13,2,,20,Ethereum ETH,r/CryptoCurrency,,,
4,lr0jjil,lr09t08,Atyzzze,You must construct additional pylons! (stake a...,2024-10-08 23:12:21,3,,6,Ethereum ETH,r/CryptoCurrency,,,


In [7]:
# Persist the flattened frame as Parquet; index=False keeps the row labels
# out of the written file.
df.to_parquet("../data/processed/reddit_posts_2.parquet", index=False)

In [10]:
# Project logger for this notebook at DEBUG level.
# NOTE(review): presumably writes its log files under ../logs -- confirm
# against the Logger implementation.
logger_ = Logger(
    name="reddit_unwrap_posts", level=LoggerMode.DEBUG, log_dir="../logs"
)

# Push the flattened frame to the `reddit_posts_unwrapped_2` index via the
# project's insert_dataframe helper. The return value is bound to `_` so the
# cell does not render it as output.
_ = insert_dataframe(logger=logger_, index="reddit_posts_unwrapped_2", df=df)