In [1]:
import zstandard as zstd
import json
import pandas as pd
import io

In [2]:
# read a single zst file
def read_zst_to_json(file_path):
    """
    Read a zst compressed file and return a list of JSON objects.
    
    Args:
        file_path (str): Path to .zst file
        
    Returns:
        list: List of parsed JSON objects
    """
    data = []
    print(f"Processing {file_path}...")
    
    try:
        with open(file_path, 'rb') as fh:
            dctx = zstd.ZstdDecompressor()
            with dctx.stream_reader(fh) as reader:
                text_stream = io.TextIOWrapper(reader, encoding='utf-8')
                for line_count, line in enumerate(text_stream, 1):
                    try:
                        json_obj = json.loads(line.strip())
                        data.append(json_obj)
                        
                        # Print progress every 1000 records
                        if len(data) % 1000 == 0:
                            print(f"Processed {len(data)} records...")
                            
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON at line {line_count}: {e}")
                        continue
                        
        print(f"Total records processed: {len(data)}")
        return data
        
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()
        return []

In [3]:
# First check if file exists
file_path = "../data/Input/Reddit/submissions/hysterectomy_submissions.zst"
import os
print(f"File exists: {os.path.exists(file_path)}")
if os.path.exists(file_path):
    print(f"File size: {os.path.getsize(file_path)} bytes")

# Read the data
submissions_data = read_zst_to_json(file_path)

# Check the result
print("\nFinal check:")
print(f"Length of data: {len(submissions_data)}")
if len(submissions_data) > 0:
    print("First record:", json.dumps(submissions_data[0], indent=2))

File exists: True
File size: 4054982 bytes
Processing ../data/Input/Reddit/submissions/hysterectomy_submissions.zst...
Processed 1000 records...
Processed 2000 records...
Processed 3000 records...
Processed 4000 records...
Processed 5000 records...
Processed 6000 records...
Processed 7000 records...
Processed 8000 records...
Processed 9000 records...
Processed 10000 records...
Processed 11000 records...
Total records processed: 11510

Final check:
Length of data: 11510
First record: {
  "link_flair_text": null,
  "permalink": "/r/hysterectomy/comments/26z4vk/changes_to_sexual_functioning/",
  "selftext": "If you've had a hysterectomy (removal of uterus, not ovaries), can you please tell whether or  not you've had changes in your sexual functioning, libido, and ability to have orgasms?",
  "media": null,
  "ups": 1,
  "retrieved_on": 1441331902,
  "created_utc": "1401563576",
  "author_flair_text": null,
  "media_embed": {},
  "subreddit": "hysterectomy",
  "domain": "self.hysterectomy"

In [4]:
# Look at the first record
if submissions_data:
    print(json.dumps(submissions_data[0], indent=2))

{
  "link_flair_text": null,
  "permalink": "/r/hysterectomy/comments/26z4vk/changes_to_sexual_functioning/",
  "selftext": "If you've had a hysterectomy (removal of uterus, not ovaries), can you please tell whether or  not you've had changes in your sexual functioning, libido, and ability to have orgasms?",
  "media": null,
  "ups": 1,
  "retrieved_on": 1441331902,
  "created_utc": "1401563576",
  "author_flair_text": null,
  "media_embed": {},
  "subreddit": "hysterectomy",
  "domain": "self.hysterectomy",
  "hide_score": false,
  "downs": 0,
  "thumbnail": "self",
  "distinguished": null,
  "title": "Changes to sexual functioning",
  "secure_media_embed": {},
  "from_id": null,
  "secure_media": null,
  "url": "http://www.reddit.com/r/hysterectomy/comments/26z4vk/changes_to_sexual_functioning/",
  "created": 1401567176,
  "subreddit_id": "t5_2x8iq",
  "from_kind": null,
  "stickied": false,
  "score": 1,
  "edited": false,
  "quarantine": false,
  "num_comments": 0,
  "author_flair_

In [5]:
# number of records
print(len(submissions_data))


11510


In [6]:
# Sample a few random titles
import random
print("\nRandom sample of 5 titles:")
for item in random.sample(submissions_data, min(5, len(submissions_data))):
    print(f"- {item['title']}")


Random sample of 5 titles:
- Pre-op next week - questions to bring to discuss?
- 3 Days post of total laproscopic hysterectomy :)
- Bath zomg bath
- H-day in Two-ish Days!
- In pre-op and waiting to go back.


In [7]:
# What defines a unique record?
# Check if all submissions have unique IDs
ids = [item['id'] for item in submissions_data]
unique_ids = len(set(ids))
print(f"\nTotal submissions: {len(submissions_data)}")
print(f"Unique submission IDs: {unique_ids}")


Total submissions: 11510
Unique submission IDs: 11510


In [8]:
# Convert timestamps to integers and handle any potential type mismatches
timestamps = []
for item in submissions_data:
    try:
        timestamp = int(item['created_utc'])  # Convert to integer
        timestamps.append(timestamp)
    except (ValueError, TypeError) as e:
        print(f"Error with timestamp: {item['created_utc']} - {e}")
        continue

# Now process the timestamps
from datetime import datetime
if timestamps:  # Make sure we have valid timestamps
    earliest = datetime.fromtimestamp(min(timestamps))
    latest = datetime.fromtimestamp(max(timestamps))
    print(f"\nDate range of submissions:")
    print(f"Earliest: {earliest}")
    print(f"Latest: {latest}")


Date range of submissions:
Earliest: 2014-05-31 12:12:56
Latest: 2022-12-31 15:59:13


In [None]:
# turn into a pandas dataframe
submissions_df = pd.DataFrame(submissions_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11510 entries, 0 to 11509
Columns: 126 entries, link_flair_text to call_to_action
dtypes: bool(3), float64(37), int64(3), object(83)
memory usage: 10.8+ MB


link_flair_text    11510
permalink              0
selftext               0
media              11478
ups                11478
                   ...  
media_metadata     11414
poll_data          11489
gallery_data       11428
is_gallery         11414
call_to_action     11310
Length: 126, dtype: int64

In [15]:
# look at the first few rows
submissions_df.head()

Unnamed: 0,link_flair_text,permalink,selftext,media,ups,retrieved_on,created_utc,author_flair_text,media_embed,subreddit,...,upvote_ratio,is_created_from_ads_ui,retrieved_utc,top_awarded_type,url_overridden_by_dest,media_metadata,poll_data,gallery_data,is_gallery,call_to_action
0,,/r/hysterectomy/comments/26z4vk/changes_to_sex...,If you've had a hysterectomy (removal of uteru...,,1.0,1441332000.0,1401563576,,{},hysterectomy,...,,,,,,,,,,
1,,/r/hysterectomy/comments/3mjfz7/spotting/,"Ladies, I have a dumb question for you. I had...",,1.0,1450769000.0,1443323627,,{},hysterectomy,...,,,,,,,,,,
2,,/r/hysterectomy/comments/49c2k3/hysterectomy/,,,1.0,1463452000.0,1457346839,,{},hysterectomy,...,,,,,,,,,,
3,,/r/hysterectomy/comments/4bd4dn/girl_talk_time...,,,1.0,1463487000.0,1458579592,,{},hysterectomy,...,,,,,,,,,,
4,,/r/hysterectomy/comments/52fcak/4_months_post_...,"I had a hysterectomy about 4 months ago, and I...",,1.0,1476412000.0,1473699325,,{},hysterectomy,...,,,,,,,,,,


In [16]:
submissions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11510 entries, 0 to 11509
Columns: 126 entries, link_flair_text to call_to_action
dtypes: bool(3), float64(37), int64(3), object(83)
memory usage: 10.8+ MB


In [17]:
submissions_df.isnull().sum()

link_flair_text    11510
permalink              0
selftext               0
media              11478
ups                11478
                   ...  
media_metadata     11414
poll_data          11489
gallery_data       11428
is_gallery         11414
call_to_action     11310
Length: 126, dtype: int64

In [19]:
# only keep necessary columns
# Create a DataFrame with only the columns we need
essential_cols = ['id', 'name', 'created_utc', 'title']
submissions_subset = submissions_df[essential_cols]

# Convert created_utc to datetime for easier analysis
submissions_subset['created_utc'] = pd.to_datetime(submissions_subset['created_utc'], unit='s')

# Display info about the DataFrame
print("DataFrame Info:")
submissions_subset.info()

# Show first few rows
print("\nFirst few rows:")
print(submissions_subset.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11510 entries, 0 to 11509
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           11510 non-null  object        
 1   name         10202 non-null  object        
 2   created_utc  11510 non-null  datetime64[ns]
 3   title        11510 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 359.8+ KB

First few rows:
       id       name         created_utc  \
0  26z4vk  t3_26z4vk 2014-05-31 19:12:56   
1  3mjfz7  t3_3mjfz7 2015-09-27 03:13:47   
2  49c2k3  t3_49c2k3 2016-03-07 10:33:59   
3  4bd4dn  t3_4bd4dn 2016-03-21 16:59:52   
4  52fcak  t3_52fcak 2016-09-12 16:55:25   

                                          title  
0                 Changes to sexual functioning  
1                                     Spotting?  
2                                  Hysterectomy  
3                  Girl Talk Time: Hyst

  submissions_subset['created_utc'] = pd.to_datetime(submissions_subset['created_utc'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_subset['created_utc'] = pd.to_datetime(submissions_subset['created_utc'], unit='s')


In [24]:
# Filter for titles containing "Essure"
essure_submissions = submissions_subset[submissions_subset['title'].str.contains('Essure', case=True, na=False)]

# Display info about the filtered DataFrame
print("Number of submissions about Essure:", len(essure_submissions))

# Show the titles
print("\nTitles containing 'Essure':")
print(essure_submissions[['created_utc', 'title']].to_string())

Number of submissions about Essure: 3

Titles containing 'Essure':
             created_utc                                                                                    title
1208 2020-03-03 20:53:51  Robotic LAVH today and feeling surprisingly good! With SURPRISE traveling Essure coils!
2007 2020-09-26 20:03:21                                                           Essure removal = hysterectomy?
9682 2022-09-28 15:30:23                                                                         Essure procedure


In [20]:
# comments data frame

file_path = "../data/Input/Reddit/comments/hysterectomy_comments.zst"
comments_data = read_zst_to_json(file_path)

Processing ../data/Input/Reddit/comments/hysterectomy_comments.zst...
Processed 1000 records...
Processed 2000 records...
Processed 3000 records...
Processed 4000 records...
Processed 5000 records...
Processed 6000 records...
Processed 7000 records...
Processed 8000 records...
Processed 9000 records...
Processed 10000 records...
Processed 11000 records...
Processed 12000 records...
Processed 13000 records...
Processed 14000 records...
Processed 15000 records...
Processed 16000 records...
Processed 17000 records...
Processed 18000 records...
Processed 19000 records...
Processed 20000 records...
Processed 21000 records...
Processed 22000 records...
Processed 23000 records...
Processed 24000 records...
Processed 25000 records...
Processed 26000 records...
Processed 27000 records...
Processed 28000 records...
Processed 29000 records...
Processed 30000 records...
Processed 31000 records...
Processed 32000 records...
Processed 33000 records...
Processed 34000 records...
Processed 35000 recor

In [21]:
# Check the first comment to see its structure
if comments_data:
    print("First comment structure:")
    print(json.dumps(comments_data[0], indent=2))
    
    print("\nTotal number of comments:")
    print(len(comments_data))

First comment structure:
{
  "retrieved_on": 1444604587,
  "gilded": 0,
  "ups": 1,
  "score": 1,
  "distinguished": null,
  "author_flair_css_class": null,
  "author": "lasercat13",
  "subreddit": "hysterectomy",
  "created_utc": "1443325744",
  "edited": false,
  "id": "cvfhc36",
  "body": "I would call your Ob/Gyn and ask to make sure. \r\rI had a hysterectomy back in April this year, and had everything removed except my ovaries. So far I haven't had a hint of spotting. And I've felt better than I have in 14 years. I'm 35, and this was the last step in a long line of treatments. \r\rI hope you get some answers and peace of mind! ",
  "link_id": "t3_3mjfz7",
  "author_flair_text": null,
  "controversiality": 0,
  "subreddit_id": "t5_2x8iq",
  "parent_id": "t3_3mjfz7"
}

Total number of comments:
144908


In [23]:
comments_subset = pd.DataFrame(comments_data)[['id', 'subreddit', 'created_utc', 'body','link_id', 'parent_id']]

In [25]:
# Filter comments containing "Essure"
essure_comments = comments_subset[comments_subset['body'].str.contains('Essure', case=True, na=False)]

# Display info about the filtered DataFrame
print("Number of comments about Essure:", len(essure_comments))

# Show sample of the comments
print("\nSample of comments containing 'Essure':")
print("\nTimestamp | Comment Text")
print("-" * 80)
for _, row in essure_comments.iterrows():
    # Truncate long comments for display
    comment_preview = row['body'][:200] + "..." if len(row['body']) > 200 else row['body']
    print(f"{row['created_utc']} | {comment_preview}")
    print("-" * 80)

# Basic statistics
print(f"\nTotal number of Essure-related comments: {len(essure_comments)}")
print(f"Number of unique posts these comments appear in: {essure_comments['link_id'].nunique()}")

Number of comments about Essure: 37

Sample of comments containing 'Essure':

Timestamp | Comment Text
--------------------------------------------------------------------------------
1498973050 | So - I started by having Essure. 

Then I changed docs. My new doc, I told her I'd like a hyst for the following reasons:

- I am sterile. My uterus is useless now.

- I don't want children, hence I h...
--------------------------------------------------------------------------------
1499272739 | I didn't even find someone who was willing to do the Essure until I was 26. I had the hyst this past January, just before my 32nd birthday. 

My advice would be, yes, go into the reasons you don't wan...
--------------------------------------------------------------------------------
1583421099 | I was told 10 but my discharge paperwork said 5. I had some microsurgery on bowels though as my Essure coils had migrated from my tubes into uterus and perforated bowel along the way. We did NOT know ...
---