Check how often a comment's `link_id` differs from its `parent_id`.

In [1]:
import json

# Count how many comments have link_id == parent_id versus link_id != parent_id.
# Comments missing either field are reported and excluded from both counts.
match_count = 0
mismatch_count = 0

# The input is JSON Lines (one JSON object per line), so stream it line by line
# instead of building a list of every parsed record in memory first.
with open('2_months_data/filtered_comments.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line at EOF)

        entry = json.loads(line)
        link_id = entry.get('link_id')
        parent_id = entry.get('parent_id')

        if link_id is None or parent_id is None:
            print(f"Skipping entry (missing fields): {entry.get('id', 'unknown')}")
        elif link_id == parent_id:
            match_count += 1
        else:
            mismatch_count += 1

# Print results
print(f"Number of matches (link_id == parent_id): {match_count}")
print(f"Number of mismatches (link_id != parent_id): {mismatch_count}")
print(f"Total entries processed: {match_count + mismatch_count}")

Number of matches (link_id == parent_id): 319658
Number of mismatches (link_id != parent_id): 495489
Total entries processed: 815147


In [3]:
import json

# Classify every comment by the prefix of its parent_id and count each class.
top_level = 0  # parent_id starts with 't3_' (reply to post)
nested = 0     # parent_id starts with 't1_' (reply to comment)

# Stream the JSON Lines file rather than loading every record into a list.
with open('2_months_data/filtered_comments.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line at EOF)

        entry = json.loads(line)
        parent_id = entry.get('parent_id', '')

        # .get() only substitutes '' when the key is absent; a JSON null or a
        # numeric value would make .startswith() raise AttributeError, so
        # non-string values are routed to the "unknown format" report instead.
        is_text = isinstance(parent_id, str)

        if is_text and parent_id.startswith('t3_'):
            top_level += 1
        elif is_text and parent_id.startswith('t1_'):
            nested += 1
        else:
            print(f"Unknown parent_id format: {parent_id}")

print(f"Top-level comments (reply to post): {top_level}")
print(f"Nested comments (reply to another comment): {nested}")

Top-level comments (reply to post): 319658
Nested comments (reply to another comment): 495489


In [6]:
import json

# Verify that every comment in filtered_comments_matched.jsonl points at a post
# that exists in filtered_posts.jsonl.
#
# NOTE: filtered_comments_matched.jsonl is written by the filtering cell that
# appears further down in this notebook (In [5]); that cell must have been run
# before this one, otherwise the open() below fails or reads stale data.

# Step 1: Load all post IDs from filtered_posts.jsonl
post_ids = set()

with open('2_months_data/filtered_posts.jsonl', 'r', encoding='utf-8') as posts_file:
    for line in posts_file:
        try:
            post = json.loads(line)
            post_ids.add(post['id'])  # Raw ID (without 't3_')
        except json.JSONDecodeError:
            print(f"Skipping malformed line in posts file: {line[:100]}...")
        except KeyError:
            print(f"Post missing 'id' field: {line[:100]}...")

print(f"Loaded {len(post_ids)} unique post IDs from posts file")

# Step 2: Analyze comments against known posts
matched_comments = 0
unmatched_comments = 0

with open('2_months_data/filtered_comments_matched.jsonl', 'r', encoding='utf-8') as comments_file:
    for line in comments_file:
        try:
            comment = json.loads(line)
            link_id = comment['link_id']

            # Extract post ID by removing 't3_' prefix
            post_id_from_comment = link_id[3:] if link_id.startswith('t3_') else link_id

            if post_id_from_comment in post_ids:
                matched_comments += 1
            else:
                unmatched_comments += 1

        except json.JSONDecodeError:
            print(f"Skipping malformed line in comments file: {line[:100]}...")
        except KeyError:
            print(f"Comment missing 'link_id' field: {line[:100]}...")

# Results
total_comments = matched_comments + unmatched_comments

print("\nResults:")
print(f"Comments matching known posts: {matched_comments}")
print(f"Comments from unknown posts: {unmatched_comments}")
if total_comments:
    print(f"Percentage matched: {(matched_comments/total_comments)*100:.2f}%")
else:
    # An empty (or entirely malformed) comments file would otherwise raise
    # ZeroDivisionError on the percentage calculation.
    print("Percentage matched: n/a (no comments processed)")

Loaded 31602 unique post IDs from posts file

Results:
Comments matching known posts: 812565
Comments from unknown posts: 0
Percentage matched: 100.00%


In [5]:
import json

# Keep only the comments whose link_id refers to a post present in
# filtered_posts.jsonl, writing them unchanged to filtered_comments_matched.jsonl.

# Step 1: Load all post IDs from filtered_posts.jsonl
post_ids = set()

with open('2_months_data/filtered_posts.jsonl', 'r', encoding='utf-8') as posts_file:
    for line in posts_file:
        try:
            post = json.loads(line)
            post_ids.add(post['id'])  # Raw ID (without 't3_')
        except json.JSONDecodeError:
            print(f"Skipping malformed line in posts file: {line[:100]}...")
        except KeyError:
            # A post record without an 'id' cannot be matched; report it
            # instead of aborting the whole cell.
            print(f"Post missing 'id' field: {line[:100]}...")

print(f"Loaded {len(post_ids)} unique post IDs")

# Step 2: Filter comments and write to new file
kept_comments = 0
dropped_comments = 0

with (open('2_months_data/filtered_comments.jsonl', 'r', encoding='utf-8') as comments_file,
      open('2_months_data/filtered_comments_matched.jsonl', 'w', encoding='utf-8') as output_file):

    for line in comments_file:
        try:
            comment = json.loads(line)
            link_id = comment['link_id']
            post_id_from_comment = link_id[3:] if link_id.startswith('t3_') else link_id

            if post_id_from_comment in post_ids:
                output_file.write(line)  # Keep original line to preserve formatting
                kept_comments += 1
            else:
                dropped_comments += 1

        except json.JSONDecodeError:
            print(f"Skipping malformed line: {line[:100]}...")
        except KeyError:
            # Such a comment is neither kept nor counted as dropped; it is only
            # reported, and the run continues with a partial output file intact.
            print(f"Comment missing 'link_id' field: {line[:100]}...")

# Results
print("\nFiltering results:")
print(f"Comments kept: {kept_comments}")
print(f"Comments dropped: {dropped_comments}")
print("New filtered file: 'filtered_comments_matched.jsonl'")

Loaded 31602 unique post IDs

Filtering results:
Comments kept: 812565
Comments dropped: 2582
New filtered file: 'filtered_comments_matched.jsonl'


In [7]:
import json

# Scan the comments file for the first link_id that lacks the 't3_' prefix.
# has_non_t3 becomes True as soon as one is found and stays False otherwise.
has_non_t3 = False

with open('2_months_data/filtered_comments.jsonl', 'r') as f:
    for raw_line in f:
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            # Lines that do not parse as JSON are ignored by this check.
            continue

        link_value = record.get('link_id', '')
        if link_value.startswith('t3_'):
            continue

        # First offender found: report it and stop scanning.
        has_non_t3 = True
        print(f"Found non-t3 link_id: {link_value} in comment {record.get('id')}")
        break

if has_non_t3:
    print("Warning: Found comments with non-t3 link_id prefixes")
else:
    print("All link_id values start with 't3_' - safe to proceed")

All link_id values start with 't3_' - safe to proceed


In [8]:
# Write a copy of the comments file in which every link_id has its leading
# 't3_' removed. This depends on has_non_t3 from the prefix-check cell: the
# unconditional [3:] slice below is only correct when every link_id starts
# with 't3_', so nothing is written when that check found a counter-example.
if not has_non_t3:
    skipped_lines = 0

    with (open('2_months_data/filtered_comments.jsonl', 'r', encoding='utf-8') as infile,
          open('2_months_data/filtered_comments_cleanlink.jsonl', 'w', encoding='utf-8') as outfile):

        for line in infile:
            try:
                comment = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are left out of the output; count them so the
                # omission is reported rather than silent.
                skipped_lines += 1
                continue

            # Remove t3_ prefix while maintaining all other fields
            comment['link_id'] = comment['link_id'][3:]
            outfile.write(json.dumps(comment) + '\n')

    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed line(s); they are absent from the new file")
    print("Created new file with cleaned link_ids: filtered_comments_cleanlink.jsonl")
else:
    print("Aborting cleanup due to non-t3 link_ids found")

Created new file with cleaned link_ids: filtered_comments_cleanlink.jsonl
