In [5]:
# DATA EXPLORATION
import pandas as pd 

# use ijson parser for large json file
import ijson 
from collections import Counter

 # file with just text/author/subreddit/etc (no /s markers)
comments = "comments.json"

# counting sarcastic & not (data not available in this file)
sarcastic_count = 0
non_sarcastic_count = 0

# coutning top authors & subreddits
authors = []
subreddits = []

# json file is structured as a dictionary - go through key value pairs
with open(comments, "rb") as f:
    for key, value in ijson.kvitems(f, ""):
        # check if comment has sarcasm indicator
        text = value.get("text", "")
        if "/s" in text:
            sarcastic_count += 1
        else:
            non_sarcastic_count += 1

        authors.append(value.get("author"))
        subreddits.append(value.get("subreddit"))

print("Sarcastic comments:", sarcastic_count) # 0
print("Non-sarcastic comments:", non_sarcastic_count) # 12,704,751

# authors
author_counts = Counter(authors).most_common(10)
print("Top 10 authors:", author_counts)

# subreddits
subreddit_counts = Counter(subreddits).most_common(10)
print("Top 10 subreddits:", subreddit_counts)

Sarcastic comments: 0
Non-sarcastic comments: 12704751
Top 10 authors: [('[deleted]', 297775), ('mirandaBBfan', 2226), ('timewaitsforsome', 1747), ('bulnreinhart', 1735), ('Gundam336', 1727), ('Tiffosi', 1274), ('HeyDontSlip', 1262), ('Based06', 1243), ('GoSomaliPirates', 1155), ('dbauer0706', 1127)]
Top 10 subreddits: [('AskReddit', 2448608), ('pics', 659980), ('worldnews', 572368), ('politics', 526758), ('videos', 383311), ('funny', 374652), ('news', 365579), ('todayilearned', 316996), ('gaming', 306898), ('pcmasterrace', 306737)]


In [6]:
test = "test-balanced.csv"
train = "train-balanced.csv"
import csv

sarcastic_count = 0
non_sarcastic_count = 0
authors = []
subreddits = []

with open("train-balanced.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        text = row.get("text", "")
        if "/s" in text:
            sarcastic_count += 1
        else:
            non_sarcastic_count += 1

        authors.append(row.get("author"))
        subreddits.append(row.get("subreddit"))

print("Sarcastic comments:", sarcastic_count)
print("Non-sarcastic comments:", non_sarcastic_count)

Sarcastic comments: 0
Non-sarcastic comments: 128540


In [8]:
#parse train file 
import json
train_rows = []

with open("train-balanced.csv", "r") as f:
    for line in f:
        line = line.strip()

        # Split into 3 parts using '|'
        part_id, part_codes, part_labels = line.split("|")

        # Split codes and labels by space
        codes = part_codes.split()
        labels = part_labels.split()

        train_rows.append({
            "id": part_id,
            "code1": codes[0],
            "code2": codes[1],
            "label1": int(labels[0]),
            "label2": int(labels[1])
        })

train_df = pd.DataFrame(train_rows)
print(train_df.head)

comments = []
with open("comments.json", "r") as f:
    for line in f:
        comments.append(json.loads(line))

comments_df = pd.DataFrame(comments)

<bound method NDFrame.head of                     id    code1    code2  label1  label2
0                7uaac  c07fd66  c07fjge       1       0
1                7u896  c07f3md  c07f3ls       1       0
2                7visa  c07jcz3  c07it38       0       1
3                7vq9q  c07jfvv  c07jy05       1       0
4                7xdys  c07o37s  c07o350       1       0
...                ...      ...      ...     ...     ...
128536  62oyi7 dfo7koy  dfoaxnm  dfocp4d       1       0
128537          62oypx  dfoplb5  dfo8536       0       1
128538  62p1zh dfoiibk  dfoks1e  dfojn53       1       0
128539          62p22t  dfp3fdl  dfog1nz       1       0
128540          62p5z8  dfor4e3  dfohaos       1       0

[128541 rows x 5 columns]>


In [9]:
records = []

for comment_id in comments_df.columns:
    entry = comments_df.iloc[0][comment_id]   # extract the dict
    entry["id"] = comment_id                  # add the ID into the dict
    records.append(entry)

comments_normalized = pd.DataFrame(records)
print(comments_normalized.head())

                                                text     author  score  ups  \
0  Upvote For Simultaneous "Million Person" March...  [deleted]     48  104   
1                      Economics (29654 subscribers)       pfft     14   14   
2  Children in the Czech Republic are happier and...  [deleted]     29   48   
3  Of course it's a "less of a country", those pe...  joe24pack      1    1   
4  Here we go again: Israeli PM vows 'sharp respo...  [deleted]     14   23   

   downs     date  created_utc  subreddit       id  
0     56  2009-02   1233540251  Economics    7u4r6  
1      0  2009-02   1233549003  Economics  c07ewjj  
2     19  2009-02   1233533923  worldnews    7u4a5  
3      0  2009-02   1233553378  worldnews  c07ey0j  
4      9  2009-02   1233502066  worldnews    7u1ht  


In [10]:
comments_normalized.to_csv("data/clean.csv")