In [None]:
import json

comments = []
decoder = json.JSONDecoder()

# JSON text is UTF-8; name the encoding explicitly so that decoding does not
# depend on the platform's default (comment bodies may contain non-ASCII text).
with open("reddit_comments_dec_2024.json", "r", encoding="utf-8") as f:
    data = f.read()  # Read the entire file as a string

# Decode every top-level JSON value stored back-to-back in the file.
pos = 0
while pos < len(data):
    # raw_decode() does not skip leading whitespace, so step over separators
    # (spaces, newlines) between consecutive values ourselves.
    while pos < len(data) and data[pos] in " \t\r\n":
        pos += 1
    if pos >= len(data):
        break
    try:
        obj, index = decoder.raw_decode(data, pos)  # Decode one JSON value
    except json.JSONDecodeError:
        break  # Best effort: stop at the first region that cannot be decoded
    comments.append(obj)
    pos = index  # Continue right after the value just decoded

if not comments:
    # Fail with a clear message instead of an IndexError on comments[0] below.
    raise ValueError(
        "no JSON value could be decoded from reddit_comments_dec_2024.json"
    )

# Keep only the first top-level value; the cells below iterate over it as the
# collection of comment records.
comments = comments[0]

In [None]:
import random

# Fixed seed so a fresh kernel always produces the same train/test split.
SPLIT_SEED = 42

# One class per distinct author. The list is sorted because the iteration order
# of a set of strings changes between interpreter sessions (hash randomization),
# which would silently renumber the labels derived from this list.
# key=str keeps the ordering well-defined even if an author value is not a string.
classes = sorted({comment["author"] for comment in comments}, key=str)
# print(len(classes))

# get test and train data (first 80% of the shuffled comments / remaining 20%)
# NOTE: this reorders `comments` in place; a private Random instance is used so
# the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(comments)
split_at = int(len(comments) * 0.8)
train_data = comments[:split_at]
test_data = comments[split_at:]

In [None]:
# Peek at the schema: show which fields the first comment record carries.
sample_comment = comments[0]
print(sample_comment.keys())

In [None]:
# Report the size of each split: training count first, test count on the next line.
print(len(train_data), len(test_data), sep="\n")

In [None]:
# Largest number of whitespace-separated words in any training comment.
# The comment with the most characters is not necessarily the one with the most
# words, so the word counts are compared directly (no sort by character length).
max_comment_length = max(
    (len(comment["body"].split()) for comment in train_data),
    default=0,  # an empty training split yields 0 instead of an IndexError
)
print(max_comment_length)

In [None]:
# Distinct subreddit names across all comments; only their count is displayed.
subs = list({comment["subreddit"] for comment in comments})
print(len(subs))

# Lookup table from each entry of `classes` to its position in that list.
class_indices = dict(zip(classes, range(len(classes))))

In [None]:
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

# max_features caps the vocabulary size, which bounds the width of the TF-IDF
# part of every feature vector.
vectorizer = TfidfVectorizer(max_features=2000)  # You can limit the features

# Fit on the training split only: learning the vocabulary and IDF weights from
# test comments as well would leak test-set information into the features.
comment_bodies = [comment["body"] for comment in train_data]

vectorizer.fit(comment_bodies)

# Save the fitted vectorizer next to the notebook as vectorizer.pkl.
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [None]:
# Spot-check the fitted vectorizer: transform a short phrase and print the sum
# of the resulting TF-IDF weights.
probe_matrix = vectorizer.transform(["the world and"])
v = probe_matrix.toarray().flatten()
print(sum(v))

In [None]:
import hashlib

# Number of hash buckets used to encode the subreddit name.
NUM_SUB_FEATURES = 1024


def feature(comment):
    """Build the flat feature list for one comment record.

    Layout of the returned list:
      * a leading constant ``1`` (presumably a bias/intercept term),
      * the ``created_utc``, ``score`` and ``ups`` fields, unmodified,
      * ``NUM_SUB_FEATURES`` slots, all 0 except the slot chosen by hashing the
        subreddit name, which holds ``1`` for an odd slot index and ``-1`` for
        an even one,
      * the TF-IDF weights of ``body`` produced by the module-level
        ``vectorizer``.

    Args:
        comment: mapping with the keys ``created_utc``, ``score``, ``ups``,
            ``subreddit`` and ``body``.

    Returns:
        A plain Python list containing the values described above.
    """
    head = [1, comment["created_utc"], comment["score"], comment["ups"]]

    # SHA-1 of the subreddit name, read as a big-endian integer, selects the slot.
    digest = hashlib.sha1(comment["subreddit"].encode()).digest()
    bucket = int.from_bytes(digest, "big") % NUM_SUB_FEATURES
    subreddit_slots = [0] * NUM_SUB_FEATURES
    subreddit_slots[bucket] = -1 if bucket % 2 == 0 else 1

    # transform() takes a batch, so wrap the single body and flatten the one row.
    body_weights = vectorizer.transform([comment["body"]]).toarray().ravel()

    return head + subreddit_slots + body_weights.tolist()

In [None]:
import pickle

# Turn every training comment into its feature list, then save the result
# to X_train.pkl.
X_train = list(map(feature, train_data))
with open("X_train.pkl", "wb") as out_file:
    pickle.dump(X_train, out_file)

In [None]:
import pickle

# Label of each training comment: the class index of its author, in the same
# order as train_data. Saved to y_train.pkl.
train_authors = (comment["author"] for comment in train_data)
y_train = [class_indices[author] for author in train_authors]
with open("y_train.pkl", "wb") as out_file:
    pickle.dump(y_train, out_file)

In [None]:
import pickle

# Turn every held-out comment into its feature list, then save the result
# to X_test.pkl.
X_test = list(map(feature, test_data))
with open("X_test.pkl", "wb") as out_file:
    pickle.dump(X_test, out_file)

In [None]:
import pickle

# Label of each held-out comment: the class index of its author, in the same
# order as test_data. Saved to y_test.pkl.
test_authors = (comment["author"] for comment in test_data)
y_test = [class_indices[author] for author in test_authors]
with open("y_test.pkl", "wb") as out_file:
    pickle.dump(y_test, out_file)