Data loading

In [10]:
import json

# Parse a file holding several JSON values written one after another
def read_and_fix_json(file_path):
    """Load a file of concatenated JSON values and return them as a list.

    The input is expected to look like ``{...}{...}{...}``: several top-level
    JSON values written back to back, which is not a valid JSON document on
    its own. Values may also be separated by whitespace, newlines or commas.

    The text is decoded value by value with ``json.JSONDecoder.raw_decode``
    instead of being patched with a textual ``'}{' -> '},{'`` replacement,
    because that replacement also rewrites any ``}{`` that happens to occur
    inside a string value and therefore silently alters the data.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8.

    Returns:
        list: The decoded top-level values in file order. An empty or
        whitespace-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not a sequence of valid
            JSON values.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = f.read()

    decoder = json.JSONDecoder()
    records = []
    pos = 0
    end = len(raw_data)
    while True:
        # raw_decode does not accept leading whitespace, so skip it here
        # together with optional comma separators between values.
        while pos < end and (raw_data[pos].isspace() or raw_data[pos] == ','):
            pos += 1
        if pos >= end:
            break
        value, pos = decoder.raw_decode(raw_data, pos)
        records.append(value)
    return records

# Load the two comment dumps (each is repaired into a list while reading)
data1 = read_and_fix_json('data/top_100_cleaned.json')
data2 = read_and_fix_json('data/100_300_cleaned.json')

# Merge both dumps into one list, first file's records first
combined_data = [*data1, *data2]



In [None]:
from collections import Counter

# Project every record down to its 'author' and 'body' fields
filtered_data = [
    {field: record[field] for field in ('author', 'body')}
    for record in combined_data
]

# Tally how many comments each author has in the projected data
author_counts = Counter(map(lambda entry: entry['author'], filtered_data))

# Drop every comment whose author has more than 20,000 comments
filtered_data = [
    entry
    for entry in filtered_data
    if not author_counts[entry['author']] > 20000
]



In [14]:
import pandas as pd

# Tabulate the filtered records: one row per comment
df_filtered = pd.DataFrame(data=filtered_data)

# Persist the table as a pretty-printed JSON array of row objects
df_filtered.to_json(
    path_or_buf='data/filtered_data.json',
    orient='records',
    indent=4,
)

In [12]:
# Serialise the filtered records to disk as indented JSON
with open('data/reddit_comments_dec_2024.json', 'w') as f:
    json.dump(obj=filtered_data, fp=f, indent=4)

In [13]:
# Read the saved comment file back into memory
with open('data/reddit_comments_dec_2024.json', mode='r') as f:
    data = json.loads(f.read())

# Sanity check: show the first three records
print(data[0:3])

[{'author': 'hive-protect', 'body': 'Hello u/Ok_Bodybuilder_4302,\n\nWe recommend posting only in this subreddit, as we ensure posts are not manipulated with upvotes. This gives you a better chance of receiving a referral.\n\nWe want to help you avoid wasting time on subreddits where posts are manipulated, which might prevent you from getting the referrals you deserve.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/teslareferralcode) if you have any questions or concerns.*', 'created_utc': 1733011204.0, 'score': 1, 'subreddit': 'teslareferralcode', 'subreddit_id': 't5_3ixnx', 'ups': 1}, {'author': 'BroMandi', 'body': 'Deal link: [Walmart](https://www.walmart.com/ip/LG-65-Class-4K-UHD-OLED-Web-OS-24-Smart-120-Hz-TV-with-Dolby-Vision-OLED65C4PUA/5193228935) ([Check price on Amazon](https://www.amazon.com/gp/goldbox?&linkCode=ll2&tag=bestbuzzy-20))\n\n---\n\nBeach Camera is back with another sale on th

In [31]:
# Comment texts are the model inputs; author names are the labels.
# Both lists follow the order of `data`, so index i refers to the same record.
training_data = list(map(lambda record: record['body'], data))
training_labels = list(map(lambda record: record['author'], data))

# Sanity check: show the first three texts and their labels
print(training_data[0:3])
print(training_labels[0:3])

['u/madthumbz is the first to solve this drawing!', 'Pixelary is a new pixel-based drawing and guessing game built on [Reddit\'s developer platform](https://developers.reddit.com). To play, press the "Guess" button to submit a guess or "Draw" button to create your own drawing. [Submit feedback](https://www.reddit.com/r/Pixelary/comments/1f578ps/hello_pixelary_community/).', 'Hello u/Ok_Bodybuilder_4302,\n\nWe recommend posting only in this subreddit, as we ensure posts are not manipulated with upvotes. This gives you a better chance of receiving a referral.\n\nWe want to help you avoid wasting time on subreddits where posts are manipulated, which might prevent you from getting the referrals you deserve.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/teslareferralcode) if you have any questions or concerns.*']
['pixelary-game', 'pixelary-game', 'hive-protect']


In [32]:
from collections import Counter
import re
from collections import defaultdict


# A comment text repeated more often than this is treated as bot boilerplate.
# (The previous comment mentioned 50, but the code has always used 100.)
DUPLICATE_COMMENT_THRESHOLD = 100

# Count how many times each exact comment text occurs
comment_counts = Counter(training_data)

# Authors who posted at least one comment whose text exceeds the threshold.
# The list holds one entry per offending comment, so authors may repeat.
bot_authors = [
    author
    for author, comment in zip(training_labels, training_data)
    if comment_counts[comment] > DUPLICATE_COMMENT_THRESHOLD
]
print(len(set(bot_authors)))


# Authors that remain after removing the flagged ones
unique_filtered_authors = set(training_labels) - set(bot_authors)

# Filter texts and labels together in a single pass so the two lists
# cannot drift out of alignment.
kept_pairs = [
    (comment, author)
    for comment, author in zip(training_data, training_labels)
    if author in unique_filtered_authors
]
training_data = [comment for comment, _ in kept_pairs]
training_labels = [author for _, author in kept_pairs]


54


In [33]:
from collections import Counter
import numpy as np

# --- Per-author statistics -------------------------------------------------

# How many distinct authors are left
num_authors = len(set(training_labels))

# Message tally for every author
messages_per_author = Counter(training_labels)
per_author_totals = list(messages_per_author.values())

# Mean, largest and smallest tally across authors
average_messages_per_author = np.mean(per_author_totals)
max_messages_by_author = max(per_author_totals)
min_messages_by_author = min(per_author_totals)

print(f"Average number of messages per author: {average_messages_per_author:.2f}")
print(f"Maximum number of messages by a single author: {max_messages_by_author}")
print(f"Minimum number of messages by a single author: {min_messages_by_author}")

# --- Message-length statistics (lengths are in characters) -----------------

message_lengths = list(map(len, training_data))
average_length = np.mean(message_lengths)

# The single longest message, by character count
longest_message = max(training_data, key=len)

print(f"Number of unique authors: {num_authors}")
print(f"Number of messages per author: {messages_per_author}")
print(f"Average length of messages: {average_length:.2f}")
print(f"Length of longest message: {len(longest_message)}")

# Median plus the 25th / 50th / 75th percentiles of the lengths
median_length = np.median(message_lengths)
quartiles = np.percentile(message_lengths, [25, 50, 75])

# Share of messages whose length is strictly below 512 characters
percent_shorter_than_512 = (np.asarray(message_lengths) < 512).sum() / len(message_lengths) * 100

print(f"Median length of messages: {median_length:.2f}")
print(f"Quartiles of message lengths: {quartiles}")
print(f"Percentage of messages shorter than 512 characters: {percent_shorter_than_512:.2f}%")

Average number of messages per author: 12681.92
Maximum number of messages by a single author: 52323
Minimum number of messages by a single author: 6706
Number of unique authors: 12
Number of messages per author: Counter({'MTGCardFetcher': 52323, 'Cool-Importance6004': 14791, 'SaveVideo': 12441, 'MusicMirrorMan': 11300, 'Rya_the_Scout': 9617, 'Perfect_Percentage19': 8689, 'BroMandi': 7793, 'Totally-not-Patches': 7479, 'revddit': 7280, 'reddit_lss_2': 6927, 'SourceConsistent6234': 6837, 'coinbasesupport': 6706})
Average length of messages: 959.41
Length of longest message: 9992
Median length of messages: 687.00
Quartiles of message lengths: [ 462.  687. 1397.]
Percentage of messages shorter than 512 characters: 36.64%


Author: pixelary-game
 - u/madthumbz is the first to solve this drawing!
 - Pixelary is a new pixel-based drawing and guessing game built on [Reddit's developer platform](https://developers.reddit.com). To play, press the "Guess" button to submit a guess or "Draw" button to create your own drawing. [Submit feedback](https://www.reddit.com/r/Pixelary/comments/1f578ps/hello_pixelary_community/).
 - Pixelary is a new pixel-based drawing and guessing game built on [Reddit's developer platform](https://developers.reddit.com). To play, press the "Guess" button to submit a guess or "Draw" button to create your own drawing. [Submit feedback](https://www.reddit.com/r/Pixelary/comments/1f578ps/hello_pixelary_community/).
 - u/madthumbz is the first to solve this drawing!
 - u/madthumbz is the first to solve this drawing!
 - u/madthumbz is the first to solve this drawing!
 - Pixelary is a new pixel-based drawing and guessing game built on [Reddit's developer platform](https://developers.reddit.co

In [None]:
import torch
from transformers import BertModel

import torch.nn as nn

class BertLSTMClassifier(nn.Module):
    """Text classifier: BERT encoder -> LSTM -> MLP head -> softmax.

    The token representations produced by BERT are fed through an LSTM; the
    LSTM's final hidden state (both directions concatenated when
    bidirectional) is classified by a two-layer MLP.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        lstm_hidden_dim: Hidden size of the LSTM (per direction).
        mlp_hidden_dim: Width of the MLP's hidden layer.
        num_classes: Number of output classes.
        lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability used in the MLP, and between LSTM layers
            when ``lstm_layers > 1``.
    """

    def __init__(self, bert_model_name='bert-base-uncased', lstm_hidden_dim=256, mlp_hidden_dim=128, num_classes=10, lstm_layers=1, bidirectional=True, dropout=0.3):
        super().__init__()
        # BERT encoder
        self.bert = BertModel.from_pretrained(bert_model_name)

        # LSTM layer; input size comes from BERT's hidden size.
        # nn.LSTM only applies dropout between stacked layers, so it is set
        # to 0 for a single layer (PyTorch warns otherwise).
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=bidirectional,
                            dropout=dropout if lstm_layers > 1 else 0)

        # A bidirectional LSTM yields one hidden vector per direction
        lstm_output_dim = lstm_hidden_dim * (2 if bidirectional else 1)

        # MLP for final classification
        self.mlp = nn.Sequential(
            nn.Linear(lstm_output_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden_dim, num_classes)
        )

        # Softmax over the class dimension for output probabilities
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, return_logits=False):
        """Classify a batch of tokenized sequences.

        Args:
            input_ids: Token id tensor of shape (batch_size, seq_len).
            attention_mask: Tensor of shape (batch_size, seq_len) with 1 for
                real tokens and 0 for padding, or ``None`` to treat every
                position as a real token.
            return_logits: When True, return the raw MLP outputs instead of
                probabilities. Use this for training with
                ``nn.CrossEntropyLoss``, which applies log-softmax itself and
                must not be given already-normalized probabilities.

        Returns:
            Tensor of shape (batch_size, num_classes): class probabilities
            (rows sum to 1) by default, or logits if ``return_logits`` is True.
        """
        # Get BERT encoded representations
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        if attention_mask is None:
            lstm_input = sequence_output
        else:
            # Pack the batch by true length so the LSTM never steps over
            # padding. Otherwise the forward direction's final state is taken
            # at a padded position and the backward direction starts on
            # padding, making predictions depend on the amount of padding.
            # NOTE(review): assumes right-padded batches (real tokens first),
            # as BERT tokenizers produce by default — confirm for this pipeline.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(device='cpu', dtype=torch.int64)
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                sequence_output, lengths, batch_first=True, enforce_sorted=False
            )

        # Only the final hidden state is needed; per-step outputs are discarded
        _, (hn, _) = self.lstm(lstm_input)

        # hn is (num_layers * num_directions, batch_size, hidden). For a
        # bidirectional LSTM the last two entries are the top layer's forward
        # and backward states, which are concatenated.
        if self.lstm.bidirectional:
            hidden = torch.cat((hn[-2], hn[-1]), dim=1)
        else:
            hidden = hn[-1]

        # Classify using MLP
        logits = self.mlp(hidden)
        if return_logits:
            return logits
        probs = self.softmax(logits)
        return probs

# Example usage:
# Assuming you have already tokenized your input into `input_ids` and `attention_mask` tensors.
# model = BertLSTMClassifier(num_classes=NUM_CLASSES)   # Replace NUM_CLASSES with your number of classes
# probabilities = model(input_ids, attention_mask)