In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
from tqdm import tqdm
import nltk

pd.set_option('display.max_colwidth', 3000)
pd.set_option('display.max_rows', 3000)
pd.set_option('display.max_columns', 3000)
pd.set_option('display.width', 1000)

nltk.download('punkt')

# Load dataset
df = pd.read_csv('./data/reddit_jokes_slim_processed.csv')

# Text Preprocessing
def preprocess_text(text):
    # Tokenize and lower case
    tokens = word_tokenize(text.lower())
    return tokens

df['text'] = df['thread_title'] + ' ' + df['thread_selftext']

df['text'] = df['text'].apply(preprocess_text)

# Build a vocabulary
word_counts = Counter()
for text in df['text']:
    word_counts.update(text)
vocab = {word: i + 1 for i, (word, _) in enumerate(word_counts.most_common())} # +1 for padding token

# Encoding text and labels
def encode_text(text, vocab):
    return [vocab[word] for word in text if word in vocab]

df['encoded_text'] = df['text'].apply(lambda x: encode_text(x, vocab))

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['thread_over_18'])

# Padding sequences
def pad_sequences(encoded_texts, max_length):
    padded = np.zeros((len(encoded_texts), max_length), dtype=int)
    for i, row in enumerate(encoded_texts):
        padded[i, :len(row)] = np.array(row)[:max_length]
    return padded

max_length = 50 # You might want to set this to the length of your longest joke
df['padded_text'] = list(pad_sequences(df['encoded_text'], max_length))

# Separate majority and minority classes
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_upsampled.label.value_counts())

# Now use df_upsampled instead of df to split your dataset
train, test = train_test_split(df_upsampled, test_size=0.3)
val, test = train_test_split(test, test_size=0.6)

# PyTorch dataset
class JokesDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = torch.tensor(self.dataframe.iloc[idx]['padded_text'])
        label = torch.tensor(self.dataframe.iloc[idx]['label'])
        return text, label

# LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        super(LSTMClassifier, self).__init__()
        self.embedding_dim = embedding_dim  # Add this line
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2label = nn.Linear(hidden_dim, label_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(sentence.size(0), -1, self.embedding_dim))
        lstm_out_last = lstm_out[:, -1, :]
        label_space = self.hidden2label(lstm_out_last)
        label_scores = nn.functional.log_softmax(label_space, dim=1)
        return label_scores


# Instantiate the model, loss function, and optimizer
vocab_size = len(vocab) + 1 # +1 for padding token
model = LSTMClassifier(embedding_dim=100, hidden_dim=128, vocab_size=vocab_size, label_size=2)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training and Evaluation Functions
def train_model(model, train_loader, val_loader, optimizer, loss_function, epochs=1):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for texts, labels in tqdm(train_loader):
            optimizer.zero_grad()
            outputs = model(texts)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")
        evaluate_model(model, val_loader)

def evaluate_model(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in data_loader:
            outputs = model(texts)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels



# DataLoaders
batch_size = 32
train_loader = DataLoader(JokesDataset(train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(JokesDataset(val), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(JokesDataset(test), batch_size=batch_size, shuffle=False)

# Train the model
train_model(model, train_loader, val_loader, optimizer, loss_function, epochs=10)

# Evaluate on test set
test_preds, test_labels = evaluate_model(model, test_loader)
# Convert class labels to strings
class_names = [str(cls) for cls in label_encoder.classes_]

# Calculate and print precision, recall, and F1-score
print(classification_report(test_labels, test_preds, target_names=class_names))


[nltk_data] Downloading package punkt to /Users/alishahed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


label
0    31576
1    31576
Name: count, dtype: int64


100%|██████████| 1382/1382 [00:34<00:00, 40.09it/s]


Epoch 1/10, Loss: 0.6765641542057225


100%|██████████| 1382/1382 [00:35<00:00, 39.43it/s]


Epoch 2/10, Loss: 0.5197297176063147


100%|██████████| 1382/1382 [00:35<00:00, 39.00it/s]


Epoch 3/10, Loss: 0.26738519113181985


100%|██████████| 1382/1382 [00:35<00:00, 38.79it/s]


Epoch 4/10, Loss: 0.1402523130355012


100%|██████████| 1382/1382 [00:36<00:00, 37.80it/s]


Epoch 5/10, Loss: 0.08333283449777883


100%|██████████| 1382/1382 [00:35<00:00, 38.74it/s]


Epoch 6/10, Loss: 0.059937963605044324


100%|██████████| 1382/1382 [00:35<00:00, 38.69it/s]


Epoch 7/10, Loss: 0.042266305977601275


100%|██████████| 1382/1382 [00:35<00:00, 38.57it/s]


Epoch 8/10, Loss: 0.033750013039159615


100%|██████████| 1382/1382 [00:35<00:00, 38.53it/s]


Epoch 9/10, Loss: 0.027996766835212905


100%|██████████| 1382/1382 [00:35<00:00, 38.58it/s]


Epoch 10/10, Loss: 0.020693779186223157
              precision    recall  f1-score   support

       False       0.98      0.89      0.94      5715
        True       0.90      0.98      0.94      5653

    accuracy                           0.94     11368
   macro avg       0.94      0.94      0.94     11368
weighted avg       0.94      0.94      0.94     11368



In [22]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'models/jokes_adult_clean_classifier_lstm.pth')


# Inferernce with the trained model

In [23]:
def preprocess_new_jokes(jokes, vocab, max_length):
    # Tokenize, encode, and pad the new jokes
    tokenized_jokes = [word_tokenize(joke.lower()) for joke in jokes]
    encoded_jokes = [[vocab.get(word, 0) for word in joke] for joke in tokenized_jokes]  # 0 for unknown words
    padded_jokes = pad_sequences(encoded_jokes, max_length)
    return padded_jokes

def predict_jokes(model, jokes_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(jokes_tensor)
        predictions = outputs.argmax(dim=1)
        return predictions

# Example new jokes
new_jokes = ["Why don't scientists trust atoms? Because they make up everything!",
"What do you call a fake noodle? An impasta!",
"Why does Dr. Pepper come in a bottle? Because his wife died.",
"What's an astronaut's favorite part of the computer? The space bar.",
"Why did the bicycle fall over? It was two-tired.",
"What do you call cheese that isn't yours? Nacho cheese.",
"A guy is sitting at the doctor's office. The doctor walks in and says, 'I have some bad news. I'm afraid you're going to have to stop masturbating.' 'I don't understand, doc,' the patient says. 'Why?' 'Because,' the doctor says. 'I'm trying to examine you.'",
"Why couldn't the bicycle stand up by itself? It was two-tired.",
"Dear NASA: Your momma thought I was big enough. From, Pluto",
"Why don’t skeletons fight each other? They don’t have the guts.",
"What did the grape do when he got stepped on? He let out a little wine.",
"What does the receptionist at a sperm bank say as clients leave? 'Thanks for coming!'",
"Why don't some couples go to the gym? Because some relationships don't work out!",
"What goes in hard and dry, but comes out soft and wet? Gum.",
"I told my suitcases there will be no vacation this year. Now I'm dealing with emotional baggage.",
"It's inappropriate to make a 'dad joke' if you're not a dad. It's a faux pa.",
"I used to play piano by ear, but now I use my hands.",
"What did the toaster say to the slice of bread? 'I want you inside me.'",
"'Give it to me! Give it to me!' she yelled. 'I'm so wet, give it to me now!' She could scream all she wanted, but I was keeping the umbrella.",
"How do you embarrass an archaeologist? Give him a used tampon and ask him which period it came from.",
"My wife told me to take the spider out instead of killing it. We went and had some drinks. Cool guy, wants to be a web developer.",
"What do you call a smiling Roman soldier with a piece of hair stuck between his front teeth? A glad-he-ate-her."]

# Preprocess the jokes
preprocessed_jokes = preprocess_new_jokes(new_jokes, vocab, max_length)
jokes_tensor = torch.tensor(preprocessed_jokes)

# Get predictions
model = LSTMClassifier(embedding_dim=100, hidden_dim=128, vocab_size=vocab_size, label_size=2)

# Load the model's state dictionary
model.load_state_dict(torch.load('models/jokes_adult_clean_classifier_lstm.pth'))

predictions = predict_jokes(model, jokes_tensor)

# Convert predictions to labels
predicted_labels = [label_encoder.classes_[pred] for pred in predictions]

# Print results
for joke, label in zip(new_jokes, predicted_labels):
    print(f"Joke: {joke}\nClassified as: {'Adult' if label else 'Clean'}\n")


Joke: Why don't scientists trust atoms? Because they make up everything!
Classified as: Clean

Joke: What do you call a fake noodle? An impasta!
Classified as: Clean

Joke: Why does Dr. Pepper come in a bottle? Because his wife died.
Classified as: Clean

Joke: What's an astronaut's favorite part of the computer? The space bar.
Classified as: Clean

Joke: Why did the bicycle fall over? It was two-tired.
Classified as: Clean

Joke: What do you call cheese that isn't yours? Nacho cheese.
Classified as: Clean

Joke: A guy is sitting at the doctor's office. The doctor walks in and says, 'I have some bad news. I'm afraid you're going to have to stop masturbating.' 'I don't understand, doc,' the patient says. 'Why?' 'Because,' the doctor says. 'I'm trying to examine you.'
Classified as: Adult

Joke: Why couldn't the bicycle stand up by itself? It was two-tired.
Classified as: Clean

Joke: Dear NASA: Your momma thought I was big enough. From, Pluto
Classified as: Clean

Joke: Why don’t skelet