In [23]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources if not already present.
# 'punkt_tab' is the tokenizer data word_tokenize loads in NLTK 3.9 and later
# ('punkt' alone raises LookupError there); older releases still use 'punkt',
# so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords

# Function to clean the text
def clean_text(text, stop_words=None):
    """Normalise raw text before tokenisation.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), then drop stop words. Words are re-joined with
    single spaces, so leading/trailing/repeated whitespace disappears.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded on
            the first call and cached on the function object.

    Returns:
        The cleaned text ('' when nothing is left).
    """
    # Missing values: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        # Loading the NLTK corpus on every call dominates the runtime when
        # this function is applied row by row, so load it only once.
        stop_words = getattr(clean_text, '_default_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_text._default_stop_words = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # NOTE(review): apostrophes are stripped before this filter, so a
    # contraction such as "don't" is compared as "dont" - confirm that is intended.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to create averaged word vectors
def create_average_word_vector(tokens, model, num_features):
    """Average the embeddings of a document's in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one document.
        model: A trained model exposing its vectors as ``model.wv`` (as the
            gensim Word2Vec model in this notebook does), or the keyed-vectors
            object itself. The vectors object must provide ``key_to_index``
            and ``[]`` lookup by word.
        num_features: Length of each embedding vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all tokens found in the vocabulary (repeated tokens count
        each time). All zeros when no token is in the vocabulary.
    """
    # Accept either the full model or its keyed vectors.
    vectors = getattr(model, 'wv', model)
    # key_to_index is the vocabulary mapping, so membership is a hash lookup.
    # Building set(index_to_key) here would cost O(vocabulary) per document.
    vocab = vectors.key_to_index

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in tokens:
        if word in vocab:
            nwords += 1
            feature_vec = np.add(feature_vec, vectors[word])

    # Guard against division by zero for documents without known words.
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

# Load the dataset
file_path = 'data/reddit_jokes_slim_processed.csv'  # Update with your file path
jokes_df = pd.read_csv(file_path)

# Combine title and body into one text field. Missing parts are replaced by ''
# first: otherwise a single NaN turns the whole concatenation into NaN, which
# is not a string and cannot be cleaned.
jokes_df['combined_text'] = (
    jokes_df['thread_title'].fillna('') + " " + jokes_df['thread_selftext'].fillna('')
)
jokes_df['cleaned_text'] = jokes_df['combined_text'].apply(clean_text)

# Tokenize the cleaned text
jokes_df['tokens'] = jokes_df['cleaned_text'].apply(word_tokenize)

# Split the upvote ratios at the median (q=2) into two roughly equal-sized classes
jokes_df['upvote_class'] = pd.qcut(jokes_df['thread_upvote_ratio'], q=2, labels=False, duplicates='drop')

# Embedding size, shared by the Word2Vec model and the averaged features so
# the two cannot drift apart.
num_features = 300

# Train Word2Vec model
# NOTE(review): the embeddings and the median threshold above are both fitted
# on every row, i.e. before the train/test split below - confirm this is acceptable.
word2vec_model = Word2Vec(jokes_df['tokens'], vector_size=num_features, window=5, min_count=2, workers=4)

# Create averaged word vectors for the jokes
jokes_df['avg_word_vectors'] = jokes_df['tokens'].apply(
    lambda tokens: create_average_word_vector(tokens, word2vec_model, num_features)
)

# Preparing the final dataset: one row per joke, one column per embedding dimension
X = np.array(list(jokes_df['avg_word_vectors']))
y = jokes_df['upvote_class'].values
assert X.shape == (len(jokes_df), num_features), "unexpected feature matrix shape"

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Predicting on the test set
y_pred_log_reg = log_reg_model.predict(X_test)

# Evaluating the model
evaluation_report_log_reg = classification_report(y_test, y_pred_log_reg)
print(evaluation_report_log_reg)


[nltk_data] Downloading package punkt to /Users/alishahed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alishahed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.58      0.58      0.58      3765
           1       0.57      0.57      0.57      3678

    accuracy                           0.58      7443
   macro avg       0.57      0.57      0.57      7443
weighted avg       0.58      0.58      0.58      7443



In [24]:
import pandas as pd
import numpy as np
import re
import string
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources if not already present.
# 'punkt_tab' is the tokenizer data word_tokenize loads in NLTK 3.9 and later
# ('punkt' alone raises LookupError there); older releases still use 'punkt',
# so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords

# Function to clean the text
def clean_text(text, stop_words=None):
    """Normalise raw text before tokenisation.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), then drop stop words. Words are re-joined with
    single spaces, so leading/trailing/repeated whitespace disappears.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded on
            the first call and cached on the function object.

    Returns:
        The cleaned text ('' when nothing is left).
    """
    # Missing values: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        # Loading the NLTK corpus on every call dominates the runtime when
        # this function is applied row by row, so load it only once.
        stop_words = getattr(clean_text, '_default_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_text._default_stop_words = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # NOTE(review): apostrophes are stripped before this filter, so a
    # contraction such as "don't" is compared as "dont" - confirm that is intended.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to create averaged word vectors
def create_average_word_vector(tokens, model, num_features):
    """Average the embeddings of a document's in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one document.
        model: A trained model exposing its vectors as ``model.wv`` (as the
            gensim Word2Vec model in this notebook does), or the keyed-vectors
            object itself. The vectors object must provide ``key_to_index``
            and ``[]`` lookup by word.
        num_features: Length of each embedding vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all tokens found in the vocabulary (repeated tokens count
        each time). All zeros when no token is in the vocabulary.
    """
    # Accept either the full model or its keyed vectors.
    vectors = getattr(model, 'wv', model)
    # key_to_index is the vocabulary mapping, so membership is a hash lookup.
    # Building set(index_to_key) here would cost O(vocabulary) per document.
    vocab = vectors.key_to_index

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in tokens:
        if word in vocab:
            nwords += 1
            feature_vec = np.add(feature_vec, vectors[word])

    # Guard against division by zero for documents without known words.
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

# Load the dataset
file_path = 'data/reddit_jokes_slim_processed.csv'  # Update with your file path
jokes_df = pd.read_csv(file_path)

# Combine title and body into one text field. Missing parts are replaced by ''
# first: otherwise a single NaN turns the whole concatenation into NaN, which
# is not a string and cannot be cleaned.
jokes_df['combined_text'] = (
    jokes_df['thread_title'].fillna('') + " " + jokes_df['thread_selftext'].fillna('')
)
jokes_df['cleaned_text'] = jokes_df['combined_text'].apply(clean_text)

# Tokenize the cleaned text
jokes_df['tokens'] = jokes_df['cleaned_text'].apply(word_tokenize)

# Split the upvote ratios at the median (q=2) into two roughly equal-sized classes
jokes_df['upvote_class'] = pd.qcut(jokes_df['thread_upvote_ratio'], q=2, labels=False, duplicates='drop')

# Embedding size, shared by the Word2Vec model and the averaged features so
# the two cannot drift apart.
num_features = 300

# Train Word2Vec model
# NOTE(review): the embeddings and the median threshold above are both fitted
# on every row, i.e. before the train/test split below - confirm this is acceptable.
word2vec_model = Word2Vec(jokes_df['tokens'], vector_size=num_features, window=5, min_count=2, workers=4)

# Create averaged word vectors for the jokes
jokes_df['avg_word_vectors'] = jokes_df['tokens'].apply(
    lambda tokens: create_average_word_vector(tokens, word2vec_model, num_features)
)

# Preparing the final dataset: one row per joke, one column per embedding dimension
X = np.array(list(jokes_df['avg_word_vectors']))
y = jokes_df['upvote_class'].values
assert X.shape == (len(jokes_df), num_features), "unexpected feature matrix shape"

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the LightGBM model
lgbm_model = lgb.LGBMClassifier(n_estimators=500, random_state=42)
lgbm_model.fit(X_train, y_train)

# Predicting on the test set
y_pred_lgbm = lgbm_model.predict(X_test)

# Evaluating the model
evaluation_report_lgbm = classification_report(y_test, y_pred_lgbm)
print(evaluation_report_lgbm)


[nltk_data] Downloading package punkt to /Users/alishahed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alishahed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[LightGBM] [Info] Number of positive: 14779, number of negative: 14993
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 29772, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496406 -> initscore=-0.014376
[LightGBM] [Info] Start training from score -0.014376
              precision    recall  f1-score   support

           0       0.59      0.60      0.60      3765
           1       0.58      0.57      0.57      3678

    accuracy                           0.58      7443
   macro avg       0.58      0.58      0.58      7443
weighted avg       0.58      0.58      0.58      7443



In [26]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Download NLTK resources if not already present.
# 'punkt_tab' is the tokenizer data word_tokenize loads in NLTK 3.9 and later
# ('punkt' alone raises LookupError there); older releases still use 'punkt',
# so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
from nltk.corpus import stopwords

# Function to clean the text
def clean_text(text, stop_words=None):
    """Normalise raw text before tokenisation.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), then drop stop words. Words are re-joined with
    single spaces, so leading/trailing/repeated whitespace disappears.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded on
            the first call and cached on the function object.

    Returns:
        The cleaned text ('' when nothing is left).
    """
    # Missing values: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        # Loading the NLTK corpus on every call dominates the runtime when
        # this function is applied row by row, so load it only once.
        stop_words = getattr(clean_text, '_default_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_text._default_stop_words = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # NOTE(review): apostrophes are stripped before this filter, so a
    # contraction such as "don't" is compared as "dont" - confirm that is intended.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to create averaged word vectors
def create_average_word_vector(tokens, model, num_features):
    """Average the embeddings of a document's in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one document.
        model: A trained model exposing its vectors as ``model.wv`` (as the
            gensim Word2Vec model in this notebook does), or the keyed-vectors
            object itself. The vectors object must provide ``key_to_index``
            and ``[]`` lookup by word.
        num_features: Length of each embedding vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all tokens found in the vocabulary (repeated tokens count
        each time). All zeros when no token is in the vocabulary.
    """
    # Accept either the full model or its keyed vectors.
    vectors = getattr(model, 'wv', model)
    # key_to_index is the vocabulary mapping, so membership is a hash lookup.
    # Building set(index_to_key) here would cost O(vocabulary) per document.
    vocab = vectors.key_to_index

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in tokens:
        if word in vocab:
            nwords += 1
            feature_vec = np.add(feature_vec, vectors[word])

    # Guard against division by zero for documents without known words.
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

# Custom Dataset class
class JokesDataset(Dataset):
    """Map-style dataset pairing each feature row with its label."""

    def __init__(self, features, labels):
        """Keep references to two parallel, integer-indexable collections."""
        self.features, self.labels = features, labels

    def __len__(self):
        """Number of samples, taken from the label collection."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Load the dataset
file_path = 'data/reddit_jokes_slim_processed.csv'  # Update with your file path
jokes_df = pd.read_csv(file_path)

# Combine title and body into one text field. Missing parts are replaced by ''
# first: otherwise a single NaN turns the whole concatenation into NaN, which
# is not a string and cannot be cleaned.
jokes_df['combined_text'] = (
    jokes_df['thread_title'].fillna('') + " " + jokes_df['thread_selftext'].fillna('')
)
jokes_df['cleaned_text'] = jokes_df['combined_text'].apply(clean_text)

# Tokenize the cleaned text
jokes_df['tokens'] = jokes_df['cleaned_text'].apply(word_tokenize)

# Split the upvote ratios at the median (q=2) into two roughly equal-sized classes
jokes_df['upvote_class'] = pd.qcut(jokes_df['thread_upvote_ratio'], q=2, labels=False, duplicates='drop')

# Embedding size, shared by the Word2Vec model, the averaged features and the
# network's input layer so they cannot drift apart.
num_features = 300

# Train Word2Vec model
# NOTE(review): the embeddings and the median threshold above are both fitted
# on every row, i.e. before the train/test split below - confirm this is acceptable.
word2vec_model = Word2Vec(jokes_df['tokens'], vector_size=num_features, window=5, min_count=2, workers=4)

# Create averaged word vectors for the jokes
jokes_df['avg_word_vectors'] = jokes_df['tokens'].apply(
    lambda tokens: create_average_word_vector(tokens, word2vec_model, num_features)
)

# Preparing the final dataset: one row per joke, one column per embedding dimension
X = np.array(list(jokes_df['avg_word_vectors']))
y = jokes_df['upvote_class'].values
assert X.shape == (len(jokes_df), num_features), "unexpected feature matrix shape"

# Encoding the target variable as consecutive integers 0..n_classes-1
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Converting arrays to PyTorch tensors (CrossEntropyLoss needs integer "long" targets)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Creating datasets and dataloaders; only the training data is shuffled
train_dataset = JokesDataset(X_train_tensor, y_train_tensor)
test_dataset = JokesDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Neural Network Model
class NeuralNet(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    The output of ``forward`` is the raw score (logit) of each class; no
    softmax is applied here.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Number of units in the hidden layer.
            num_classes: Number of output scores per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of feature rows to per-class scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's global RNG so that weight initialisation and the shuffling
# done by train_loader are repeatable for a given feature matrix.
torch.manual_seed(42)

# One output score per distinct class found in the encoded labels
model = NeuralNet(num_features, 128, len(np.unique(y_encoded)))

# Loss and Optimizer (CrossEntropyLoss takes raw scores and integer class labels)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
model.train()  # make training mode explicit; evaluation code switches it off
for epoch in range(num_epochs):
    # The description is constant within an epoch, so set it once here
    # instead of on every batch.
    loop = tqdm(
        train_loader,
        total=len(train_loader),
        leave=False,
        desc=f'Epoch [{epoch+1}/{num_epochs}]',
    )
    for features, labels in loop:
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the latest batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

# Evaluating the model
model.eval()
y_true = []
y_pred = []
with torch.no_grad():  # no gradients are needed for inference
    for features, labels in test_loader:
        outputs = model(features)
        # Predicted class = index of the highest score in each row
        predicted = outputs.argmax(dim=1)
        y_true.extend(labels.tolist())
        y_pred.extend(predicted.tolist())

# Converting numeric labels to strings for classification_report
class_labels = [str(cls) for cls in encoder.classes_]

# Generating classification report. The encoder maps classes to 0..n-1, so
# passing `labels` explicitly keeps `target_names` aligned even if some class
# never occurs in the test labels or predictions.
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_labels))),
    target_names=class_labels,
)
print(report)

[nltk_data] Downloading package punkt to /Users/alishahed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alishahed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                                             

              precision    recall  f1-score   support

           0       0.60      0.55      0.57      3765
           1       0.57      0.61      0.59      3678

    accuracy                           0.58      7443
   macro avg       0.58      0.58      0.58      7443
weighted avg       0.58      0.58      0.58      7443

