In [1]:
# bbc_news

import warnings
warnings.filterwarnings("ignore")

import csv
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, r2_score

# ---- Configuration --------------------------------------------------------
dataset = 'bbc_news' # folder and dataset name
# cols = ['gender', 'age', 'politics'] # outcome columns
cols = ['label_x'] # outcome columns selected from the merged frame
outcome = 'label' # target column name after renaming; also used in the results filename

num_epochs = 500 # training epochs per (model, run)

# ---- Shared inputs --------------------------------------------------------
# Full dataset; joined against the train/test messages to recover the labels.
df = pd.read_csv(dataset + ".csv")

print(dataset)

# Results keyed by topic-model name, filled in by the training loop.
all_scores = {}

# The train/test message frames are read once from BERTopic/run_1 and reused for
# every model and run.
# NOTE(review): this presumes all models/runs were fitted on the same split and
# document order - confirm against the topic-extraction step.
# Files are opened in `with` blocks so the handles are closed right after loading.
with open(dataset + '/BERTopic/run_1/train.pkl', 'rb') as f:
    train = pickle.load(f)
with open(dataset + '/BERTopic/run_1/test.pkl', 'rb') as f:
    test = pickle.load(f)

class LogisticRegressionModel(nn.Module):
    """Multinomial logistic regression: a single linear layer that outputs class logits.

    Args:
        input_size: number of input features (columns of the topic distribution).
        num_classes: number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.layer1 = nn.Linear(input_size, num_classes)

    def forward(self, x):
        # Raw logits are returned; nn.CrossEntropyLoss applies the softmax itself.
        return self.layer1(x)


# Adam learning rate per topic model; models not listed here use `default_lr`.
learning_rates = {'GensimLDA': 0.05, 'BERTopic': 0.05}
default_lr = 0.003

for model in ['GensimLDA', 'BERTopic', 'NMF', 'Mallet_LDA', 'CTM']:
    print(model)

    # Per-run metrics for this topic model.
    train_error_list = []
    test_error_list = []

    r2s_train = []
    r2s_test = []

    lr = learning_rates.get(model, default_lr)

    for run in range(1, 6):
        print("Run: " + str(run))

        # Seed the weight initialisation so each run's scores can be reproduced.
        torch.manual_seed(run)

        run_dir = dataset + '/' + model + '/run_' + str(run) + '/'

        # loading in distributions that were saved during topic extraction
        # (opened via `with` so the file handles are released immediately)
        with open(run_dir + model + '_topic_distribution_test.pkl', 'rb') as f:
            test_distribution = pickle.load(f)
        with open(run_dir + model + '_topic_distribution_train.pkl', 'rb') as f:
            train_distribution = pickle.load(f)

        if model == 'CTM':
            # Only the first 80% of the saved CTM train distribution is kept.
            # NOTE(review): presumably the remainder belongs to documents that are
            # not in `train` - confirm against the CTM extraction step.
            train_distribution = train_distribution[:round(len(train_distribution) * 0.80)]

        # Topic word lists for this run, one CSV row per topic.
        # NOTE(review): `topics` is not used anywhere else in this cell.
        topics = []
        with open(run_dir + 'topics_100.txt', 'r') as f:
            for row in csv.reader(f):
                topics.append([item.strip() for item in row if item.strip()])

        temp = pd.concat([train, test]).reset_index(drop=True) # concatenating train and test datasets
        distribution = np.concatenate([train_distribution, test_distribution]) # concatenating train and test distributions

        # Attach the labels from the full dataset to the train+test messages.
        merged = pd.merge(temp, df, how='inner', left_on = 'message_id', right_on = 'Unnamed: 0')[['message_id', 'message_x'] + cols]
        merged.columns = ['message_id', 'message', 'label']

        # Features and labels are paired purely by position, so a dropped or
        # duplicated row in the inner merge would silently shift every label.
        if len(merged) != len(distribution):
            raise ValueError(
                f'{model} run {run}: {len(distribution)} topic-distribution rows '
                f'but {len(merged)} labelled rows after merge'
            )

        X = distribution
        y = merged[outcome].reset_index(drop=True) # the outcome we care about

        # 80-20 split --> didn't use train-test-split function since its already shuffled
        split = round(0.80 * len(X))
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        # Convert arrays to torch tensors
        X_train_tensor = torch.tensor(np.array(X_train).astype(np.float32))
        y_train_tensor = torch.tensor(np.array(y_train).astype(np.longlong))  # Use long for classification
        X_test_tensor = torch.tensor(np.array(X_test).astype(np.float32))
        y_test_tensor = torch.tensor(np.array(y_test).astype(np.longlong))

        # Create datasets and dataloaders
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

        train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=False)
        test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

        input_size = X_train.shape[1]
        # CrossEntropyLoss needs every target id to be < num_classes. Using max + 1
        # equals the number of unique labels when they are exactly 0..K-1, and still
        # works when an id is absent from the training slice.
        num_classes = int(np.max(y_train)) + 1
        logit_model = LogisticRegressionModel(input_size, num_classes)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()  # This includes softmax
        optimizer = optim.Adam(logit_model.parameters(), lr=lr)

        # Train the model
        for epoch in range(num_epochs):
            for inputs, targets in train_loader:
                # Forward pass
                outputs = logit_model(inputs)
                loss = criterion(outputs, targets)

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Progress report; `loss` is the loss of the last mini-batch of the epoch.
            if (epoch+1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

        # Predict on the train and test sets
        logit_model.eval()  # Set the model to evaluation mode

        with torch.no_grad():
            y_pred_train = logit_model(X_train_tensor)
            _, predicted_train = torch.max(y_pred_train, 1)
            y_pred_test = logit_model(X_test_tensor)
            _, predicted_test = torch.max(y_pred_test, 1)

        predicted_train_np = predicted_train.numpy()
        predicted_test_np = predicted_test.numpy()

        # Error rate = 1 - accuracy (arguments in sklearn's (y_true, y_pred) order).
        train_error = 1 - accuracy_score(y_train, predicted_train_np)
        test_error = 1 - accuracy_score(y_test, predicted_test_np)

        train_error_list.append(train_error)
        test_error_list.append(test_error)

        print(f'Train Error: {train_error}')
        print(f'Test Error: {test_error}')

        print()

        # NOTE(review): R2 treats the class ids as numeric values; it is only
        # meaningful if the labels are ordinal - confirm this metric is intended.
        r2_train = r2_score(y_train, predicted_train_np)
        r2_test = r2_score(y_test, predicted_test_np)

        r2s_train.append(r2_train)
        r2s_test.append(r2_test)

        print(f'R2 Train: {r2_train}')
        print(f'R2 Test: {r2_test}')

        # Updated after every run so partial results survive an interrupted cell.
        # The metric lists grow per run; the y_* entries hold the latest run only.
        all_scores[model] = {
            'Train Error': train_error_list,
            'Test Error': test_error_list,
            'R2 Train': r2s_train,
            'R2 Test': r2s_test,

            'y_train': y_train,
            'y_train_pred': predicted_train_np,
            'y_test': y_test,
            'y_test_pred': predicted_test_np
        }

    print()

print()

# Report the mean of each metric over the runs, per topic model.
for m, scores in all_scores.items():
    print(f'{m} Average Train Error: {np.mean(scores["Train Error"])}')
    print(f'{m} Average Test Error: {np.mean(scores["Test Error"])}')
    print(f'{m} Average R2 Train: {np.mean(scores["R2 Train"])}')
    print(f'{m} Average R2 Test: {np.mean(scores["R2 Test"])}')
    print()

# Persist the full results dict. The output directory is created first so that a
# missing folder cannot discard the results of the whole training run.
os.makedirs('all_results', exist_ok=True)
with open('all_results/' + dataset + '_all_scores_' + outcome + '.pkl', 'wb') as f:
    pickle.dump(all_scores, f)

bbc_news
GensimLDA
Run: 1
Epoch [10/500], Loss: 1.5406
Epoch [20/500], Loss: 1.4712
Epoch [30/500], Loss: 1.4092
Epoch [40/500], Loss: 1.3531
Epoch [50/500], Loss: 1.3020
Epoch [60/500], Loss: 1.2549
Epoch [70/500], Loss: 1.2114
Epoch [80/500], Loss: 1.1709
Epoch [90/500], Loss: 1.1330
Epoch [100/500], Loss: 1.0975
Epoch [110/500], Loss: 1.0641
Epoch [120/500], Loss: 1.0326
Epoch [130/500], Loss: 1.0029
Epoch [140/500], Loss: 0.9748
Epoch [150/500], Loss: 0.9481
Epoch [160/500], Loss: 0.9228
Epoch [170/500], Loss: 0.8989
Epoch [180/500], Loss: 0.8761
Epoch [190/500], Loss: 0.8544
Epoch [200/500], Loss: 0.8337
Epoch [210/500], Loss: 0.8141
Epoch [220/500], Loss: 0.7953
Epoch [230/500], Loss: 0.7775
Epoch [240/500], Loss: 0.7604
Epoch [250/500], Loss: 0.7441
Epoch [260/500], Loss: 0.7286
Epoch [270/500], Loss: 0.7137
Epoch [280/500], Loss: 0.6995
Epoch [290/500], Loss: 0.6859
Epoch [300/500], Loss: 0.6729
Epoch [310/500], Loss: 0.6604
Epoch [320/500], Loss: 0.6485
Epoch [330/500], Loss: 

In [2]:
# Display the per-model results collected by the training cell.
# NOTE(review): the saved output of this cell maps each model to a flat list of
# five numbers, whereas the training cell stores a dict of metric lists and label
# arrays per model - the output looks stale; re-run after a fresh Restart & Run All.
all_scores

{'GensimLDA': [0.8966292134831461,
  0.8966292134831461,
  0.8966292134831461,
  0.8966292134831461,
  0.8966292134831461],
 'BERTopic': [0.9483146067415731,
  0.9595505617977528,
  0.950561797752809,
  0.9438202247191011,
  0.9348314606741573],
 'NMF': [0.9662921348314607,
  0.9707865168539326,
  0.9640449438202248,
  0.9640449438202248,
  0.9662921348314607],
 'Mallet_LDA': [0.952808988764045,
  0.9550561797752809,
  0.9640449438202248,
  0.9640449438202248,
  0.9640449438202248],
 'CTM': [0.9617977528089887,
  0.9483146067415731,
  0.9438202247191011,
  0.9483146067415731,
  0.9393258426966292]}

In [None]:
# All Scores: {'GensimLDA': [0.8966292134831461, 0.8966292134831461, 0.8966292134831461, 0.8966292134831461, 0.8966292134831461], 'BERTopic': [0.9483146067415731, 0.9595505617977528, 0.950561797752809, 0.9438202247191011, 0.9348314606741573], 'NMF': [0.9662921348314607, 0.9707865168539326, 0.9640449438202248, 0.9640449438202248, 0.9662921348314607], 'Mallet_LDA': [0.952808988764045, 0.9550561797752809, 0.9640449438202248, 0.9640449438202248, 0.9640449438202248], 'CTM': [0.9617977528089887, 0.9483146067415731, 0.9438202247191011, 0.9483146067415731, 0.9393258426966292]}

# GensimLDA Average: 0.8966292134831461
# Mallet_LDA Average: 0.9600000000000002
# CTM Average: 0.948314606741573
# BERTopic Average: 0.9474157303370786
# NMF Average: 0.9662921348314608