In [5]:
# amazon

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import pickle
import csv
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

dataset = 'amazon' # folder and dataset name
# cols = ['gender', 'age', 'politics'] # outcome columns
cols = ['label_x'] # outcome columns
outcome = 'label'

logit = True

df = pd.read_csv("data/" + dataset + ".csv")

print(dataset)

# for model in ['GensimLDA', 'BERTopic', 'NMF', 'Mallet_LDA', 'CTM']:
for model in ['BERTopic']:
    print(model)
    scores = []
    scores_logit = []
    scores_rf = []
    scores_old = []

    rmses = []
    r2s = []
    maes = []

    for run in range(1, 2):

        print("Run: " + str(run))

        # loading in distributions that were saved during topic extraction
        test_distribution = pickle.load(open(dataset + '/' + model + '/run_' + str(run) + '/' + model + '_topic_distribution_test.pkl', 'rb'))
        train_distribution = pickle.load(open(dataset + '/' + model + '/run_' + str(run) + '/' + model + '_topic_distribution_train.pkl', 'rb'))
        train = pickle.load(open(dataset + '/BERTopic/run_1/train.pkl', 'rb'))
        test = pickle.load(open(dataset + '/BERTopic/run_1/test.pkl', 'rb'))

        topics = []
        with open(dataset + '/' + model + '/run_' + str(run) + '/' + 'topics_100.txt', 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                topic_list = [item.strip() for item in row if item.strip()]
                topics.append(topic_list)

        temp = pd.concat([train, test]).reset_index(drop=True) # concatenating train and test datasets
        distribution = np.concatenate([train_distribution, test_distribution]) # concatenating train and test distributions

        merged = pd.merge(temp, df, how='inner', left_on = 'message_id', right_on = 'Unnamed: 0')[['message_id', 'message'] + cols]
        merged.columns = ['message_id', 'message', 'label']

        X = distribution
        y = merged[outcome].reset_index(drop=True) # the outcome we care about

        # # 80-20 split --> didn't use train-test-split function since its already shuffled
        X_train = X[:round(0.80 * len(X))]
        X_test = X[round(0.80 * len(X)):]

        y_train = y[:round(0.80 * len(X))]
        y_test = y[round(0.80 * len(X)):]

        lr_model = LinearRegression().fit(X_train, y_train) 
        y_pred_test = lr_model.predict(X_test)

        pred_rounded = []

        for i in np.round(y_pred_test):
            min = y_test.min()
            max = y_test.max()
            if i >= min and i <= max:
                pred_rounded.append(i)
            elif i > max:
                pred_rounded.append(max)
            else:
                pred_rounded.append(min)

        score = sklearn.metrics.accuracy_score(pred_rounded, list(y_test))
        scores.append(score)

        rmse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test, squared=False)
        r2_test = r2_score(y_true=y_test, y_pred=y_pred_test)
        mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

        rmses.append(rmse_test)
        r2s.append(r2_test)
        maes.append(mae_test)

        logit_model = LogisticRegression().fit(X_train, y_train)
        pred_logit = logit_model.predict(X_test)
        score_logit = sklearn.metrics.accuracy_score(pred_logit, list(y_test))
        scores_logit.append(score_logit)

        # rf = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
        # pred_rf = rf.predict(X_test)
        # score_rf = sklearn.metrics.accuracy_score(pred_rf, list(y_test))
        # scores_rf.append(score_rf)

        dummy_train = pd.get_dummies(train['label'], dtype=int)
        dummy_test = pd.get_dummies(test['label'], dtype=int)
        dummy_train.columns = ["label " + str(i) for i in range(len(dummy_train.columns))]
        dummy_test.columns = ["label " + str(i) for i in range(len(dummy_test.columns))]

        accuracies = []
        r2s_old = []
        for i, label_col in enumerate(list(dummy_train)):
            lr_model_old = LogisticRegression().fit(X_train, dummy_train[label_col])
            accuracies.append(lr_model_old.score(X_test, dummy_test[label_col]))
            r2s_old.append(r2_score(dummy_test[label_col], lr_model_old.predict(X_test)))
        
        accuracies = [np.mean(accuracies)] + accuracies + [np.mean(r2s_old)]
        scores_old.append(accuracies[0])
        
    print("Linear model score: " + str(np.mean(scores)))
    print("Linear model RMSE: " + str(np.mean(rmses)))
    print("Linear model R2: " + str(np.mean(r2s)))
    print("Linear model MAE: " + str(np.mean(maes)))
    print()
    print("Logistic model: " + str(np.mean(scores_logit)))
    print("Rf model: " + str(np.mean(scores_rf)))
    print("Old Logistic model: " + str(np.mean(scores_old)))
    print()

amazon
BERTopic
Run: 1
Linear model score: 0.284825
Linear model RMSE: 1.1976948175902427
Linear model R2: 0.28706706554319295
Linear model MAE: 0.9991154128697017

Logistic model: 0.31805
Rf model: nan
Old Logistic model: 0.799995



In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score

num_epochs = 300
lr = 0.003

# Convert arrays to torch tensors
X_train_tensor = torch.tensor(np.array(X_train).astype(np.float32))
y_train_tensor = torch.tensor(np.array(y_train).astype(np.longlong))
X_test_tensor = torch.tensor(np.array(X_test).astype(np.float32))
y_test_tensor = torch.tensor(np.array(y_test).astype(np.longlong))

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Define the model
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.layer1 = nn.Linear(input_size, num_classes)  # First linear layer

    def forward(self, x):
        out = self.layer1(x)
        return out

input_size = X_train.shape[1]
num_classes = len(np.unique(y_train))  # Assuming y_train contains all classes
model = LogisticRegressionModel(input_size, num_classes)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # This includes softmax
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train the model
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Predict on the test set
model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    y_pred_train = model(X_train_tensor)
    _, predicted_train = torch.max(y_pred_train.data, 1)

# Calculate accuracy
score_logit_train = accuracy_score(predicted_train.numpy(), y_train)
print(f'Train Accuracy Score: {score_logit_train}')

with torch.no_grad():
    y_pred_test = model(X_test_tensor)
    _, predicted = torch.max(y_pred_test.data, 1)

# Calculate accuracy
score_logit = accuracy_score(predicted.numpy(), y_test)
print(f'Test Accuracy Score: {score_logit}')

Epoch [10/300], Loss: 1.5376
Epoch [20/300], Loss: 1.5059
Epoch [30/300], Loss: 1.4882
Epoch [40/300], Loss: 1.4772
Epoch [50/300], Loss: 1.4696
Epoch [60/300], Loss: 1.4641
Epoch [70/300], Loss: 1.4599
Epoch [80/300], Loss: 1.4565
Epoch [90/300], Loss: 1.4538
Epoch [100/300], Loss: 1.4514
Epoch [110/300], Loss: 1.4494
Epoch [120/300], Loss: 1.4476
Epoch [130/300], Loss: 1.4460
Epoch [140/300], Loss: 1.4445
Epoch [150/300], Loss: 1.4432
Epoch [160/300], Loss: 1.4419
Epoch [170/300], Loss: 1.4407
Epoch [180/300], Loss: 1.4396
Epoch [190/300], Loss: 1.4385
Epoch [200/300], Loss: 1.4375
Epoch [210/300], Loss: 1.4365
Epoch [220/300], Loss: 1.4356
Epoch [230/300], Loss: 1.4347
Epoch [240/300], Loss: 1.4338
Epoch [250/300], Loss: 1.4329
Epoch [260/300], Loss: 1.4321
Epoch [270/300], Loss: 1.4313
Epoch [280/300], Loss: 1.4306
Epoch [290/300], Loss: 1.4298
Epoch [300/300], Loss: 1.4291
Train Accuracy Score: 0.41135
Test Accuracy Score: 0.4144


In [16]:
pd.Series(y_test).value_counts(), pd.Series(y_train).value_counts()

(label
 4    8126
 2    8081
 0    8062
 1    7960
 3    7771
 Name: count, dtype: int64,
 label
 3    32229
 1    32040
 0    31938
 2    31919
 4    31874
 Name: count, dtype: int64)

In [17]:
pd.Series(predicted).value_counts(), pd.Series(predicted_train).value_counts()

(0    10361
 4     9842
 3     7653
 1     7009
 2     5135
 Name: count, dtype: int64,
 0    41465
 4    39498
 3    30700
 1    28078
 2    20259
 Name: count, dtype: int64)