In [1]:
import importlib
import os

import datasets
from utils import *
import numpy as np
import pandas as pd
import torch

import models.lr as lr
import models.bert as bert

  from .autonotebook import tqdm as notebook_tqdm


## BERT

In [2]:
# Notebook-wide BERT fine-tuning settings. The dataset cells below read
# `bert_config` when they build their DataLoaders and models.
bert_config = ModelArguments()
_bert_settings = {
    "num_labels": 4,
    "model": "bert-base-uncased",
    "cache_size": 32,
    "batch_size": 16,
    "num_epochs": 4,
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
}
for _setting_name, _setting_value in _bert_settings.items():
    setattr(bert_config, _setting_name, _setting_value)

## HATESPEECH

In [10]:
# change dataset here
# Seed before any shuffling so the split made later in this cell is repeatable
# (assumes set_seed seeds the RNGs that `datasets` draws from -- TODO confirm).
set_seed(42)
data_env = 'data.hatespeech'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/hatespeech_preprocessed.csv"))

# One raw GPT-3.5 answer per line; `data_module.postprocess` turns each stripped
# line into a value int() accepts. The context manager closes the file handle.
with open("./gpt_results/gpt3.5/hatespeech_gpt3.5_turbo_1106.txt", "r") as llm_file:
    llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_file]

# Predictions are matched to rows purely by position, so a short predictions
# file cannot be used; fail here with a clear message rather than mid-loop.
if len(llm_labels) < len(data):
    raise ValueError(
        f"Expected at least {len(data)} LLM labels, found {len(llm_labels)}"
    )

# Agreement between the gold `label` column and the LLM predictions. The column
# is read once instead of materialising every full row through data[i].
gold_labels = list(data['label'])
total = len(gold_labels)
correct = sum(gold == pred for gold, pred in zip(gold_labels, llm_labels))
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    """Attach the LLM prediction for row ``idx`` to ``example``.

    Used as the ``data.map(..., with_indices=True)`` callback in this cell.
    Reads the notebook-level ``llm_labels`` list, stores its ``idx``-th entry
    under the ``'llm_label'`` key and returns the same ``example`` object.
    """
    predicted = llm_labels[idx]
    example['llm_label'] = predicted
    return example

# Add the LLM prediction to every row as a new `llm_label` column.
data = data.map(update_labels, with_indices=True)

# Sanity check: the new column must reproduce llm_labels in row order.
# Comparing the whole column at once avoids a per-row data[i] loop and cannot
# divide by zero when the dataset is empty.
assert list(data['llm_label']) == llm_labels[:len(data)], \
    "llm_label column is out of sync with llm_labels"

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

num_train = len(data['train'])
N_range = [600, 2700, 4900]

# Fail fast: train_test_split(train_size=N) only accepts 0 < N < num_train, and
# hitting a bad size after earlier fine-tuning runs have finished wastes them.
invalid_sizes = [n for n in N_range if not 0 < n < num_train]
if invalid_sizes:
    raise ValueError(
        f"Training sizes {invalid_sizes} must be between 1 and {num_train - 1}"
    )

# (N, acc) pairs, one per training-set size. An earlier version of this cell
# swept train_size fractions from np.arange(0.1, 0.9, 0.05) instead.
results = []

for N in N_range:
    print(f"Training with {N} samples")
    # shuffle=False keeps the first N rows of the already-shuffled train half.
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    dataset = GenericDataset(train_data['train'])
    train_dataloader = DataLoader(dataset, batch_size=bert_config.batch_size, shuffle=True)

    # Every size is evaluated on the same held-out half.
    val_dataset = GenericDataset(data['test'])
    val_dataloader = DataLoader(val_dataset, batch_size=bert_config.batch_size, shuffle=False)

    # train model: a new BertModel is built for each size; whatever train()
    # returns is recorded as the score (presumably validation accuracy).
    bert_model = bert.BertModel(bert_config)
    acc = bert_model.train(train_dataloader, val_dataloader)
    results.append((N, acc))

LLM Accuracy: 0.8334111931234234


Map: 100%|██████████| 10703/10703 [00:00<00:00, 47656.88 examples/s]


Training with 600 samples
Train Size: 600, Test Size: 5352


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.4222: 100%|██████████| 75/75 [00:19<00:00,  3.78it/s]


Epoch:  2


Epoch 1 Loss: 0.3117: 100%|██████████| 75/75 [00:20<00:00,  3.74it/s]


Epoch:  3


Epoch 2 Loss: 0.2261: 100%|██████████| 75/75 [00:20<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.0316: 100%|██████████| 75/75 [00:20<00:00,  3.69it/s]


Epoch:  5


Epoch 4 Loss: 0.0301: 100%|██████████| 75/75 [00:20<00:00,  3.68it/s]
100%|██████████| 669/669 [01:07<00:00,  9.90it/s]


Recall: 0.6439267886855241
Validation Accuracy:  0.804745889387145
Training with 2700 samples
Train Size: 2700, Test Size: 5352


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.1103: 100%|██████████| 338/338 [01:31<00:00,  3.71it/s]


Epoch:  2


Epoch 1 Loss: 0.0468: 100%|██████████| 338/338 [01:31<00:00,  3.71it/s]


Epoch:  3


Epoch 2 Loss: 0.3543: 100%|██████████| 338/338 [01:31<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.0045: 100%|██████████| 338/338 [01:30<00:00,  3.72it/s]


Epoch:  5


Epoch 4 Loss: 0.0024: 100%|██████████| 338/338 [01:31<00:00,  3.71it/s]
100%|██████████| 669/669 [01:06<00:00,  9.99it/s]


Recall: 0.7387687188019967
Validation Accuracy:  0.8071748878923767
Training with 4900 samples
Train Size: 4900, Test Size: 5352


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.3712: 100%|██████████| 613/613 [02:45<00:00,  3.71it/s]


Epoch:  2


Epoch 1 Loss: 1.1251: 100%|██████████| 613/613 [02:45<00:00,  3.71it/s]


Epoch:  3


Epoch 2 Loss: 0.0030: 100%|██████████| 613/613 [02:45<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.1090: 100%|██████████| 613/613 [02:45<00:00,  3.71it/s]


Epoch:  5


Epoch 4 Loss: 0.0015: 100%|██████████| 613/613 [02:45<00:00,  3.71it/s]
100%|██████████| 669/669 [01:06<00:00, 10.03it/s]

Recall: 0.7737104825291181
Validation Accuracy:  0.7935351270553064





## IMDB

In [3]:
# Load IMDB and score the GPT-3.5 labels (the train/test split happens below).
# Seed before any shuffling so the split made later in this cell is repeatable
# (assumes set_seed seeds the RNGs that `datasets` draws from -- TODO confirm).
set_seed(42)
data_env = 'data.imdb'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/imdb_preprocessed.csv"))

# One raw GPT-3.5 answer per line; `data_module.postprocess` turns each stripped
# line into a value int() accepts. The context manager closes the file handle.
with open("./gpt_results/gpt3.5/imdb_gpt3.5_turbo_1106.txt", "r") as llm_file:
    llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_file]

# Predictions are matched to rows purely by position, so a short predictions
# file cannot be used; fail here with a clear message rather than mid-loop.
if len(llm_labels) < len(data):
    raise ValueError(
        f"Expected at least {len(data)} LLM labels, found {len(llm_labels)}"
    )

# Agreement between the gold `label` column and the LLM predictions. The column
# is read once instead of materialising every full row through data[i].
gold_labels = list(data['label'])
total = len(gold_labels)
correct = sum(gold == pred for gold, pred in zip(gold_labels, llm_labels))
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    """Attach the LLM prediction for row ``idx`` to ``example``.

    Used as the ``data.map(..., with_indices=True)`` callback in this cell.
    Reads the notebook-level ``llm_labels`` list, stores its ``idx``-th entry
    under the ``'llm_label'`` key and returns the same ``example`` object.
    """
    predicted = llm_labels[idx]
    example['llm_label'] = predicted
    return example

# Add the LLM prediction to every row as a new `llm_label` column.
data = data.map(update_labels, with_indices=True)

# Sanity check: the new column must reproduce llm_labels in row order.
# Comparing the whole column at once avoids a per-row data[i] loop and cannot
# divide by zero when the dataset is empty.
assert list(data['llm_label']) == llm_labels[:len(data)], \
    "llm_label column is out of sync with llm_labels"

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

num_train = len(data['train'])
# Single training size for IMDB (the other sweeps use several sizes).
N_range = [1300]

# Fail fast: train_test_split(train_size=N) only accepts 0 < N < num_train, and
# hitting a bad size after earlier fine-tuning runs have finished wastes them.
invalid_sizes = [n for n in N_range if not 0 < n < num_train]
if invalid_sizes:
    raise ValueError(
        f"Training sizes {invalid_sizes} must be between 1 and {num_train - 1}"
    )

# (N, acc) pairs, one per training-set size.
results = []

for N in N_range:
    print(f"Training with {N} samples")
    # shuffle=False keeps the first N rows of the already-shuffled train half.
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    dataset = GenericDataset(train_data['train'])
    train_dataloader = DataLoader(dataset, batch_size=bert_config.batch_size, shuffle=True)

    # Every size is evaluated on the same held-out half.
    val_dataset = GenericDataset(data['test'])
    val_dataloader = DataLoader(val_dataset, batch_size=bert_config.batch_size, shuffle=False)

    # train model: a new BertModel is built for each size; whatever train()
    # returns is recorded as the score (presumably validation accuracy).
    bert_model = bert.BertModel(bert_config)
    acc = bert_model.train(train_dataloader, val_dataloader)
    results.append((N, acc))

LLM Accuracy: 0.94152


Map: 100%|██████████| 25000/25000 [00:00<00:00, 49886.46 examples/s]


Training with 1300 samples
Train Size: 1300, Test Size: 12500


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.5909: 100%|██████████| 82/82 [00:41<00:00,  1.96it/s]


Epoch:  2


Epoch 1 Loss: 0.6401: 100%|██████████| 82/82 [00:42<00:00,  1.93it/s]


Epoch:  3


Epoch 2 Loss: 0.0320: 100%|██████████| 82/82 [00:42<00:00,  1.94it/s]


Epoch:  4


Epoch 3 Loss: 0.8381: 100%|██████████| 82/82 [00:42<00:00,  1.93it/s]
100%|██████████| 782/782 [02:36<00:00,  4.98it/s]

Recall: 0.7391997449386258
Validation Accuracy:  0.8528





## FEVER

In [9]:
# Seed before any shuffling so the split made later in this cell is repeatable
# (assumes set_seed seeds the RNGs that `datasets` draws from -- TODO confirm).
set_seed(42)
data_env = 'data.fever'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/fever_preprocessed.csv"))

# Binarise the gold labels: 'REFUTES' -> 0, every other value -> 1.
# NOTE(review): any label other than 'REFUTES' becomes 1 -- confirm the CSV
# only contains the two expected classes.
data = data.map(lambda example: {'label': 0 if example['label'] == 'REFUTES' else 1, 'text': example['text']})

# One Llama-2 prediction per line, already stored as an integer label.
# The context manager closes the file handle.
with open("./llama_results/fever_llama2_70b_chat.txt", "r") as llm_file:
    llm_labels = [int(l.strip()) for l in llm_file]

# Predictions are matched to rows purely by position, so a short predictions
# file cannot be used; fail here with a clear message rather than mid-loop.
if len(llm_labels) < len(data):
    raise ValueError(
        f"Expected at least {len(data)} LLM labels, found {len(llm_labels)}"
    )

# Agreement between the gold `label` column and the LLM predictions. The column
# is read once instead of materialising every full row through data[i].
gold_labels = list(data['label'])
total = len(gold_labels)
correct = sum(gold == pred for gold, pred in zip(gold_labels, llm_labels))
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    """Attach the LLM prediction for row ``idx`` to ``example``.

    Used as the ``data.map(..., with_indices=True)`` callback in this cell.
    Reads the notebook-level ``llm_labels`` list, stores its ``idx``-th entry
    under the ``'llm_label'`` key and returns the same ``example`` object.
    """
    predicted = llm_labels[idx]
    example['llm_label'] = predicted
    return example

# Add the LLM prediction to every row as a new `llm_label` column.
data = data.map(update_labels, with_indices=True)

# Sanity check: the new column must reproduce llm_labels in row order.
# Comparing the whole column at once avoids a per-row data[i] loop and cannot
# divide by zero when the dataset is empty.
assert list(data['llm_label']) == llm_labels[:len(data)], \
    "llm_label column is out of sync with llm_labels"

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

num_train = len(data['train'])
# Previously tried sizes: [600, 3300, 4900]
N_range = [700, 2000, 2800]

# Fail fast: train_test_split(train_size=N) only accepts 0 < N < num_train, and
# hitting a bad size after earlier fine-tuning runs have finished wastes them.
invalid_sizes = [n for n in N_range if not 0 < n < num_train]
if invalid_sizes:
    raise ValueError(
        f"Training sizes {invalid_sizes} must be between 1 and {num_train - 1}"
    )

# (N, acc) pairs, one per training-set size.
results = []

for N in N_range:
    print(f"Training with {N} samples")
    # shuffle=False keeps the first N rows of the already-shuffled train half.
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    dataset = GenericDataset(train_data['train'])
    train_dataloader = DataLoader(dataset, batch_size=bert_config.batch_size, shuffle=True)

    # Every size is evaluated on the same held-out half.
    val_dataset = GenericDataset(data['test'])
    val_dataloader = DataLoader(val_dataset, batch_size=bert_config.batch_size, shuffle=False)

    # train model: a new BertModel is built for each size; whatever train()
    # returns is recorded as the score (presumably validation accuracy).
    bert_model = bert.BertModel(bert_config)
    acc = bert_model.train(train_dataloader, val_dataloader)
    results.append((N, acc))

Map:   0%|          | 0/6512 [00:00<?, ? examples/s]

Map: 100%|██████████| 6512/6512 [00:00<00:00, 19961.52 examples/s]


LLM Accuracy: 0.7714987714987716


Map: 100%|██████████| 6512/6512 [00:00<00:00, 49747.93 examples/s]


Training with 700 samples
Train Size: 700, Test Size: 3256


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.8973: 100%|██████████| 88/88 [00:23<00:00,  3.81it/s]


Epoch:  2


Epoch 1 Loss: 0.5371: 100%|██████████| 88/88 [00:23<00:00,  3.75it/s]


Epoch:  3


Epoch 2 Loss: 0.5423: 100%|██████████| 88/88 [00:23<00:00,  3.73it/s]


Epoch:  4


Epoch 3 Loss: 0.5342: 100%|██████████| 88/88 [00:23<00:00,  3.73it/s]


Epoch:  5


Epoch 4 Loss: 0.0764: 100%|██████████| 88/88 [00:23<00:00,  3.73it/s]
100%|██████████| 407/407 [00:40<00:00,  9.96it/s]


Recall: 0.5141451414514145
Validation Accuracy:  0.6587837837837838
Training with 2000 samples
Train Size: 2000, Test Size: 3256


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.5282: 100%|██████████| 250/250 [01:07<00:00,  3.71it/s]


Epoch:  2


Epoch 1 Loss: 0.5029: 100%|██████████| 250/250 [01:07<00:00,  3.71it/s]


Epoch:  3


Epoch 2 Loss: 0.4255: 100%|██████████| 250/250 [01:07<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.5288: 100%|██████████| 250/250 [01:07<00:00,  3.71it/s]


Epoch:  5


Epoch 4 Loss: 0.0146: 100%|██████████| 250/250 [01:07<00:00,  3.71it/s]
100%|██████████| 407/407 [00:40<00:00,  9.99it/s]


Recall: 0.43726937269372695
Validation Accuracy:  0.6566339066339066
Training with 2800 samples
Train Size: 2800, Test Size: 3256


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 0.3177: 100%|██████████| 350/350 [01:34<00:00,  3.71it/s]


Epoch:  2


Epoch 1 Loss: 0.2785: 100%|██████████| 350/350 [01:34<00:00,  3.71it/s]


Epoch:  3


Epoch 2 Loss: 0.1072: 100%|██████████| 350/350 [01:34<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.3345: 100%|██████████| 350/350 [01:34<00:00,  3.71it/s]


Epoch:  5


Epoch 4 Loss: 0.0147: 100%|██████████| 350/350 [01:34<00:00,  3.71it/s]
100%|██████████| 407/407 [00:40<00:00, 10.01it/s]

Recall: 0.47232472324723246
Validation Accuracy:  0.6753685503685504





## ISEAR

In [3]:
# ISEAR-specific BERT settings: 7 labels, smaller batches, 5 epochs.
# NOTE(review): this rebinds the notebook-wide `bert_config`, so any dataset
# cell executed after this one picks up these values. The saved HATESPEECH and
# FEVER outputs (5 epochs, 75 batches for 600 rows) look like they ran with
# this configuration -- confirm and re-run in order.
bert_config = ModelArguments()
_isear_settings = {
    "num_labels": 7,
    "model": "bert-base-uncased",
    "cache_size": 16,
    "batch_size": 8,
    "num_epochs": 5,
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
}
for _setting_name, _setting_value in _isear_settings.items():
    setattr(bert_config, _setting_name, _setting_value)

data_env = 'data.isear'
data_module = importlib.import_module(data_env)

# Seed before any shuffling so the split made later in this cell is repeatable
# (assumes set_seed seeds the RNGs that `datasets` draws from -- TODO confirm).
set_seed(42)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/isear_preprocessed.csv"))

isear_to_id = data_module.isear_to_id

# Change labels to id
data = data.map(lambda e: {'label': isear_to_id[e['label']]})

# One raw GPT-3.5 answer per line; `data_module.postprocess` turns each stripped
# line into a value int() accepts. The context manager closes the file handle.
# Alternative source: ./llama_results/isear_llama2_70b_chat.txt, whose lines
# were parsed directly with int(l.strip()).
with open("./gpt_results/gpt3.5/isear_gpt3.5_turbo_1106.txt", "r") as llm_file:
    llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_file]

# Predictions are matched to rows purely by position, so a short predictions
# file cannot be used; fail here with a clear message rather than mid-loop.
if len(llm_labels) < len(data):
    raise ValueError(
        f"Expected at least {len(data)} LLM labels, found {len(llm_labels)}"
    )

# Agreement between the gold `label` column and the LLM predictions. The column
# is read once instead of materialising every full row through data[i].
gold_labels = list(data['label'])
total = len(gold_labels)
correct = sum(gold == pred for gold, pred in zip(gold_labels, llm_labels))
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    """Attach the LLM prediction for row ``idx`` to ``example``.

    Used as the ``data.map(..., with_indices=True)`` callback in this cell.
    Reads the notebook-level ``llm_labels`` list, stores its ``idx``-th entry
    under the ``'llm_label'`` key and returns the same ``example`` object.
    """
    predicted = llm_labels[idx]
    example['llm_label'] = predicted
    return example

# Add the LLM prediction to every row as a new `llm_label` column.
data = data.map(update_labels, with_indices=True)

# Sanity check: the new column must reproduce llm_labels in row order.
# Comparing the whole column at once avoids a per-row data[i] loop and cannot
# divide by zero when the dataset is empty.
assert list(data['llm_label']) == llm_labels[:len(data)], \
    "llm_label column is out of sync with llm_labels"

# filter out labels that are -1
# (presumably -1 marks an LLM answer that postprocess could not map to a class)
print("Original Size: ", len(data))
data = data.filter(lambda e: e['llm_label'] != -1)
print("Filtered Size: ", len(data))

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

num_train = len(data['train'])
# Previously tried sizes: [600, 3300, 4900]
N_range = [1200, 1500, 2700]

# Fail fast: train_test_split(train_size=N) only accepts 0 < N < num_train, and
# hitting a bad size after earlier fine-tuning runs have finished wastes them.
invalid_sizes = [n for n in N_range if not 0 < n < num_train]
if invalid_sizes:
    raise ValueError(
        f"Training sizes {invalid_sizes} must be between 1 and {num_train - 1}"
    )

# (N, acc) pairs, one per training-set size.
results = []

for N in N_range:
    print(f"Training with {N} samples")
    # shuffle=False keeps the first N rows of the already-shuffled train half.
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    dataset = GenericDataset(train_data['train'])
    train_dataloader = DataLoader(dataset, batch_size=bert_config.batch_size, shuffle=True)

    # Every size is evaluated on the same held-out half.
    val_dataset = GenericDataset(data['test'])
    val_dataloader = DataLoader(val_dataset, batch_size=bert_config.batch_size, shuffle=False)

    # train model: a new BertModel is built for each size; whatever train()
    # returns is recorded as the score (presumably validation accuracy).
    bert_model = bert.BertModel(bert_config)
    acc = bert_model.train(train_dataloader, val_dataloader)
    results.append((N, acc))

Map:  31%|███       | 2379/7666 [00:00<00:00, 23609.33 examples/s]

Map: 100%|██████████| 7666/7666 [00:00<00:00, 22211.72 examples/s]


LLM Accuracy: 0.7034959561701017


Map: 100%|██████████| 7666/7666 [00:00<00:00, 29520.70 examples/s]


Original Size:  7666


Filter: 100%|██████████| 7666/7666 [00:00<00:00, 389267.97 examples/s]

Filtered Size:  7666
Training with 1200 samples
Train Size: 1200, Test Size: 3833



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 1.8327: 100%|██████████| 150/150 [00:39<00:00,  3.78it/s]


Epoch:  2


Epoch 1 Loss: 1.1620: 100%|██████████| 150/150 [00:40<00:00,  3.72it/s]


Epoch:  3


Epoch 2 Loss: 0.5300: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.3735: 100%|██████████| 150/150 [00:40<00:00,  3.70it/s]


Epoch:  5


Epoch 4 Loss: 0.4160: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
100%|██████████| 480/480 [00:47<00:00, 10.05it/s]


Recall: 0.7166064981949458
Validation Accuracy:  0.6196190973128098
Training with 1500 samples
Train Size: 1500, Test Size: 3833


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 1.8232: 100%|██████████| 188/188 [00:50<00:00,  3.72it/s]


Epoch:  2


Epoch 1 Loss: 1.5675: 100%|██████████| 188/188 [00:50<00:00,  3.71it/s]


Epoch:  3


Epoch 2 Loss: 0.4955: 100%|██████████| 188/188 [00:50<00:00,  3.71it/s]


Epoch:  4


Epoch 3 Loss: 0.4284: 100%|██████████| 188/188 [00:50<00:00,  3.71it/s]


Epoch:  5


Epoch 4 Loss: 0.1053: 100%|██████████| 188/188 [00:50<00:00,  3.71it/s]
100%|██████████| 480/480 [00:47<00:00, 10.02it/s]


Recall: 0.8068592057761733
Validation Accuracy:  0.6282285416123141
Training with 2700 samples
Train Size: 2700, Test Size: 3833


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Model loaded
Epoch:  1


Epoch 0 Loss: 1.8897:  24%|██▍       | 81/338 [00:22<01:10,  3.67it/s]


KeyboardInterrupt: 

In [4]:
# Display the (size, score) pairs appended by whichever sweep cell ran last.
# NOTE(review): the saved output below is keyed by train fractions (0.1 ... 0.85),
# so it appears to predate the fixed-size sweeps -- re-run to refresh it.
results

[(0.1, 0.86856),
 (0.15000000000000002, 0.8992),
 (0.20000000000000004, 0.85496),
 (0.25000000000000006, 0.89752),
 (0.30000000000000004, 0.89936),
 (0.3500000000000001, 0.90336),
 (0.40000000000000013, 0.91104),
 (0.45000000000000007, 0.90792),
 (0.5000000000000001, 0.9116),
 (0.5500000000000002, 0.91672),
 (0.6000000000000002, 0.89664),
 (0.6500000000000001, 0.89808),
 (0.7000000000000002, 0.8888),
 (0.7500000000000002, 0.90968),
 (0.8000000000000002, 0.89928),
 (0.8500000000000002, 0.91336)]