In [1]:
import datasets
from utils import *
import numpy as np
import models.lr as lr
import models.bert as bert
import os
import pandas as pd
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

  from .autonotebook import tqdm as notebook_tqdm


lr

In [2]:
lr_config = ModelArguments()
lr_config.num_labels = 4
lr_config.cache_size = 8
lr_config.cost = 1 #110M for bert-base
lr_config.device = 'cuda' if torch.cuda.is_available() else 'cpu'

## imdb

In [None]:
# split into train, val, test
import importlib
set_seed(42)
data_env = 'data.imdb'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/imdb_preprocessed.csv"))

llm_labels = open("./gpt_results/gpt3.5/imdb_gpt3.5_turbo_1106.txt", "r").readlines()
llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_labels]
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['label'] == llm_labels[i]:
        correct += 1
    total += 1
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    example['llm_label'] = llm_labels[idx]
    return example

data = data.map(update_labels, with_indices=True)
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['llm_label'] == llm_labels[i]:
        correct += 1
    total += 1
assert correct/total == 1.0 # should be 1.0

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

results = []
for i in np.arange(0.1, 0.9, 0.05):
    train_data = data['train'].train_test_split(train_size=i, shuffle=False)
    lr_model = lr.LogisticRegressionModelSkLearn(lr_config, data=train_data['train']['text'])

    # train model
    lr_model.train(train_data['train'])
    acc = lr_model.evaluate(data['test'])
    print(f"Train Size: {i}, Accuracy: {acc}")
    results.append((i, acc))

LLM Accuracy: 0.94152


Map: 100%|██████████| 25000/25000 [00:00<00:00, 40414.70 examples/s]


Logistic Regression Model initialized
Train Size: 0.1, Accuracy: 0.77192
Logistic Regression Model initialized
Train Size: 0.15000000000000002, Accuracy: 0.8228
Logistic Regression Model initialized
Train Size: 0.20000000000000004, Accuracy: 0.838
Logistic Regression Model initialized
Train Size: 0.25000000000000006, Accuracy: 0.85504
Logistic Regression Model initialized
Train Size: 0.30000000000000004, Accuracy: 0.86024
Logistic Regression Model initialized
Train Size: 0.3500000000000001, Accuracy: 0.86328
Logistic Regression Model initialized
Train Size: 0.40000000000000013, Accuracy: 0.86424
Logistic Regression Model initialized
Train Size: 0.45000000000000007, Accuracy: 0.86544
Logistic Regression Model initialized
Train Size: 0.5000000000000001, Accuracy: 0.86088
Logistic Regression Model initialized
Train Size: 0.5500000000000002, Accuracy: 0.86104
Logistic Regression Model initialized
Train Size: 0.6000000000000002, Accuracy: 0.86944
Logistic Regression Model initialized
Train 

In [None]:
results

[(0.1, 0.77192),
 (0.15000000000000002, 0.8228),
 (0.20000000000000004, 0.838),
 (0.25000000000000006, 0.85504),
 (0.30000000000000004, 0.86024),
 (0.3500000000000001, 0.86328),
 (0.40000000000000013, 0.86424),
 (0.45000000000000007, 0.86544),
 (0.5000000000000001, 0.86088),
 (0.5500000000000002, 0.86104),
 (0.6000000000000002, 0.86944),
 (0.6500000000000001, 0.87056),
 (0.7000000000000002, 0.87152),
 (0.7500000000000002, 0.87032),
 (0.8000000000000002, 0.86864),
 (0.8500000000000002, 0.87368)]

## hatespeech

In [25]:
# split into train, val, test
import importlib
set_seed(42)
data_env = 'data.hatespeech'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/hatespeech_preprocessed.csv"))

llm_labels = open("./gpt_results/gpt3.5/hatespeech_gpt3.5_turbo_1106.txt", "r").readlines()
llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_labels]
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['label'] == llm_labels[i]:
        correct += 1
    total += 1
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    example['llm_label'] = llm_labels[idx]
    return example

data = data.map(update_labels, with_indices=True)
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['llm_label'] == llm_labels[i]:
        correct += 1
    total += 1
assert correct/total == 1.0 # should be 1.0

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)
# print("Test Size: ", len(data['test']))

results = []
N_range = [600, 2700, 4900]
for N in N_range:
    # print(f"Train Size: {N}")
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    lr_model = lr.LogisticRegressionModelSkLearn(lr_config, data=train_data['train']['text'])

    # train model
    lr_model.train(train_data['train'])
    acc = lr_model.evaluate(data['test'])
    print(f"Train Size: {N}, Accuracy: {acc}")
    results.append((i, acc))


LLM Accuracy: 0.8334111931234234


Map: 100%|██████████| 10703/10703 [00:00<00:00, 28166.40 examples/s]


Train Size: 600, Test Size: 5352
Logistic Regression Model initialized
Recall: 0.3793677204658902
Train Size: 600, Accuracy: 0.8017563527653214
Train Size: 2700, Test Size: 5352
Logistic Regression Model initialized
Recall: 0.4925124792013311
Train Size: 2700, Accuracy: 0.8223094170403588
Train Size: 4900, Test Size: 5352
Logistic Regression Model initialized
Recall: 0.4242928452579035
Train Size: 4900, Accuracy: 0.8503363228699552


In [5]:
lr_config = ModelArguments()
lr_config.num_labels = 7
lr_config.cache_size = 8
lr_config.cost = 1 #110M for bert-base
lr_config.device = 'cuda' if torch.cuda.is_available() else 'cpu'

import importlib
data_env = 'data.isear'
data_module = importlib.import_module(data_env)

set_seed(42)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/isear_preprocessed.csv"))

isear_to_id = data_module.isear_to_id

# Change labels to id
data = data.map(lambda e: {'label': isear_to_id[e['label']]})

# llm_labels = open("./llama_results/isear_llama2_70b_chat.txt", "r").readlines()
# llm_labels = [int(l.strip()) for l in llm_labels]
llm_labels = open("./gpt_results/gpt3.5/isear_gpt3.5_turbo_1106.txt", "r").readlines()
llm_labels = [int(data_module.postprocess(l.strip())) for l in llm_labels]
# llm_labels = open("./
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['label'] == llm_labels[i]:
        correct += 1
    total += 1
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    example['llm_label'] = llm_labels[idx]
    return example

data = data.map(update_labels, with_indices=True)
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['llm_label'] == llm_labels[i]:
        correct += 1
    total += 1
assert correct/total == 1.0 # should be 1.0

# filter out labels that are -1
print("Original Size: ", len(data))
data = data.filter(lambda e: e['llm_label'] != -1)
print("Filtered Size: ", len(data))

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

results = []
# N_range = [1200, 1500, 2700]
N_range = [1500]
# N_range = [5100]
for N in N_range:
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    # print(train_data['train'])
    # print the possible train data labels
    # print(set(train_data['train']['label']))
    # print(set(train_data['train']['llm_label']))
    lr_model = lr.LogisticRegressionModelSkLearn(lr_config, data=train_data['train']['text'])

    # train model
    lr_model.train(train_data['train'])
    acc = lr_model.evaluate(data['test'])
    print(f"Train Size: {N}, Accuracy: {acc}")
    results.append((i, acc))


Map:   0%|          | 0/7666 [00:00<?, ? examples/s]

Map: 100%|██████████| 7666/7666 [00:00<00:00, 39933.20 examples/s]


LLM Accuracy: 0.7034959561701017


Map: 100%|██████████| 7666/7666 [00:00<00:00, 30540.94 examples/s]


Original Size:  7666


Filter: 100%|██████████| 7666/7666 [00:00<00:00, 213466.03 examples/s]

Filtered Size:  7666
Train Size: 1500, Test Size: 3833
Logistic Regression Model initialized





Recall: 0.8459595959595959
Train Size: 1500, Accuracy: 0.47456300547873725


## FEVER

In [None]:
# split into train, val, test
import importlib
set_seed(42)
data_env = 'data.fever'
data_module = importlib.import_module(data_env)
data = datasets.Dataset.from_pandas(pd.read_csv("./data/fever_preprocessed.csv"))

data = data.map(lambda example: {'label': 0 if example['label'] == 'REFUTES' else 1, 'text': example['text']})

# isear_to_id = data_module.isear_to_id

# Change labels to id
# data = data.map(lambda e: {'label': isear_to_id[e['label']]})

llm_labels = open("./llama_results/fever_llama2_70b_chat.txt", "r").readlines()
llm_labels = [int(l.strip()) for l in llm_labels]
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['label'] == llm_labels[i]:
        correct += 1
    total += 1
    # print(data[i]['label'], llm_labels[i])
print(f"LLM Accuracy: {correct/total}")

def update_labels(example, idx):
    example['llm_label'] = llm_labels[idx]
    return example

data = data.map(update_labels, with_indices=True)
total, correct = 0, 0
for i, d in enumerate(data):
    if data[i]['llm_label'] == llm_labels[i]:
        correct += 1
    total += 1
assert correct/total == 1.0 # should be 1.0

# data = data.filter(lambda e: e['llm_label'] != -1)

# split data into train and test
data = data.shuffle()
data = data.train_test_split(test_size=0.5)

results = []
N_range = [700, 2000, 2800]
# N_range = [5100]
for N in N_range:
    train_data = data['train'].train_test_split(train_size=N, shuffle=False)
    print(f"Train Size: {len(train_data['train'])}, Test Size: {len(data['test'])}")
    lr_model = lr.LogisticRegressionModelSkLearn(lr_config, data=train_data['train']['text'])

    # train model
    lr_model.train(train_data['train'])
    acc = lr_model.evaluate(data['test'])
    print(f"Train Size: {N}, Accuracy: {acc}")
    results.append((i, acc))
