In [78]:
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate
import random
import argparse
from utils import *
import os

from datasets import load_from_disk

from openai import OpenAI

In [79]:
# Shared OpenAI API client; classify_texts sends its chat-completion requests through it.
# NOTE(review): constructed with no arguments, so credentials are presumably read from the
# environment — confirm before running.
client = OpenAI()

In [80]:
def build_content(text):
    """Wrap ``text`` in the sentiment-classification prompt.

    Returns a single chat message dict with role ``"user"`` whose content lists the two
    candidate classes, the text itself, and the instruction to return only the class.
    """
    prompt = (
        "Classes: [`positive`, `negative`]\n"
        f"Text: {text}\n\n"
        "Classify the text into one of the above classes. Only return the class."
    )
    return {"role": "user", "content": prompt}

In [81]:
# Directory the dataset was saved to
load_directory = "dataset/"

# Read the saved dataset (all splits) back from that directory
loaded_data = load_from_disk(load_directory)

# Bind each split to its own name: train, test and unsupervised
train_dataset, test_dataset, unsupervised_dataset = (
    loaded_data[split_name] for split_name in ("train", "test", "unsupervised")
)

In [82]:
# Number of leading test examples evaluated by every model in this notebook
# (the GPT slice and the BERT / DistilBERT subsets are all built from the first N rows).
N = 100

In [83]:
# First N reviews of the test split and their gold labels, used as input to the GPT classifier.
# NOTE(review): the logged run shows label 0 for every saved row, so this leading slice looks
# single-class — confirm whether a shuffled sample was intended.
TEXTS_TO_CLASSIFY = test_dataset['text'][:N]
# The name is spelled TRUE_LABES (sic); the cell that calls classify_texts refers to it by this name.
TRUE_LABES = test_dataset['label'][:N]

In [84]:
def classify_texts(texts, labels, output_file, model="gpt-3.5-turbo"):
    """Classify each text with a chat model and append the results to ``output_file``.

    One chat-completion request is sent per text, using the prompt built by
    ``build_content``. For every example a row ``"<predicted class>, <true label>"`` is
    appended to the file and echoed to stdout.

    Args:
        texts: Iterable of texts to classify.
        labels: Iterable of gold labels, paired positionally with ``texts``.
        output_file: Path of the results file. It is opened in append mode, so rows from
            earlier runs are kept.
        model: Name of the chat model to query.
    """
    # open() does not create missing folders, so make sure the target directory exists.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'a') as file:
        print('Started')
        for i, (text, true_label) in enumerate(zip(texts, labels)):
            response = client.chat.completions.create(
                model=model,
                messages=[build_content(text)],
            )

            # The reply can be None or carry stray whitespace/newlines; collapse it onto a
            # single line so that every example occupies exactly one row in the file.
            raw_reply = response.choices[0].message.content
            output_class = " ".join((raw_reply or "").split())

            file.write(f"{output_class}, {true_label}\n")
            # Flush per row: requests are slow and may fail mid-run, and rows already
            # reported as saved should be on disk when that happens.
            file.flush()
            print(f'Saved Row {i}: {output_class}, {true_label}')

In [85]:
%%time
# Classify the N selected reviews with the chat model. classify_texts opens the output file
# in append mode, so re-running this cell adds another set of rows instead of overwriting.
classify_texts(TEXTS_TO_CLASSIFY, TRUE_LABES, "GPT_output/gpt_out_original.txt")

Started
Saved Row 0: negative, 0
Saved Row 1: positive, 0
Saved Row 2: negative, 0
Saved Row 3: negative, 0
Saved Row 4: positive, 0
Saved Row 5: negative, 0
Saved Row 6: negative, 0
Saved Row 7: negative, 0
Saved Row 8: negative, 0
Saved Row 9: negative, 0
Saved Row 10: negative, 0
Saved Row 11: negative, 0
Saved Row 12: negative, 0
Saved Row 13: negative, 0
Saved Row 14: negative, 0
Saved Row 15: negative, 0
Saved Row 16: negative, 0
Saved Row 17: negative, 0
Saved Row 18: negative, 0
Saved Row 19: negative, 0
Saved Row 20: positive, 0
Saved Row 21: positive, 0
Saved Row 22: negative, 0
Saved Row 23: negative, 0
Saved Row 24: negative, 0
Saved Row 25: negative, 0
Saved Row 26: negative, 0
Saved Row 27: negative, 0
Saved Row 28: negative, 0
Saved Row 29: negative, 0
Saved Row 30: negative, 0
Saved Row 31: negative, 0
Saved Row 32: positive, 0
Saved Row 33: negative, 0
Saved Row 34: negative, 0
Saved Row 35: negative, 0
Saved Row 36: negative, 0
Saved Row 37: positive, 0
Saved Row 38: 

**out_100_original.txt**

100%|██████████| 13/13 [00:15<00:00,  1.16s/it]
Score:  {'accuracy': 0.92}


**out_distilBERT_100_original.txt**

100%|██████████| 13/13 [00:07<00:00,  1.73it/s]
Score:  {'accuracy': 0.92}

# BERT

In [86]:
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate
import random
import argparse
from utils import *
import os

In [88]:
# DEBUG Purpose
# parser = argparse.ArgumentParser()
# args = parser.parse_args()
class MyDict:
    """Attribute-style wrapper around a plain dict, used here in place of parsed CLI args.

    The mapping itself is kept on ``self.data`` and every key is additionally set as an
    instance attribute, so ``MyDict({"eval": True}).eval`` is ``True``.
    """

    def __init__(self, data):
        self.data = data
        for name in data:
            setattr(self, name, data[name])

# Evaluation-only settings for this section, exposed as attributes (args.eval, args.model_dir, ...)
args = MyDict(dict(
    train=False,
    train_augmented=False,
    eval=True,
    eval_transformed=False,
    model_dir="./CARC_output/out",  # directory the saved model is loaded from
    debug_transformation=False,
    learning_rate=5e-5,
    num_epochs=3,
    small=False,
))

In [89]:
# Set seed
# Seed Python's `random` module and PyTorch (CPU and CUDA) with the same fixed value
# so repeated runs draw the same random numbers.
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
# cuDNN settings commonly paired with fixed seeds for reproducibility
# (see the PyTorch reproducibility notes for their exact effect).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [90]:
# Tokenize the input
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``examples``.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True`` and its
    result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Core training function
def do_train(args, model, train_dataloader, save_dir="./out"):
    """Fine-tune ``model`` on ``train_dataloader`` and save the result to ``save_dir``.

    Uses AdamW with ``args.learning_rate`` and a linear learning-rate schedule without
    warm-up, stepped once per batch for ``args.num_epochs`` passes over the dataloader.
    Batches are moved to the module-level ``device``; the model is not moved here.

    Args:
        args: Object exposing ``learning_rate`` and ``num_epochs``.
        model: Model whose forward pass, given a batch as keyword arguments, returns an
            object with a ``loss`` attribute.
        train_dataloader: Sized iterable of batches (dicts of tensors).
        save_dir: Directory passed to ``model.save_pretrained``.
    """
    num_epochs = args.num_epochs
    total_steps = num_epochs * len(train_dataloader)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    model.train()
    progress_bar = tqdm(range(total_steps))

    for _ in range(num_epochs):
        for batch in train_dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}

            # Forward and backward pass
            model(**on_device).loss.backward()

            # Apply the update, advance the schedule, then clear the accumulated gradients
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    print("Training completed...")
    print("Saving Model....")
    model.save_pretrained(save_dir)
    
    
# Core evaluation function
def do_eval(eval_dataloader, output_dir, out_file):
    """Score a saved sequence-classification model on ``eval_dataloader``.

    Args:
        eval_dataloader: Iterable of batches (dicts of tensors) that include a ``"labels"`` entry.
        output_dir: Location passed to ``from_pretrained`` to load the model.
        out_file: Writable text handle. For every example the prediction and the label are
            written on consecutive lines, followed by a blank line.

    Returns:
        The result of the ``accuracy`` metric's ``compute()``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(output_dir)
    classifier.to(device)
    classifier.eval()

    accuracy = evaluate.load("accuracy")

    for batch in tqdm(eval_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            logits = classifier(**batch).logits

        predicted = logits.argmax(dim=-1)
        gold = batch["labels"]
        accuracy.add_batch(predictions=predicted, references=gold)

        # One record per example: prediction line, label line, blank line
        for pred, label in zip(predicted.tolist(), gold.tolist()):
            out_file.write(f"{pred}\n")
            out_file.write(f"{label}\n\n")

    return accuracy.compute()

# Create a dataloader for the augmented training dataset
def create_augmented_dataloader(dataset):
    """Build the training dataloader for the augmented training set.

    The original ``train`` split is extended with 5,000 randomly selected training examples
    that have been passed through ``custom_transform``. The combined set is tokenized with
    ``tokenize_function`` and wrapped in a shuffling ``DataLoader`` with batch size 8.

    Args:
        dataset: Object indexable by split name; only ``dataset["train"]`` is read.

    Returns:
        A ``DataLoader`` over the tokenized, augmented training data.
    """
    # Imported locally so this function does not rely on `concatenate_datasets` happening to
    # be re-exported by a wildcard import elsewhere in the notebook.
    from datasets import concatenate_datasets

    ################################
    ##### YOUR CODE BEGINS HERE ####

    # 5000 randomly selected training examples, each passed through the transformation
    train_augmented_size = 5000
    train_transformed_sample = dataset["train"].shuffle(seed=42).select(range(train_augmented_size))
    train_transformed_sample = train_transformed_sample.map(custom_transform, load_from_cache_file=False)

    # Original training split + transformed sample
    # (original note: "25,000" + "5,000" = "30,000" examples)
    train_transformed_dataset = concatenate_datasets([dataset["train"], train_transformed_sample])

    tokenized_dataset = train_transformed_dataset.map(tokenize_function, batched=True)

    # Prepare dataset for use by model
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset.set_format("torch")

    # Create the dataloader for iterating over the augmented dataset
    train_dataloader = DataLoader(tokenized_dataset, shuffle=True, batch_size=8)

    ##### YOUR CODE ENDS HERE ######

    return train_dataloader

# Create a dataloader for the transformed test set
def create_transformed_dataloader(dataset, debug_transformation):
    """Build the evaluation dataloader for the transformed ``test`` split.

    Args:
        dataset: Object indexable by split name; only ``dataset["test"]`` is read.
        debug_transformation: When truthy, print five shuffled test examples before and
            after ``custom_transform`` and then call ``exit()`` instead of returning.

    Returns:
        A ``DataLoader`` (batch size 8) over the transformed and tokenized test split.
    """
    test_split = dataset["test"]

    if debug_transformation:
        # Show 5 random examples side by side with their transformed versions, then stop
        originals = test_split.shuffle(seed=42).select(range(5))
        transformed = originals.map(custom_transform, load_from_cache_file=False)
        for idx in range(5):
            print("Original Example ", str(idx))
            print(originals[idx])
            print("\n")
            print("Transformed Example ", str(idx))
            print(transformed[idx])
            print('='*30)

        exit()

    # Transform, tokenize, then reshape the columns into what the model consumes
    prepared = (
        test_split
        .map(custom_transform, load_from_cache_file=False)
        .map(tokenize_function, batched=True, load_from_cache_file=False)
        .remove_columns(["text"])
        .rename_column("label", "labels")
    )
    prepared.set_format("torch")

    return DataLoader(prepared, batch_size=8)

In [104]:
# NOTE: `global` statements at module/cell level have no effect; `device` and `tokenizer`
# become module-level names simply by being assigned at the top level of a later cell.
global device
global tokenizer

In [91]:
# Device: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Load the IMDB dataset (tokenization happens in a later cell)
dataset = load_dataset("imdb")

In [96]:
# Keep only the first N examples of the test split for evaluation.
# NOTE(review): this rebinds `dataset` from the full set of splits to a single subset, so
# re-running this cell on its own would index the subset with "test" and presumably fail —
# re-run the `load_dataset` cell first.
# NOTE(review): the GPT run's log shows label 0 for every logged row of its first-N slice; if
# this split is ordered the same way, the subset is single-class — consider shuffling before `select`.
dataset = dataset["test"].select(range(N))

In [100]:
# Apply tokenize_function to the N-example subset in batched mode.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 1395.05 examples/s]


In [101]:
# Prepare dataset for use by model: drop the raw "text" column, rename "label" to "labels",
# and switch the dataset's output format to "torch"
tokenized_dataset = tokenized_dataset.remove_columns(["text"]).rename_column("label", "labels")
tokenized_dataset.set_format("torch")

In [102]:
# Evaluation batches of 8 examples; no `shuffle` argument is passed.
eval_dataloader = DataLoader(tokenized_dataset, batch_size=8)

In [105]:
# Evaluate the trained model on the original test dataset
if args.eval:
    # Output name: "<last path component of model_dir>_<N>_original.txt" in the working directory
    out_path = f"{os.path.basename(os.path.normpath(args.model_dir))}_{N}_original.txt"

    # The context manager closes the file even if do_eval raises part-way through
    with open(out_path, "w") as out_file:
        score = do_eval(eval_dataloader, args.model_dir, out_file)

    print("Score: ", score)

100%|██████████| 13/13 [00:15<00:00,  1.16s/it]

Score:  {'accuracy': 0.92}





# DistilBERT

In [115]:
# DEBUG Purpose
# parser = argparse.ArgumentParser()
# args = parser.parse_args()
class MyDict:
    """Attribute-style wrapper around a plain dict, used here in place of parsed CLI args.

    The mapping itself is kept on ``self.data`` and every key is additionally set as an
    instance attribute, so ``MyDict({"eval": True}).eval`` is ``True``.
    """

    def __init__(self, data):
        self.data = data
        for name in data:
            setattr(self, name, data[name])

# Evaluation-only settings for this section, exposed as attributes (args.eval, args.model_dir, ...)
args = MyDict(dict(
    train=False,
    train_augmented=False,
    eval=True,
    eval_transformed=False,
    model_dir="./CARC_output/out_distilBERT",  # directory the saved model is loaded from
    debug_transformation=False,
    learning_rate=5e-5,
    num_epochs=3,
    small=False,
))

In [116]:
# Tokenize the input
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``examples``.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True`` and its
    result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Core training function
def do_train(args, model, train_dataloader, save_dir="./out_distilbert"):
    """Fine-tune ``model`` on ``train_dataloader`` and save the result to ``save_dir``.

    Uses AdamW with ``args.learning_rate`` and a linear learning-rate schedule without
    warm-up, stepped once per batch for ``args.num_epochs`` passes over the dataloader.
    Batches are moved to the module-level ``device``; the model is not moved here.

    Args:
        args: Object exposing ``learning_rate`` and ``num_epochs``.
        model: Model whose forward pass, given a batch as keyword arguments, returns an
            object with a ``loss`` attribute.
        train_dataloader: Sized iterable of batches (dicts of tensors).
        save_dir: Directory passed to ``model.save_pretrained``.
    """
    num_epochs = args.num_epochs
    total_steps = num_epochs * len(train_dataloader)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    model.train()
    progress_bar = tqdm(range(total_steps))

    for _ in range(num_epochs):
        for batch in train_dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}

            # Forward and backward pass
            model(**on_device).loss.backward()

            # Apply the update, advance the schedule, then clear the accumulated gradients
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    print("Training completed...")
    print("Saving Model....")
    model.save_pretrained(save_dir)
    
    
# Core evaluation function
def do_eval(eval_dataloader, output_dir, out_file):
    """Score a saved sequence-classification model on ``eval_dataloader``.

    Args:
        eval_dataloader: Iterable of batches (dicts of tensors) that include a ``"labels"`` entry.
        output_dir: Location passed to ``from_pretrained`` to load the model.
        out_file: Writable text handle. For every example the prediction and the label are
            written on consecutive lines, followed by a blank line.

    Returns:
        The result of the ``accuracy`` metric's ``compute()``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(output_dir)
    classifier.to(device)
    classifier.eval()

    accuracy = evaluate.load("accuracy")

    for batch in tqdm(eval_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            logits = classifier(**batch).logits

        predicted = logits.argmax(dim=-1)
        gold = batch["labels"]
        accuracy.add_batch(predictions=predicted, references=gold)

        # One record per example: prediction line, label line, blank line
        for pred, label in zip(predicted.tolist(), gold.tolist()):
            out_file.write(f"{pred}\n")
            out_file.write(f"{label}\n\n")

    return accuracy.compute()

# Create a dataloader for the augmented training dataset
def create_augmented_dataloader(dataset):
    """Build the training dataloader for the augmented training set.

    The original ``train`` split is extended with 5,000 randomly selected training examples
    that have been passed through ``custom_transform``. The combined set is tokenized with
    ``tokenize_function`` and wrapped in a shuffling ``DataLoader`` with batch size 8.

    Args:
        dataset: Object indexable by split name; only ``dataset["train"]`` is read.

    Returns:
        A ``DataLoader`` over the tokenized, augmented training data.
    """
    # Imported locally so this function does not rely on `concatenate_datasets` happening to
    # be re-exported by a wildcard import elsewhere in the notebook.
    from datasets import concatenate_datasets

    ################################
    ##### YOUR CODE BEGINS HERE ####

    # 5000 randomly selected training examples, each passed through the transformation
    train_augmented_size = 5000
    train_transformed_sample = dataset["train"].shuffle(seed=42).select(range(train_augmented_size))
    train_transformed_sample = train_transformed_sample.map(custom_transform, load_from_cache_file=False)

    # Original training split + transformed sample
    # (original note: "25,000" + "5,000" = "30,000" examples)
    train_transformed_dataset = concatenate_datasets([dataset["train"], train_transformed_sample])

    tokenized_dataset = train_transformed_dataset.map(tokenize_function, batched=True)

    # Prepare dataset for use by model
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset.set_format("torch")

    # Create the dataloader for iterating over the augmented dataset
    train_dataloader = DataLoader(tokenized_dataset, shuffle=True, batch_size=8)

    ##### YOUR CODE ENDS HERE ######

    return train_dataloader

# Create a dataloader for the transformed test set
def create_transformed_dataloader(dataset, debug_transformation):
    """Build the evaluation dataloader for the transformed ``test`` split.

    Args:
        dataset: Object indexable by split name; only ``dataset["test"]`` is read.
        debug_transformation: When truthy, print five shuffled test examples before and
            after ``custom_transform`` and then call ``exit()`` instead of returning.

    Returns:
        A ``DataLoader`` (batch size 8) over the transformed and tokenized test split.
    """
    test_split = dataset["test"]

    if debug_transformation:
        # Show 5 random examples side by side with their transformed versions, then stop
        originals = test_split.shuffle(seed=42).select(range(5))
        transformed = originals.map(custom_transform, load_from_cache_file=False)
        for idx in range(5):
            print("Original Example ", str(idx))
            print(originals[idx])
            print("\n")
            print("Transformed Example ", str(idx))
            print(transformed[idx])
            print('='*30)

        exit()

    # Transform, tokenize, then reshape the columns into what the model consumes
    prepared = (
        test_split
        .map(custom_transform, load_from_cache_file=False)
        .map(tokenize_function, batched=True, load_from_cache_file=False)
        .remove_columns(["text"])
        .rename_column("label", "labels")
    )
    prepared.set_format("torch")

    return DataLoader(prepared, batch_size=8)

In [117]:
# Device: use CUDA when it is available, otherwise fall back to the CPU.
# (Top-level assignments in a cell already create module-level names, so no `global`
# declaration is needed.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [118]:
# Load the IMDB dataset and keep the first N examples of its test split
# (tokenization happens in the next cell)
dataset = load_dataset("imdb")["test"].select(range(N))

In [119]:
# Apply tokenize_function to the N-example subset in batched mode.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [120]:
# Prepare dataset for use by model: drop the raw "text" column, rename "label" to "labels",
# and switch the dataset's output format to "torch"
tokenized_dataset = tokenized_dataset.remove_columns(["text"]).rename_column("label", "labels")
tokenized_dataset.set_format("torch")

In [121]:
# Evaluation batches of 8 examples; no `shuffle` argument is passed.
eval_dataloader = DataLoader(tokenized_dataset, batch_size=8)

In [123]:
# Evaluate the trained model on the original test dataset
if args.eval:
    # Output name: "<last path component of model_dir>_<N>_original.txt" in the working directory
    out_path = f"{os.path.basename(os.path.normpath(args.model_dir))}_{N}_original.txt"

    # The context manager closes the file even if do_eval raises part-way through
    with open(out_path, "w") as out_file:
        score = do_eval(eval_dataloader, args.model_dir, out_file)

    print("Score: ", score)

100%|██████████| 13/13 [00:07<00:00,  1.73it/s]

Score:  {'accuracy': 0.92}



