In [None]:
!python pip install accelerate appdirs loralib black black[jupyter] datasets git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git sentencepiece gradio fire torch datasets bitsandbytes utils

In [1]:
#!conda install cudatoolkit
#!pip install git+https://github.com/huggingface/peft.git
#!pip install Sentencepiece
#!pip install git+https://github.com/huggingface/transformers.git # https://stackoverflow.com/questions/65854722/huggingface-albert-tokenizer-nonetype-error-with-colab
import os
import sys
from typing import List

import fire
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from datasets import load_dataset
import torch.nn as nn
import bitsandbytes as bnb

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

from transformers import AutoModelForMaskedLM
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig

from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

device = torch.device("cuda:0" if torch.cuda.device_count() > 0 else "cpu")
device

ModuleNotFoundError: No module named 'fire'

In [3]:
#Causal Language Model: only consider words to the left
model = LlamaForCausalLM.from_pretrained(
    #"tloen/alpaca-lora-7b",
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto'
)

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
print(tokenizer.pad_token_id)  # unk. we want this to be different from the eos token
print(tokenizer.eos_token_id)
print(tokenizer.unk_token_id)
tokenizer.pad_token_id = (1)
tokenizer.padding_side = "left"  # Allow batched inference


model = prepare_model_for_int8_training(model)
#model = get_peft_model(model, config) #https://github.com/huggingface/peft State-of-the-art Parameter-Efficient Fine-Tuning (PEFT) methods

resume_from_checkpoint = False

if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        model = set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


None
0
0


In [4]:
import json
import pandas as pd
from emoji import demojize
from nltk.tokenize import TweetTokenizer
from typing import List, Dict
from transformers import DataCollatorWithPadding

class MultiLabelDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    def __call__(self, features: List[Dict[str, torch.Tensor]]):
        batch = super().__call__(features)
        batch["labels"] = torch.stack([feature["label"] for feature in features])
        return batch
        
    @staticmethod
    def loss(logits, labels):
        # Use BCEWithLogitsLoss for multi-label classification
        loss_fct = torch.nn.BCEWithLogitsLoss()
        return loss_fct(logits, labels.float())

def compute_metrics(eval_pred):
    print(eval_pred)
    predictions, labels = eval_pred
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    predictions = (probs >= 0.5).to(int)
    labels = labels.astype(int)
    report = classification_report(labels, predictions, labels=range(len(classes)), output_dict=True, zero_division=0)

    metrics = {
        "accuracy": np.mean(predictions == labels),
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
    }

    return metrics

class TweetDataset(Dataset):
    def __init__(self, x, y, mlb, tokenizer, train = True):
        self.x = x
        self.y = y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.encoded_tweets = self.preprocess_text(self.x)
        self.train = train
        
    @staticmethod
    def normalizeToken(token):
        lowercased_token = token.lower()
        if token.startswith("@"):
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            return demojize(token)
        else:
            if token == "’":
                return "'"
            elif token == "…":
                return "..."
            else:
                return token
    
    def normalizeTweet(self, tweet):
        tokens = TweetTokenizer().tokenize(tweet.replace("’", "'").replace("…", "..."))
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
                .replace("n't ", " n't ")
                .replace("n 't ", " n't ")
                .replace("ca n't", "can't")
                .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
                .replace("'re ", " 're ")
                .replace("'s ", " 's ")
                .replace("'ll ", " 'll ")
                .replace("'d ", " 'd ")
                .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
                .replace(" p . m ", " p.m ")
                .replace(" a . m .", " a.m.")
                .replace(" a . m ", " a.m ")
        )
        return " ".join(normTweet.split())

    def generate_prompt(self, data_point):
        #if self.train:
        rules = "War/Terror = 1, Non-War/Terror = 0"
        examples = "" #"This tweet is about War. This tweet is about Terror."
        instructions = "Please label the following tweet as War/Terror or Non-War/Terror with the rules provided above and only provide the labels:" 

        full_prompt = rules + "\n" + examples + "\n" + instructions + data_point

        return full_prompt
    
    def tokenize(self, prompt, add_eos_token=True):
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        cutoff_len = 100000
        result = tokenizer(
            prompt,
            return_tensors="pt"
        )
        """print(result)
        print(result["input_ids"][-1] != tokenizer.eos_token_id)
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)"""

        #result["labels"] = result["input_ids"].copy()

        return result

    def preprocess_text(self, X):
        X = [self.normalizeTweet(tweet) for tweet in X] #normalize
        X = [self.generate_prompt(tweet) for tweet in X] #generate prompt
        X = [self.tokenize(tweet) for tweet in X] #tokenize
        return X
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        label = self.y[idx]
        return {'input_ids': self.encoded_tweets[idx]['input_ids'],
                'attention_mask': self.encoded_tweets[idx]['attention_mask'],
                'label': torch.tensor(label, dtype=torch.float32)}
                #'label_ids': self.labels[idx]}

#model.print_trainable_parameters() 

i = 0
train_size = "full"
epochs = 200
task = "generic"
print(f"Starting training of {i+1}. fold...")
output_dir = f"./{task}_epochs_{epochs}_train_size_{train_size}_fold_{i}"
os.makedirs(output_dir, exist_ok=True)

# Load the data for this fold
filename = f"../data/labeled_data/{task}_test_{i}.json"
with open(filename) as f:
    data = json.load(f)
train_df = pd.DataFrame(data["train"])
val_df = pd.DataFrame(data["valid"])
test_df = pd.DataFrame(data["test"])
train_annotations = train_df["annotations"].tolist()

# Get all unique classes
global classes
classes = set()
for annotation in train_annotations:
    classes.update(annotation)
classes = sorted(list(classes))

# Convert the annotations to binary labels
mlb = MultiLabelBinarizer(classes=classes)

"""# train_size argument is used to control the size of the training set 
if train_size != "full":
    train_df = train_df.sample(n=train_size)
if validation_size != "full":
    val_df = val_df.sample(n=validation_size)
if test_size != "full":
    test_df = test_df.sample(n=test_size)"""

train_labels = mlb.fit_transform(train_df["annotations"])
val_labels = mlb.transform(val_df["annotations"])
test_labels = mlb.transform(test_df["annotations"])

train_dataset = TweetDataset(train_df['text'].to_list(), torch.tensor(train_labels), mlb, tokenizer)
val_dataset = TweetDataset(val_df['text'].to_list(), torch.tensor(val_labels), mlb, tokenizer)
test_dataset = TweetDataset(test_df['text'].to_list(), torch.tensor(test_labels), mlb, tokenizer)
data_collator = MultiLabelDataCollator(tokenizer)

Starting training of 1. fold...


In [13]:
train_dataset[0]["input_ids"]

  'label': torch.tensor(label, dtype=torch.float32)}


tensor([[    0,  3362, 29914, 29911,  2704,   353, 29871, 29896, 29892, 10050,
         29899, 29956,   279, 29914, 29911,  2704,   353, 29871, 29900,    13,
            13, 12148,  3858,   278,  1494,  7780,   300,   408,  3362, 29914,
         29911,  2704,   470, 10050, 29899, 29956,   279, 29914, 29911,  2704,
           411,   278,  6865,  4944,  2038,   322,   871,  3867,   278, 11073,
         29901,  5328,  2834,  1122,  1284,   263,   982,   373, 12178,   595,
           525, 29879, 18786,  7331,  4219,  7331,  4219]])

In [6]:
# Generate
generate_ids = model.generate(
    train_dataset[0]["input_ids"],
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_length=50,
    num_return_sequences=1
)
t = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  'label': torch.tensor(label, dtype=torch.float32)}
Input length of input_ids is 67, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


tensor([[    0,  2799,  4080, 29901,  2391,   599, 11443, 28058,   297, 22968,
           936,  1797, 29889]])

In [8]:
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
import torch
device = torch.device("cuda:0" if torch.cuda.device_count() > 0 else "cpu")

base_model = "decapoda-research/llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(base_model)
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "tloen/alpaca-lora-7b",
    torch_dtype=torch.float16,
)

model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

prompt = "Instruction: Hello, are you alive?"
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4
)

generate_params = {
    "input_ids": input_ids,
    "generation_config": generation_config,
    "return_dict_in_generate": True,
    "output_scores": True,
    "max_new_tokens": 128,
}

with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )
s = generation_output.sequences[0]
output = tokenizer.decode(s)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [11]:
train_df['text'].to_list()[0]

"How life may find a way on Saturn's moon https://t.co/aCambG1yAm https://t.co/1jsT6ItnVh"

In [13]:
prompt = f"Is the Tweet '{train_df['text'].to_list()[0]}' about War/Terror? Answer with 0 for no and 1 for yes."
inputs = tokenizer("the nutrition facts of a peanut are", return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4
)

generate_params = {
    "input_ids": input_ids,
    "generation_config": generation_config,
    "return_dict_in_generate": True,
    "output_scores": True,
    "max_new_tokens": 128,
}

with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )
s = generation_output.sequences[0]
output = tokenizer.decode(s)

In [14]:
output

'<unk>the nutrition facts of a peanut are the nutrition facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a peanut are the nutration facts of a'

In [None]:
tokenizer.decode(generate_ids[0])