In [1]:
!python pip install accelerate appdirs loralib black black[jupyter] datasets git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git sentencepiece gradio fire torch datasets bitsandbytes utils

In [4]:
import os
import sys
from typing import List

import fire
import torch
import transformers
from datasets import load_dataset
import torch.nn as nn
#import bitsandbytes as bnb

"""from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)"""

from transformers import AutoModelForMaskedLM
from transformers import LlamaForCausalLM, LlamaTokenizer

from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

device = torch.device("cuda:0" if torch.cuda.device_count() > 0 else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda', index=0)

In [6]:
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto'
)

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)


#model = prepare_model_for_int8_training(model)
#model = get_peft_model(model, config) #https://github.com/huggingface/peft State-of-the-art Parameter-Efficient Fine-Tuning (PEFT) methods

if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        model = set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")

def compute_metrics(eval_pred):
    print(eval_pred)
    predictions, labels = eval_pred
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    predictions = (probs >= 0.5).to(int)
    labels = labels.astype(int)
    report = classification_report(labels, predictions, labels=range(len(classes)), output_dict=True, zero_division=0)

    metrics = {
        "accuracy": np.mean(predictions == labels),
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
    }

    return metrics

class TweetDataset(Dataset):
    def __init__(self, x, y, mlb, tokenizer, train = True):
        self.x = x
        self.y = y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.encoded_tweets = self.preprocess_text(self.x)
        self.train = train
        
    @staticmethod
    def normalizeToken(token):
        lowercased_token = token.lower()
        if token.startswith("@"):
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            return demojize(token)
        else:
            if token == "’":
                return "'"
            elif token == "…":
                return "..."
            else:
                return token
    
    def normalizeTweet(self, tweet):
        tokens = TweetTokenizer().tokenize(tweet.replace("’", "'").replace("…", "..."))
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
                .replace("n't ", " n't ")
                .replace("n 't ", " n't ")
                .replace("ca n't", "can't")
                .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
                .replace("'re ", " 're ")
                .replace("'s ", " 's ")
                .replace("'ll ", " 'll ")
                .replace("'d ", " 'd ")
                .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
                .replace(" p . m ", " p.m ")
                .replace(" a . m .", " a.m.")
                .replace(" a . m ", " a.m ")
        )
        return " ".join(normTweet.split())

    def generate_prompt(self, data_point):
        #if self.train:
        rules = "War/Terror = 1, Non-War/Terror = 0"
        examples = "This tweet is about War. This tweet is about Terror."
        instructions = "Please label the following tweet as War/Terror or Non-War/Terror with the rules provided above and only provide the labels:" 

        full_prompt = rules + "\n" + examples + "\n" + instructions + data_point

        return full_prompt
    
    def tokenize(self, prompt, add_eos_token=True):
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        cutoff_len = 100000
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        #result["labels"] = result["input_ids"].copy()

        return result

    def preprocess_text(self, X):
        X = [self.normalizeTweet(tweet) for tweet in X] #normalize
        X = [self.generate_and_tokenize_prompt(tweet) for tweet in X] #generate prompt
        X = [self.tokenize(tweet) for tweet in X] #tokenize
        return X
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        label = self.y[idx]
        return {'input_ids': self.encoded_tweets['input_ids'][idx],
                'attention_mask': self.encoded_tweets['attention_mask'][idx],
                'label': torch.tensor(label, dtype=torch.float32)}
                #'label_ids': self.labels[idx]}

model.print_trainable_parameters() 

i = 0
train_size = "full"
epochs = 200
task = "generic"
print(f"Starting training of {i+1}. fold...")
output_dir = f"./{task}_epochs_{epochs}_train_size_{train_size}_fold_{i}"
os.makedirs(output_dir, exist_ok=True)

# Load the data for this fold
filename = f"./content/drive/MyDrive/Colab Notebooks/labeled_data/{task}_test_{i}.json"
with open(filename) as f:
    data = json.load(f)
train_df = pd.DataFrame(data["train"])
val_df = pd.DataFrame(data["valid"])
test_df = pd.DataFrame(data["test"])
train_annotations = train_df["annotations"].tolist()

# Get all unique classes
global classes
classes = set()
for annotation in train_annotations:
    classes.update(annotation)
classes = sorted(list(classes))

# Convert the annotations to binary labels
mlb = MultiLabelBinarizer(classes=classes)

"""# train_size argument is used to control the size of the training set 
if train_size != "full":
    train_df = train_df.sample(n=train_size)
if validation_size != "full":
    val_df = val_df.sample(n=validation_size)
if test_size != "full":
    test_df = test_df.sample(n=test_size)"""

train_labels = mlb.fit_transform(train_df["annotations"])
val_labels = mlb.transform(val_df["annotations"])
test_labels = mlb.transform(test_df["annotations"])

train_dataset = TweetDataset(train_df['text'].to_list(), torch.tensor(train_labels), mlb, tokenizer)
val_dataset = TweetDataset(val_df['text'].to_list(), torch.tensor(val_labels), mlb, tokenizer)
test_dataset = TweetDataset(test_df['text'].to_list(), torch.tensor(test_labels), mlb, tokenizer)
data_collator = MultiLabelDataCollator(tokenizer)

Downloading (…)lve/main/config.json: 100%|██████████| 427/427 [00:00<00:00, 214kB/s]
Downloading (…)model.bin.index.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 3.64MB/s]
Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]
Downloading (…)l-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]
Downloading (…)l-00001-of-00033.bin:   3%|▎         | 10.5M/405M [00:05<03:31, 1.86MB/s]
Downloading (…)l-00001-of-00033.bin:   5%|▌         | 21.0M/405M [00:11<03:38, 1.76MB/s]
Downloading (…)l-00001-of-00033.bin:   8%|▊         | 31.5M/405M [00:17<03:27, 1.80MB/s]
Downloading (…)l-00001-of-00033.bin:  10%|█         | 41.9M/405M [00:28<04:35, 1.32MB/s]
Downloading (…)l-00001-of-00033.bin:  10%|█         | 41.9M/405M [00:39<04:35, 1.32MB/s]
Downloading (…)l-00001-of-00033.bin:  13%|█▎        | 52.4M/405M [00:59<09:07, 643kB/s] 
Downloading (…)l-00001-of-00033.bin:  16%|█▌        | 62.9M/405M [01:05<06:51, 831kB/s]
Downloading (…)l-00001-of-00033.bin:  18%|█▊        | 73.4M/405M


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Users\bruno\.conda\envs\my_env\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Users\bruno\.conda\envs\my_env\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


Loading checkpoint shards: 100%|██████████| 33/33 [00:40<00:00,  1.23s/it]
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 31.0kB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 1.94MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 666B/s]
Downloading (…)okenizer_config.json: 100%|██████████| 141/141 [00:00<00:00, 47.0kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


NameError: name 'resume_from_checkpoint' is not defined