In [70]:
!pip install peft
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [71]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import wandb

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
import random

In [72]:
# model_name = "tiiuae/falcon-7b"
# model_name = "tiiuae/falcon-rw-1b"
model_name = "distilbert-base-uncased"
# mistral = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [73]:
# define label maps
id2label = {0: "Irrelevant", 1: "Relevant"}
label2id = {"Irrelevant":0, "Relevant":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# Tokenize input text
input_text = "Hello"
inputs = tokenizer(input_text, return_tensors="pt")

# Inference
outputs = model(**inputs)

# Post-processing
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()
print("Predicted Class:", predicted_class)

Predicted Class: 0


# Low Rank Adaptation Fine Tuning

## Dataset Gathering

In [75]:
df = pd.read_csv('../data/WaTA_dataset.csv', encoding = "ISO-8859-1")

test_set = 0.3

x_train, y_train, x_test, y_test = [], [], [] ,[]
relevant_sentences, irrelevant_sentences = [], []

for index, row in df.iterrows():
    if row['Class'] == 'Relevant':
        relevant_sentences.append(row['Sentence'])
    else:
        irrelevant_sentences.append(row['Sentence'])

random.shuffle(relevant_sentences)
random.shuffle(irrelevant_sentences)

relevant_train = relevant_sentences[:int(len(relevant_sentences)*(1-test_set))]
relevant_test = relevant_sentences[int(len(relevant_sentences)*(1-test_set)):]
irrelevant_train = irrelevant_sentences[:int(len(irrelevant_sentences)*(1-test_set))]
irrelevant_test = irrelevant_sentences[int(len(irrelevant_sentences)*(1-test_set)):]

for sentence in relevant_train:
    x_train.append(sentence)
    y_train.append(1)
    
for sentence in irrelevant_train:
    x_train.append(sentence)
    y_train.append(0)

for sentence in relevant_test:
    x_test.append(sentence)
    y_test.append(1)
    
for sentence in irrelevant_test:
    x_test.append(sentence)
    y_test.append(0)       
        
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})

In [76]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 17577
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 7534
    })
})

## Preprocess data

In [77]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [78]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=2048
    )

    return tokenized_inputs

In [79]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/17577 [00:00<?, ? examples/s]

Map: 100%|██████████| 17577/17577 [00:00<00:00, 31239.86 examples/s]
Map: 100%|██████████| 7534/7534 [00:00<00:00, 30720.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 17577
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7534
    })
})

In [80]:
# create data collator (similar to a pytorch dataloader)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation

In [81]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 5.14MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 5.33MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 5.10MB/s]


In [82]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels), 
            "precision": precision.compute(predictions=predictions, references=labels),
            "recall": recall.compute(predictions=predictions, references=labels),
            "f1": f1.compute(predictions=predictions, references=labels)}

In [92]:

def evaluate_model(examples):
    correct = 0
    for sentence in examples:
        # tokenize text
        inputs = tokenizer.encode(sentence["text"], return_tensors="pt").to(model.device)
        # compute logits
        logits = model(inputs).logits
        # convert logits to label
        predictions = torch.argmax(logits)
        
        if predictions.item() == sentence["label"]:
            correct += 1

        print(sentence["text"])
        print("Predicted:", id2label[predictions.item()] + ", Actual:", id2label[sentence["label"]])
    print("Accuracy:", correct/len(examples))

In [84]:
# define list of examples
examples = dataset["test"].shuffle().select(range(10))

print("Untrained model predictions:")
print("----------------------------")
evaluate_model(examples)

Untrained model predictions:
----------------------------
The employee leads them to their table
Predicted: Irrelevant, Actual: Relevant
The steam will allow the pores to open zero blackheads goal  
Predicted: Irrelevant, Actual: Relevant
The still warm conched chocolate is placed in a tempering machine so that it can be slowly and steadily cooled 
Predicted: Irrelevant, Actual: Relevant
The largest panel left open is the outside back
Predicted: Irrelevant, Actual: Irrelevant
The user recovers his password
Predicted: Irrelevant, Actual: Relevant
The bass strings are also wrapped with copper windings in a process called loading the strings
Predicted: Relevant, Actual: Irrelevant
The employees and students take the shelving toll to fill the shelves
Predicted: Irrelevant, Actual: Relevant
After the particular thread for ribbon has been spun dyed and treated it is rolled on bobbins
Predicted: Irrelevant, Actual: Relevant
The packaging team arrives in the department
Predicted: Irrelevant, A

# Train model

In [85]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin', 'v_lin'])

In [86]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 665,858 || all params: 67,620,868 || trainable%: 0.9846930684178736


In [87]:
# hyperparameters
lr = 1e-3
batch_size = 16
num_epochs = 10
weight_decay=0.01

In [88]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_name + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [89]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

In [90]:
# # train model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2928,0.269754,{'accuracy': 0.8879745155295992},{'precision': 0.9380849532037437},{'recall': 0.9124649859943977},{'f1': 0.9250976215832446}
2,0.2784,0.27925,{'accuracy': 0.8926201221130874},{'precision': 0.9217271632547738},{'recall': 0.9380252100840336},{'f1': 0.9298047722342734}
3,0.2463,0.294752,{'accuracy': 0.8910273427130343},{'precision': 0.9241977450130096},{'recall': 0.9327731092436975},{'f1': 0.9284656269059858}
4,0.2408,0.266015,{'accuracy': 0.8863817361295461},{'precision': 0.9360632183908046},{'recall': 0.9124649859943977},{'f1': 0.9241134751773049}
5,0.222,0.289466,{'accuracy': 0.8875763206795859},{'precision': 0.9330603525013352},{'recall': 0.9175420168067226},{'f1': 0.9252361196928237}
6,0.2034,0.292783,{'accuracy': 0.895009291213167},{'precision': 0.9225485145114203},{'recall': 0.9404761904761905},{'f1': 0.9314260944950152}
7,0.1783,0.340588,{'accuracy': 0.8930183169631006},{'precision': 0.9139385757678029},{'recall': 0.9481792717086834},{'f1': 0.9307441141089534}
8,0.1638,0.361224,{'accuracy': 0.8944783647464826},{'precision': 0.9173315226616873},{'recall': 0.946078431372549},{'f1': 0.9314832370938549}
9,0.1389,0.420212,{'accuracy': 0.8916910007963897},{'precision': 0.920041180507893},{'recall': 0.9387254901960784},{'f1': 0.9292894280762565}
10,0.1228,0.493573,{'accuracy': 0.8918237324130608},{'precision': 0.9216462889615981},{'recall': 0.9369747899159664},{'f1': 0.9292473304974391}


TrainOutput(global_step=10990, training_loss=0.20777345610489295, metrics={'train_runtime': 942.9347, 'train_samples_per_second': 186.407, 'train_steps_per_second': 11.655, 'total_flos': 1806901996184688.0, 'train_loss': 0.20777345610489295, 'epoch': 10.0})

In [93]:
evaluate_model(examples)

The employee leads them to their table
Predicted: Relevant, Actual: Relevant
The steam will allow the pores to open zero blackheads goal  
Predicted: Irrelevant, Actual: Relevant
The still warm conched chocolate is placed in a tempering machine so that it can be slowly and steadily cooled 
Predicted: Relevant, Actual: Relevant
The largest panel left open is the outside back
Predicted: Irrelevant, Actual: Irrelevant
The user recovers his password
Predicted: Relevant, Actual: Relevant
The bass strings are also wrapped with copper windings in a process called loading the strings
Predicted: Relevant, Actual: Irrelevant
The employees and students take the shelving toll to fill the shelves
Predicted: Relevant, Actual: Relevant
After the particular thread for ribbon has been spun dyed and treated it is rolled on bobbins
Predicted: Relevant, Actual: Relevant
The packaging team arrives in the department
Predicted: Relevant, Actual: Relevant
Plate guides are inserted at intervals of five to six 

# Save fine-tuned model

In [94]:
peft_model_id = "distilbert-base-uncased-lora-text-classification"
trainer.save_model(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('distilbert-base-uncased-lora-text-classification/tokenizer_config.json',
 'distilbert-base-uncased-lora-text-classification/special_tokens_map.json',
 'distilbert-base-uncased-lora-text-classification/vocab.txt',
 'distilbert-base-uncased-lora-text-classification/added_tokens.json',
 'distilbert-base-uncased-lora-text-classification/tokenizer.json')

In [95]:
config = PeftConfig.from_pretrained(peft_model_id)

test_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model_inf = PeftModel.from_pretrained(test_model, peft_model_id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [96]:
evaluate_model(examples)

The employee leads them to their table
Predicted: Relevant, Actual: Relevant
The steam will allow the pores to open zero blackheads goal  
Predicted: Irrelevant, Actual: Relevant
The still warm conched chocolate is placed in a tempering machine so that it can be slowly and steadily cooled 
Predicted: Relevant, Actual: Relevant
The largest panel left open is the outside back
Predicted: Irrelevant, Actual: Irrelevant
The user recovers his password
Predicted: Relevant, Actual: Relevant
The bass strings are also wrapped with copper windings in a process called loading the strings
Predicted: Relevant, Actual: Irrelevant
The employees and students take the shelving toll to fill the shelves
Predicted: Relevant, Actual: Relevant
After the particular thread for ribbon has been spun dyed and treated it is rolled on bobbins
Predicted: Relevant, Actual: Relevant
The packaging team arrives in the department
Predicted: Relevant, Actual: Relevant
Plate guides are inserted at intervals of five to six 