In [21]:
!pip install peft
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model_name = "tiiuae/falcon-7b"
# model_name = "tiiuae/falcon-rw-1b"
model_name = "distilbert-base-uncased"
mistral = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# define label maps
id2label = {0: "Irrelevant", 1: "Relevant"}
label2id = {"Irrelevant":0, "Relevant":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Tokenize input text
input_text = "Hello"
inputs = tokenizer(input_text, return_tensors="pt")

# Inference
outputs = model(**inputs)

# Post-processing
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()
print("Predicted Class:", predicted_class)

Predicted Class: 0


# Low Rank Adaptation Fine Tuning

## Dataset Gathering

In [8]:
df = pd.read_csv('WaTA_dataset.csv', encoding = "ISO-8859-1")

test_set = 0.3

x_train, y_train, x_test, y_test = [], [], [] ,[]
for index, row in df.iterrows():
    if random.random() < test_set:
        # TEST SET
        x_test.append(row['Sentence'])
        y_test.append(label2id[row['Class']])
    else:
        # TRAINING SET
        x_train.append(row['Sentence'])
        y_train.append(label2id[row['Class']])
        
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 17554
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 7557
    })
})

## Preprocess data

In [10]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [11]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [12]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/17554 [00:00<?, ? examples/s]

Map: 100%|██████████| 17554/17554 [00:02<00:00, 6785.15 examples/s]
Map: 100%|██████████| 7557/7557 [00:01<00:00, 6619.08 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 17554
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7557
    })
})

In [13]:
# create data collator (similar to a pytorch dataloader)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation

In [14]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")

ImportError: To be able to use evaluate-metric/accuracy, you need to install the following dependencies['scikit-learn'] using 'pip install sklearn' for instance'

In [21]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [15]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Irrelevant


Not a fan, don't recommed. - Irrelevant
Better than the first one. - Irrelevant
This is not worth watching even once. - Irrelevant
This one is a pass. - Irrelevant


# Train model

In [16]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [17]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [18]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 40
weight_decay=0.01

In [19]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_name + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [22]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

In [None]:
# # train model
trainer.train()

In [55]:
# define list of examples
text_list = ["The party sends a warrant possession request asking a warrant to be released", #R
             "If the request is rejected the process will end", #R
             "For instance a sales person on a trip rents a car", #I
             "Since the company has expense rules there are circumstances where the supervisor can accept or reject the report upon first inspection", #I
             "This notice is done by the family doctor of somebody who is in need of mental treatment", #I
             "Unfortunately it is not coupled correctly to our Enterprise Resource Planning system ERP so the data must be transferred manually", #I
             "The customer enters the withdrawal amount", #R
             "Once that information is present it has to be entered into our production planning system PPS", #R
             "Besides it creates a list of parts to be procured", #R
             "A teenager who obtains 44 out of 50 is good for example"] #I

print("Trained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model.cpu()(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Trained model predictions:
----------------------------
The party sends a warrant possession request asking a warrant to be released - Relevant
If the request is rejected the process will end - Relevant
For instance a sales person on a trip rents a car - Irrelevant
Since the company has expense rules there are circumstances where the supervisor can accept or reject the report upon first inspection - Relevant
This notice is done by the family doctor of somebody who is in need of mental treatment - Relevant
Unfortunately it is not coupled correctly to our Enterprise Resource Planning system ERP so the data must be transferred manually - Irrelevant
The customer enters the withdrawal amount - Relevant
Once that information is present it has to be entered into our production planning system PPS - Relevant
Besides it creates a list of parts to be procured - Irrelevant
A teenager who obtains 44 out of 50 is good for example - Relevant
