In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from winioctlcon import RETURN_SMART_STATUS

from new import sentences

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# loading the dataset
imdb_dataset = load_dataset('imdb')

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
# I define subsample size here as the dataset is large
N= 2000

# generate random subsample indexes
rand_idx = np.random.randint(24999, size = N)

In [6]:
# Extracting the train and test data
x_train = imdb_dataset['train'][rand_idx]['text']
y_train = imdb_dataset['train'][rand_idx]['label']

x_test = imdb_dataset['test'][rand_idx]['text']
y_test = imdb_dataset['test'][rand_idx]['label']

dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train, 'text':x_train}),
                       'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 2000
    })
})

In [9]:
# percentage of label 1 in training data
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

np.float64(0.5)

In [10]:
# Now we define the model
checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0:'Negative', 1:'Positive'}
label2id = {'Negative':0, 'Positive':1}

model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Preprocessing the Data

In [12]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)

# add pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_tokens':'[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
# Now we will create tokenize function
def tokenize_function(example):
    text = example['text']
    tokenizer.truncation_side = 'left'
    tokenized_inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding= True,
        max_length=512,
    )
    return tokenized_inputs

In [16]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation

In [18]:
accuracy = evaluate.load('accuracy')

In [20]:
# Now we will define the evaluation function
def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy :':accuracy.compute(predictions = predictions, references = labels)}

# Applying Untrained Model

In [21]:
sentences = ['This was great','I do not like this movie','It was a pass for me']
for text in sentences:
    # Tokenize text
    inputs = tokenizer.encode(text, return_tensors='pt')
    # Compute Logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(text + '-' + id2label[predictions.tolist()])

This was great-Negative
I do not like this movie-Negative
It was a pass for me-Negative


# Now Train The Model

In [22]:
peft_config= LoraConfig(
    task_type='SEQ_CLS',
    r = 4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['q_lin']
)

In [23]:
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)

In [24]:
model = get_peft_model(model,peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [25]:
# HyperParameters
learning_rate = 1e-3
batch_size = 8
num_epochs = 10

In [31]:
# Now We Define Training Arguments
training_args = TrainingArguments(
    output_dir=checkpoint + '-Lora-Text-Classification',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    #evaluation_strategy="epoch",
    weight_decay=0.01,

    #save_strategy = 'epoch',
    #load_best_model_at_end=True,
)

In [32]:
# Create trainer Object
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)
# Train the model
trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.3506
1000,0.1854
1500,0.0914
2000,0.0343
2500,0.0179


TrainOutput(global_step=2500, training_loss=0.13592799263000488, metrics={'train_runtime': 322.0216, 'train_samples_per_second': 62.108, 'train_steps_per_second': 7.763, 'total_flos': 2687993364480000.0, 'train_loss': 0.13592799263000488, 'epoch': 10.0})

In [34]:
model.to('cuda')

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

# Now predicting with the Trained Model

In [37]:
sentences = ['The movie was dull', 'It was an awesome movie']

In [38]:
for text in sentences:
    inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text+ '-' + id2label[predictions.tolist()])

The movie was dull-Negative
It was an awesome movie-Positive
