In [1]:
! pip install datasets
! pip install transformers
! pip install peft
! pip install evaluate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
dataset = load_dataset('Senem/Nostalgic_Sentiment_Analysis_of_YouTube_Comments_Data')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/978 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'comment'],
        num_rows: 1500
    })
})

In [4]:
import pandas as pd
df = pd.DataFrame(dataset['train'])

In [5]:
df['sentiment'] = df['sentiment'].map({'nostalgia': 1, 'not nostalgia': 0})

In [6]:
df['text'] = df['comment']
df['label'] = df['sentiment']
df.drop(['comment', 'sentiment'], axis=1, inplace=True)

In [7]:
np.array(df['label']).sum()/len(df['label'])

0.5

In [8]:
model_checkpoint = 'distilbert-base-uncased'
id2label = {1:'nostalgia', 0:'not nostalgia'}
label2id = {'nostalgia': 1, 'not nostalgia': 0}

In [9]:
# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def calculate_max_length(df):
    max_length = 0
    for text in df['text']:
        tokens = tokenizer.tokenize(text)
        if len(tokens) > max_length:
            max_length = len(tokens)
    return max_length

In [12]:
df.head()

Unnamed: 0,text,label
0,He was a singer with a golden voice that I lov...,0
1,The mist beautiful voice ever I listened to hi...,1
2,I have most of Mr. Reeves songs. Always love ...,1
3,30 day leave from 1st tour in Viet Nam to conv...,0
4,listening to his songs reminds me of my mum wh...,1


In [13]:
max_length = calculate_max_length(df)

In [14]:

def tokenize_function(text):
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        max_length=max_length,
        padding='max_length',
    )
    return tokenized_inputs

In [15]:
df[['input_ids', 'attention_mask']] = df['text'].apply(lambda x: pd.Series(tokenize_function(x)))

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [18]:
# TBL
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [19]:
text_list = ["I remember how we used to dance to his songs",
             "This is not good.",
             "I loved his voice sense I was young",
             "This was originally made in 1960.",
             "This reminds me of old times."]
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

I remember how we used to dance to his songs - not nostalgia
This is not good. - not nostalgia
I loved his voice sense I was young - not nostalgia
This was originally made in 1960. - not nostalgia
This reminds me of old times. - not nostalgia


In [20]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [21]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [22]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [23]:
# hyperparameters
lr = 1e-3
batch_size = 10
num_epochs = 15

In [24]:
training_args = TrainingArguments(
    output_dir= "/content/drive/MyDrive/models/FineTuned/" + "lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [25]:
df['input_ids'][0]

array([[  101,  2002,  2001,  1037,  3220,  2007,  1037,  3585,  2376,
         2008,  1045,  2293,  2000,  2963,  2035,  1996,  2051,  1012,
         1045,  2001,  2010,  2307,  5470,  2012,  1996,  2287,  1997,
         2385, 29100,  2015,  1999,  2216,  2420,  1998,  2145,  2085,
         1012,  2348,  2057,  2031,  2116,  8453,  2085,  1010,  2021,
         1010,  1045,  2064, 29536, 10875,  2005,  3958, 17891,  2035,
         1996,  2051,  1012,  2017,  2514,  8363,  1010,  6832,  1998,
         8295,  1012,  4067,  2017,  2935,  2005,  2010,  2166,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0]])

In [26]:
df['input_ids'] = df['input_ids'].apply(lambda x: x[0].tolist())
df['attention_mask'] = df['attention_mask'].apply(lambda x: x[0].tolist())
hf_dataset = Dataset.from_pandas(df)

train_test_split = hf_dataset.train_test_split(test_size=0.2)
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [27]:
import torch

# Assuming you have the train_dataset and tokenizer
sample = split_dataset['train'][0]

# Extract the input_ids and attention_mask
input_ids = torch.tensor([sample['input_ids']], dtype=torch.long)
attention_mask = torch.tensor([sample['attention_mask']], dtype=torch.float)

# Print the shapes
print(f"input_ids shape: {input_ids.shape}")
print(f"attention_mask shape: {attention_mask.shape}")

# Print the data types
print(f"input_ids dtype: {input_ids.dtype}")
print(f"attention_mask dtype: {attention_mask.dtype}")

input_ids shape: torch.Size([1, 120])
attention_mask shape: torch.Size([1, 120])
input_ids dtype: torch.int64
attention_mask dtype: torch.float32


In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.310498,{'accuracy': 0.9}
2,No log,0.252059,{'accuracy': 0.9066666666666666}
3,No log,0.321574,{'accuracy': 0.9166666666666666}
4,No log,0.371463,{'accuracy': 0.9133333333333333}
5,0.170800,0.370481,{'accuracy': 0.9266666666666666}
6,0.170800,0.406111,{'accuracy': 0.9266666666666666}
7,0.170800,0.435625,{'accuracy': 0.9266666666666666}
8,0.170800,0.488439,{'accuracy': 0.9266666666666666}
9,0.022100,0.506325,{'accuracy': 0.9266666666666666}
10,0.022100,0.495552,{'accuracy': 0.9233333333333333}


Trainer is attempting to log a value of "{'accuracy': 0.9}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9066666666666666}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9166666666666666}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9133333333333333}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9266666666666666}" of type <class '

TrainOutput(global_step=1800, training_loss=0.05547500809033712, metrics={'train_runtime': 158.0551, 'train_samples_per_second': 113.884, 'train_steps_per_second': 11.388, 'total_flos': 566998600320000.0, 'train_loss': 0.05547500809033712, 'epoch': 15.0})

In [32]:
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the appropriate device
model.to(device)

print("Trained model predictions:")
print("----------------------------")
for text in text_list:
    # Tokenize text and move to the same device
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)  # Move inputs to the same device

    # Compute logits
    logits = model(inputs).logits

    # Convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.item()])  # Use .item() to get the value from the tensor

Trained model predictions:
----------------------------
I remember how we used to dance to his songs - nostalgia
This is not good. - not nostalgia
I loved his voice sense I was young - nostalgia
This was originally made in 1960. - not nostalgia
This reminds me of old times. - nostalgia
