In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load dataset
IMDB = pd.read_csv("/Users/fariddamania/Downloads/SEM VII/SNLP/Dataset/IMDB Dataset.csv")
IMDB.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
id_to_label = {0: "negative", 1: "positive"}
label_to_id = {label: id_ for id_, label in id_to_label.items()}

IMDB["label"] = IMDB["sentiment"].map(label_to_id)

In [5]:
print(IMDB.shape)
IMDB.head()

(50000, 3)


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [6]:
model_config = {
    "max_length": 360,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    "output_dir": "./my_model_output",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 3,
    "debug": True,
}

In [7]:
import transformers

tokenizer_model = transformers.AutoTokenizer.from_pretrained(model_config["model_path"])

In [8]:
from sklearn import model_selection

train_data, valid_data = model_selection.train_test_split(
    IMDB,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=IMDB["label"]
)

In [9]:
import torch

class SentimentDataset:
    
    def __init__(self, dataset, tokenizer, config):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = config["max_length"]
        
    def __len__(self):
        return self.dataset.shape[0]
    
    def __getitem__(self, index):
        row = self.dataset.iloc[index]
        
        encoded = self.tokenizer(
            row["review"],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoded["input_ids"].squeeze(0), 
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["label"], dtype=torch.long)
        }

In [10]:
train_dataset = SentimentDataset(train_data, tokenizer_model, model_config)
valid_dataset = SentimentDataset(valid_data, tokenizer_model, model_config)

In [11]:
valid_dataset[0]

{'input_ids': tensor([  101,  1045,  2387,  1996,  3185,  2044,  9361,  2049,  5790,  2006,
         10047, 18939,  1012,  2067,  2059,  1010,  2009,  2001,  2012,  1022,
          1012,  1014,  1998,  1045,  2245,  1010,  1000, 10166,   999,  2008,
          2442,  2022,  1037,  2204,  2028,  1000,  1012,  1045,  2245,  3308,
          1012,  1996,  2927,  1997,  1996,  3185,  2941,  7906,  2054,  1996,
          5436, 10659,  1010,  2021,  2059,  2009,  3632, 27258,  2135,  2091,
          7650,  2049,  8102,  1012,  1045,  2228,  2302,  1996,  2839,  1997,
         14411, 23330,  1010,  2009,  2453,  2031,  2042, 10303,  1011,  2348,
          2002,  2003,  1996,  3114,  1996,  2466,  2240,  3138,  1996,  2607,
          2009,  2515,  1012,  1996,  2839,  2003,  2074,  2205,  6034,  2005,
          2026, 16663,  1010,  1998,  2524,  2000, 18094,  1012,  2036,  1010,
          1996,  4990,  1996,  2364,  2839,  3138,  2013,  1996,  2927,  1997,
          1996,  2466,  6229,  2049,  1

In [21]:
classification_model = transformers.AutoModelForSequenceClassification.from_pretrained(model_config['model_path'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from sklearn import metrics

def evaluate_metrics(eval_result):
    logits, true_labels = eval_result
    predictions = np.argmax(logits, axis=-1)
    
    return {
        "f1": metrics.f1_score(true_labels, predictions, average="weighted")
    }

In [23]:
training_args = transformers.TrainingArguments(
    output_dir=model_config["output_dir"],
    per_device_train_batch_size=model_config["train_batch_size"],
    per_device_eval_batch_size=model_config["valid_batch_size"],
    learning_rate=model_config["learning_rate"],
    num_train_epochs=model_config["epochs"],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

In [24]:
trainer = transformers.Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer_model,
    compute_metrics=evaluate_metrics,
)

In [25]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1875 [00:00<?, ?it/s]

{'loss': 0.6809, 'grad_norm': 0.5785415172576904, 'learning_rate': 2.9840000000000002e-05, 'epoch': 0.02}
{'loss': 0.6302, 'grad_norm': 0.9751928448677063, 'learning_rate': 2.968e-05, 'epoch': 0.03}
{'loss': 0.5626, 'grad_norm': 2.5507569313049316, 'learning_rate': 2.9520000000000002e-05, 'epoch': 0.05}
{'loss': 0.5189, 'grad_norm': 3.1713480949401855, 'learning_rate': 2.936e-05, 'epoch': 0.06}
{'loss': 0.4882, 'grad_norm': 2.110952138900757, 'learning_rate': 2.92e-05, 'epoch': 0.08}
{'loss': 0.4429, 'grad_norm': 5.068164825439453, 'learning_rate': 2.904e-05, 'epoch': 0.1}
{'loss': 0.4336, 'grad_norm': 2.3121767044067383, 'learning_rate': 2.888e-05, 'epoch': 0.11}
{'loss': 0.4261, 'grad_norm': 2.1600167751312256, 'learning_rate': 2.8720000000000003e-05, 'epoch': 0.13}
{'loss': 0.3866, 'grad_norm': 8.545737266540527, 'learning_rate': 2.856e-05, 'epoch': 0.14}
{'loss': 0.4149, 'grad_norm': 5.2312164306640625, 'learning_rate': 2.84e-05, 'epoch': 0.16}
{'loss': 0.4013, 'grad_norm': 4.44275

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.23756061494350433, 'eval_f1': 0.9057680968468685, 'eval_runtime': 56.4564, 'eval_samples_per_second': 177.128, 'eval_steps_per_second': 2.781, 'epoch': 1.0}
{'loss': 0.2758, 'grad_norm': 3.2346560955047607, 'learning_rate': 1.9920000000000002e-05, 'epoch': 1.01}
{'loss': 0.237, 'grad_norm': 6.097232818603516, 'learning_rate': 1.976e-05, 'epoch': 1.02}
{'loss': 0.2652, 'grad_norm': 5.409407615661621, 'learning_rate': 1.96e-05, 'epoch': 1.04}
{'loss': 0.2417, 'grad_norm': 2.236219882965088, 'learning_rate': 1.944e-05, 'epoch': 1.06}
{'loss': 0.2025, 'grad_norm': 3.221360683441162, 'learning_rate': 1.9280000000000002e-05, 'epoch': 1.07}
{'loss': 0.2284, 'grad_norm': 2.56514573097229, 'learning_rate': 1.912e-05, 'epoch': 1.09}
{'loss': 0.2308, 'grad_norm': 5.76525354385376, 'learning_rate': 1.896e-05, 'epoch': 1.1}
{'loss': 0.247, 'grad_norm': 7.27003812789917, 'learning_rate': 1.8800000000000003e-05, 'epoch': 1.12}
{'loss': 0.2791, 'grad_norm': 3.645059823989868, 'learning

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.22694045305252075, 'eval_f1': 0.9149914991499151, 'eval_runtime': 56.4748, 'eval_samples_per_second': 177.07, 'eval_steps_per_second': 2.78, 'epoch': 2.0}
{'loss': 0.2377, 'grad_norm': 4.046150207519531, 'learning_rate': 9.84e-06, 'epoch': 2.02}
{'loss': 0.2081, 'grad_norm': 2.6765658855438232, 'learning_rate': 9.68e-06, 'epoch': 2.03}
{'loss': 0.177, 'grad_norm': 3.790120840072632, 'learning_rate': 9.52e-06, 'epoch': 2.05}
{'loss': 0.1886, 'grad_norm': 2.427640676498413, 'learning_rate': 9.36e-06, 'epoch': 2.06}
{'loss': 0.2241, 'grad_norm': 2.901737928390503, 'learning_rate': 9.2e-06, 'epoch': 2.08}
{'loss': 0.2713, 'grad_norm': 3.3363330364227295, 'learning_rate': 9.04e-06, 'epoch': 2.1}
{'loss': 0.2497, 'grad_norm': 4.722509384155273, 'learning_rate': 8.88e-06, 'epoch': 2.11}
{'loss': 0.1962, 'grad_norm': 3.5236868858337402, 'learning_rate': 8.720000000000001e-06, 'epoch': 2.13}
{'loss': 0.2056, 'grad_norm': 4.68654203414917, 'learning_rate': 8.56e-06, 'epoch': 2.14

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.22296598553657532, 'eval_f1': 0.9182987573240989, 'eval_runtime': 56.5586, 'eval_samples_per_second': 176.808, 'eval_steps_per_second': 2.776, 'epoch': 3.0}
{'train_runtime': 2397.8584, 'train_samples_per_second': 50.045, 'train_steps_per_second': 0.782, 'train_loss': 0.25412537110646566, 'epoch': 3.0}


TrainOutput(global_step=1875, training_loss=0.25412537110646566, metrics={'train_runtime': 2397.8584, 'train_samples_per_second': 50.045, 'train_steps_per_second': 0.782, 'total_flos': 1245553977600000.0, 'train_loss': 0.25412537110646566, 'epoch': 3.0})

In [26]:
device_id = 0 if torch.cuda.is_available() else -1

In [27]:
text_pipe = transformers.pipeline(
    task='text-classification',
    model=classification_model,
    tokenizer=tokenizer_model,
    batch_size=4,
    device=device_id
)

In [28]:
class AltTextDataset:
    
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        encoded_data = tokenizer_model(
            row["text"],
            max_length=10,
            truncation=True,
            padding="max_length"
        )
        
        return {
            "input_ids": torch.tensor(encoded_data["input_ids"]),
            "attention_mask": torch.tensor(encoded_data["attention_mask"]),
        }

In [29]:
IMDB = pd.read_csv("/Users/fariddamania/Downloads/SEM VII/SNLP/Dataset/IMDB Dataset.csv").rename(columns={"review": "text"})
IMDB["label"] = IMDB["sentiment"].map(label_to_id)

In [36]:
checkpoint_path = "/Users/fariddamania/Downloads/SEM VII/SNLP/Finetuning Transformers/my_model_output/checkpoint-1875"
tokenizer_model = transformers.AutoTokenizer.from_pretrained(checkpoint_path)
classification_model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [37]:
dataset_instance = AltTextDataset(IMDB)
data_loader = torch.utils.data.DataLoader(
    dataset_instance,
    batch_size=2,
    shuffle=False,
    num_workers=2,
)

In [2]:
!pip3 install torch torchvision torchaudio



In [4]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.5989, 0.3815, 0.6990],
        [0.5309, 0.1359, 0.7691],
        [0.1670, 0.5767, 0.9900],
        [0.9364, 0.3018, 0.9668],
        [0.8974, 0.0624, 0.4086]])


In [6]:
import torch
print(torch.cuda.is_available())

False


In [8]:
!pip3 install torch torchvision torchaudio



In [3]:
import torch
device = torch.device("mps")  # Use the M1 GPU
classification_model.to(device)

NameError: name 'classification_model' is not defined

In [None]:
sample_context = "My name is Farid Damania. I was born in Daman."
sample_question = "Where was Farid born?"

In [None]:
sample_context = "My name is Farid Damania. I was born in Daman."
sample_question = "Where was Farid born?"

In [5]:
classification_model.to("cuda")

NameError: name 'classification_model' is not defined

In [43]:
for index, data_batch in enumerate(data_loader):
    print(data_batch)
    
    # Move data to CUDA
    data_batch = {key: value.to('cuda') for key, value in data_batch.items()}
    
    with torch.no_grad():
        output = classification_model(input_ids=data_batch["input_ids"], attention_mask=data_batch["attention_mask"])
    
    if index == 5:
        break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'AltTextDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [None]:
output

In [None]:
text_pipe = transformers.pipeline(
    "text-classification",
    model=checkpoint_path,
    batch_size=4,
    device=0
)

In [None]:
text_pipe(["I hated how bad the movie was."] * 10)