In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Install the fine-tuning stack. %pip (rather than !pip) installs into the
# environment of the running kernel instead of whatever `pip` the shell resolves.
%pip install -U transformers==4.37.0 evaluate accelerate==0.26.0 peft==0.6.0 trl==0.7.4



In [3]:
import torch

# Seed PyTorch's RNG and switch on cuDNN's deterministic flag.
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

cuda


In [4]:
import requests
import shutil
import gzip
import pandas as pd

# Download the gzipped review CSV and unpack it next to the notebook.
url = ("https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz")
filename = url.split("/")[-1]

# Fetch before opening the output file, and fail loudly on an HTTP error so an
# error page is never written to disk as if it were the archive. The timeout
# keeps a stalled connection from hanging the cell indefinitely.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the file that was just written (reuse `filename` rather than a
# second hard-coded copy of the archive name). The output name stays
# 'movie_data.csv' because later cells read that exact path.
with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [23]:
from datasets import load_dataset
# Reader settings shared by every split: same file, comma separator, skip the
# file's first row and name the two columns explicitly.
csv_kwargs = dict(data_files="movie_data.csv", sep=",", names=["review", "label"], skiprows=1)

# 70% / 15% / 15% contiguous slices of the single 'train' split of the CSV.
movie_train_ds = load_dataset("csv", split='train[:70%]', **csv_kwargs)
movie_valid_ds = load_dataset("csv", split='train[70%:85%]', **csv_kwargs)
movie_test_ds = load_dataset("csv", split='train[85%:]', **csv_kwargs)

for ds in (movie_train_ds, movie_valid_ds):
    print(ds)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['review', 'label'],
    num_rows: 35000
})
Dataset({
    features: ['review', 'label'],
    num_rows: 7500
})


In [25]:
# Slice the first two rows and then pick the column, so only those rows are
# read rather than the whole label column being pulled out and then sliced.
movie_train_ds[:2]['label']

[1, 0]

In [7]:
# train_texts = df.iloc[:35000]['review'].values
# train_labels = df.iloc[:35000]['sentiment'].values
# valid_texts = df.iloc[35000:40000]['review'].values
# valid_labels = df.iloc[35000:40000]['sentiment'].values
# test_texts = df.iloc[40000:]['review'].values
# test_labels = df.iloc[40000:]['sentiment'].values
# print(train_texts.shape)
# print(train_labels.shape)

In [26]:
import tokenizers
print(tokenizers.__version__)
import transformers
print(transformers.__version__)


from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the model (plain string: there is
# nothing to interpolate, so no f-prefix).
MODEL = "distilbert-base-uncased"
# `token` is the current name of the deprecated `use_auth_token` argument;
# False keeps the original behaviour of not sending a stored Hub token.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False, token=False)

0.15.2
4.37.0




In [27]:
def tokenize(batch):
    """Tokenize the ``review`` entries of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"review"`` key holding the text(s) to encode.

    Returns:
        The tokenizer's output for those reviews, produced with padding and
        truncation enabled.
    """
    reviews = batch["review"]
    encoded = tokenizer(reviews, padding=True, truncation=True)
    return encoded

# Run `tokenize` over the train, validation and test splits (in that order)
# in batched mode; batch_size=None is passed straight through to Dataset.map.
movie_train_tokenized, movie_valid_tokenized, movie_test_tokenized = (
    split.map(tokenize, batched=True, batch_size=None)
    for split in (movie_train_ds, movie_valid_ds, movie_test_ds)
)

print(movie_train_tokenized)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 35000
})


In [10]:
# class IMDbDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx], device=DEVICE) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx], device=DEVICE)
#         return item
#     def __len__(self):
#         return len(self.labels)

# train_dataset = IMDbDataset(train_encodings, train_labels)
# valid_dataset = IMDbDataset(valid_encodings, valid_labels)
# # test_dataset = IMDbDataset(test_encodings, test_labels)
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True) 
# valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False) 
# # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [28]:
from transformers import AutoModelForSequenceClassification

# Load the MODEL checkpoint with a two-label classification head, then move it
# to the selected device before printing its module structure.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
model = model.to(DEVICE)
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [29]:
import evaluate
import numpy as np
# Accuracy scorer loaded once and reused by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions against reference labels with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores; the
            arg-max over the last axis is taken as the predicted class) and
            ``label_ids`` (reference labels).

    Returns:
        Whatever ``metric.compute`` returns for those predictions and references.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=eval_pred.label_ids)

In [30]:
# Debug peek: print the first three tokenized rows as (column name, values) pairs.
print(movie_train_tokenized[:3].items())

dict_items([('review', ['In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The

In [34]:
# Expose only the model inputs and the label, as torch tensors. The test split
# is formatted as well so that all three datasets handed to the Trainer
# (train/eval here, test in the later predict call) share one format.
TORCH_COLUMNS = ["input_ids", "attention_mask", "label"]
for tokenized_split in (movie_train_tokenized, movie_valid_tokenized, movie_test_tokenized):
    tokenized_split.set_format(type="torch", columns=TORCH_COLUMNS)

In [36]:
from transformers import Trainer, TrainingArguments

batch_size = 16
# Log once per "number of full batches in the training set"; clamp to 1 so a
# dataset smaller than one batch cannot produce logging_steps=0.
logging_steps = max(1, len(movie_train_tokenized) // batch_size)
model_name = f"{MODEL}-finetuned-movie-sentiment"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    # Pass the notebook's seed explicitly: otherwise the Trainer reseeds with
    # its own default when constructed and RANDOM_SEED has no effect on training.
    seed=RANDOM_SEED,
    log_level="error",
    report_to="none",
)

# Wire the model, datasets, tokenizer and accuracy callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=movie_train_tokenized,
    eval_dataset=movie_valid_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [37]:
# Run fine-tuning with the Trainer configured above.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# recorded run did not finish — re-run to completion before relying on the
# test metrics reported afterwards.
trainer.train()

KeyboardInterrupt: 

In [38]:
# Run the trainer over the tokenized test split and print the metrics it reports.
preds_output = trainer.predict(movie_test_tokenized)
test_metrics = preds_output.metrics
print(test_metrics)

{'test_loss': 0.20976227521896362, 'test_accuracy': 0.9208, 'test_runtime': 70.2644, 'test_samples_per_second': 106.74, 'test_steps_per_second': 6.675}
