## Imports

In [16]:
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
# !pip install peft==0.10.0

Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [2]:
# !pip uninstall -y transformers
# !pip uninstall -y tokenizers
# !pip install transformers==4.40.1 --no-cache-dir

Found existing installation: transformers 4.40.1
Uninstalling transformers-4.40.1:
  Successfully uninstalled transformers-4.40.1
Found existing installation: tokenizers 0.19.1
Uninstalling tokenizers-0.19.1:
  Successfully uninstalled tokenizers-0.19.1
Collecting transformers==4.40.1
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m116.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB

In [2]:
import transformers
transformers.__version__

'4.40.1'

## Data loading

In [3]:
df = pd.read_csv("movie_reviews.csv")
df.head()

Unnamed: 0,text,label
0,If you havent seen this movie than you need to...,1
1,but Cinderella gets my vote not only for the w...,0
2,This movie is pretty cheesy but I do give it c...,1
3,I have not seen a Van Damme flick for a while ...,1
4,This is a sleeper It defines Nicholas Cage The...,1


In [4]:
dataset = Dataset.from_pandas(df)

## Pretrained distilbert

In [21]:
model_name = "sarahai/movie-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)



## Train params

In [13]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

In [22]:
from datasets import load_metric

def compute_metrics(eval_pred):
    metric = load_metric('accuracy')
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)

In [23]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=3,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2198,0.189889,0.933
2,0.1537,0.225419,0.93225
3,0.0809,0.290452,0.92875
4,0.0497,0.318009,0.93425
5,0.0273,0.3677,0.9355


  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

TrainOutput(global_step=11250, training_loss=0.11268526708814833, metrics={'train_runtime': 8859.4359, 'train_samples_per_second': 20.317, 'train_steps_per_second': 1.27, 'total_flos': 2.384413175808e+16, 'train_loss': 0.11268526708814833, 'epoch': 5.0})

In [25]:
trainer.evaluate()

{'eval_loss': 0.3677004873752594,
 'eval_accuracy': 0.9355,
 'eval_runtime': 70.4008,
 'eval_samples_per_second': 56.818,
 'eval_steps_per_second': 0.895,
 'epoch': 5.0}

## Submit

In [26]:
test = pd.read_csv("test_data.csv")
test.head()

Unnamed: 0,Id,text
0,0,What can possibly said about this movie other ...
1,1,I dont care how many bad reviews purple rain g...
2,2,Ken Russell directed this weird Not very ero...
3,3,This is a great movie from the lost age of rea...
4,4,I have a problem with the movie snobs who cons...


In [27]:
test.shape

(10000, 2)

In [28]:
test.isna().sum()

Unnamed: 0,0
Id,0
text,0


In [34]:
test_dataset = Dataset.from_dict({"text": test["text"].tolist()})
test_dataset

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [35]:
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [36]:
results = trainer.predict(test_dataset)

In [37]:
results[:5]

(array([[ 4.436788 , -4.6799355],
        [-4.040261 ,  3.8046088],
        [ 4.1065927, -4.3822975],
        ...,
        [-4.1520042,  3.9233365],
        [-3.7942557,  3.5335526],
        [ 4.2339587, -4.4829493]], dtype=float32),
 None,
 {'test_runtime': 175.4049,
  'test_samples_per_second': 57.011,
  'test_steps_per_second': 0.895})

In [39]:
results.predictions.shape

(10000, 2)

In [41]:
import numpy as np

pd.DataFrame({
    "Id": list(range(len(results.predictions))),
    "Category": np.argmax(results.predictions, axis=1)
}).to_csv("distilbert_finetuned.csv", index=False)