In [1]:
from datasets import Dataset

In [2]:
import pandas as pd

In [4]:
# Load the review corpus (first CSV column becomes the index) and coerce the
# `reviews` column to str as part of the same expression.
df = pd.read_csv('data/small_corpus_neural.csv', index_col=0).astype({'reviews': str})

In [5]:
def score_to_Target(value):
    """Map a numeric rating to a three-way class label.

    Args:
        value: Numeric rating (int or float).

    Returns:
        2 if ``value >= 5``, 1 if ``2 <= value < 5``, otherwise 0.
    """
    if value >= 5:
        return 2
    # Contiguous range: the former `<= 4` upper bound left a gap, so fractional
    # ratings such as 4.5 fell through to class 0. Integer ratings are unaffected.
    if value >= 2:
        return 1
    return 0

In [6]:
# Derive the class-label column by applying the rating-to-class mapping to
# every value in `ratings`.
df['rating_class'] = df['ratings'].apply(score_to_Target)

In [7]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the row order is reproducible after a
# kernel restart (an unseeded shuffle yields a different order on every run).
df = shuffle(df, random_state=42)

In [8]:
from sklearn.model_selection import train_test_split
# Split the frame into a training part and a held-out part, stratified on the
# class label and with a fixed seed. No test_size is passed, so the library's
# default split ratio applies.
train_df, test_df = train_test_split(df,
                                     stratify=df["rating_class"],
                                     random_state=42)

In [11]:
from transformers import DistilBertTokenizerFast

In [12]:
# Load the pretrained fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
# Pull the review strings out of each split as plain Python lists
# (the held-out frame serves as the validation set from here on).
train_text = train_df['reviews'].tolist()
val_text = test_df['reviews'].tolist()

In [14]:
# Tokenize both splits up front, with truncation and padding enabled.
# NOTE(review): padding is applied per tokenizer call rather than per batch —
# presumably every example is padded to the longest sequence in its split;
# confirm if GPU memory is tight.
train_encodings = tokenizer(train_text, truncation=True, padding=True)
val_encodings = tokenizer(val_text, truncation=True, padding=True)

In [15]:
# Class labels for each split as plain lists, taken from the same frames as
# the text lists so each label lines up with its review.
train_labels = list(train_df['rating_class'])
val_labels = list(test_df['rating_class'])

In [16]:
import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
class SentimentAnalyserDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their class labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [18]:
# Wrap the encodings and labels of each split in the dataset class so they can
# be handed to the Trainer.
train_dataset = SentimentAnalyserDataset(train_encodings, train_labels)
val_dataset = SentimentAnalyserDataset(val_encodings, val_labels)

In [19]:
# Sanity check: number of examples in the validation dataset.
len(val_dataset)

11250

In [20]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [21]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [22]:
# Training configuration shared by the Trainer runs below.
batch_size = 16
# Log roughly once per epoch; clamp to 1 so a dataset smaller than one batch
# cannot produce logging_steps == 0.
logging_steps = max(1, len(train_dataset) // batch_size)
# load_best_model_at_end needs a checkpoint at every evaluation to restore the
# best model (selected by F1), so the save strategy is set to match the
# per-epoch evaluation strategy.
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,)

In [23]:
from transformers import AutoModelForSequenceClassification

In [24]:
# Load the pretrained 'distilbert-base-uncased' checkpoint as a sequence
# classifier with 3 output labels and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [25]:
from transformers import Trainer

# Fine-tune the model on the full training set, evaluating on the validation
# set with the metrics from compute_metrics. The trailing ';' suppresses the
# return value of train() in the cell output.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset)
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6188,0.558466,0.754933,0.743397
2,0.4694,0.561262,0.762578,0.759666


In [26]:
# Evaluate the trained model on the trainer's eval dataset and display the
# resulting metrics dict.
results = trainer.evaluate()
results

{'eval_loss': 0.561262309551239,
 'eval_accuracy': 0.7625777777777778,
 'eval_f1': 0.7596657643328498,
 'eval_runtime': 81.6405,
 'eval_samples_per_second': 137.799,
 'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': 65536,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 579226112}

In [27]:
def model_init():
    """Return a newly loaded 'distilbert-base-uncased' classifier with 3 labels.

    Passed to the Trainer as ``model_init`` in place of a ready-made model
    instance.
    """
    checkpoint = 'distilbert-base-uncased'
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [31]:
# Number of validation examples left over after holding out 200 of them.
len(val_dataset)-200

11050

In [32]:
# Draw small random subsets (1000 train / 200 validation examples) for the
# hyperparameter search. The remainder sizes are derived from the dataset
# lengths instead of being hard-coded, so the split keeps working if the corpus
# size changes, and a seeded generator makes the subsets reproducible.
_split_rng = torch.Generator().manual_seed(42)
train_subset, _ = torch.utils.data.random_split(
    train_dataset, [1000, len(train_dataset) - 1000], generator=_split_rng)
val_subset, _ = torch.utils.data.random_split(
    val_dataset, [200, len(val_dataset) - 200], generator=_split_rng)

In [35]:
import gc

# empty_cache() can only hand back cached blocks that no live tensor owns, so
# first drop the notebook-level references to the previous trainer and model
# (pop() keeps this cell safe to re-run), collect garbage, then empty the cache.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

In [36]:
# Hyperparameter search on the small subsets; model_init supplies a fresh model
# for every trial.
trainer = Trainer(model_init=model_init, args=training_args,
                  compute_metrics=compute_metrics, train_dataset=train_subset,
                  eval_dataset=val_subset)
# State the objective explicitly: maximize the evaluation F1, consistent with
# metric_for_best_model="f1", instead of relying on the default objective
# computed from all reported metrics.
best_run = trainer.hyperparameter_search(
    n_trials=3,
    direction="maximize",
    compute_objective=lambda metrics: metrics["eval_f1"],
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 15.75 GiB total capacity; 14.82 GiB already allocated; 35.31 MiB free; 14.92 GiB reserved in total by PyTorch)

In [33]:
# Copy every hyperparameter of the best search run onto the trainer's arguments.
# NOTE(review): this cell's execution count (33) precedes the search cell above
# (36), whose recorded run failed — presumably `best_run` comes from an earlier
# successful search; confirm by re-running the notebook top to bottom.
for key, value in best_run.hyperparameters.items():
    setattr(trainer.args, key, value)

# Switch from the search subsets back to the full datasets and retrain.
trainer.train_dataset = train_dataset
trainer.eval_dataset = val_dataset
trainer.train();

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6979,0.641599,0.735111,0.733305
2,0.576,0.628538,0.724444,0.720287
3,0.4694,0.63902,0.734222,0.734599
4,0.3561,0.673149,0.736889,0.734568
5,0.2974,0.686491,0.736889,0.734662
