# Using HuggingFace Transformers for Sentiment Analysis
## Pre-Trained vs Fine-Tuned Models

### 1. Dataset

#### Load and clean up the dataset

In [2]:
%%time
import numpy as np
import pandas as pd

data = pd.read_csv('/kaggle/input/phonereviews/data.csv')
data = data.dropna()
data_filtered = data.loc[data['Rating'].isin({1, 2, 4, 5})]
df = data_filtered
df = df.reset_index(drop=True)
df.loc[(df['Rating'] >= 4), 'Sentiment'] = 'positive'
df.loc[(df['Rating'] <= 2), 'Sentiment'] = 'negative'
df = df.loc[df['Review'].apply(lambda text: len(text) >= 20)]
df['Sentiment'].value_counts()

CPU times: user 4.63 s, sys: 510 ms, total: 5.14 s
Wall time: 8.41 s


positive    326183
negative     67920
Name: Sentiment, dtype: int64

#### Split into train and test sets

In [3]:
from sklearn.model_selection import train_test_split

texts = df['Review'].values
labels = df['Sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=1,
)
print(f'len(X_train) = {len(X_train)}')
print(f'len(y_train) = {len(y_train)}')
print(f'len(X_test) = {len(X_test)}')
print(f'len(y_test) = {len(y_test)}')

len(X_train) = 354692
len(y_train) = 354692
len(X_test) = 39411
len(y_test) = 39411


### 2. Use a pre-trained HuggingFace Transformer model

The model: https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced  
It has 12M parameters.

#### First, prepare the tokenizer and the model

In [4]:
!pip install transformers sentencepiece --quiet

[0m

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = 'cointegrated/rubert-tiny-sentiment-balanced'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model_pretrained = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model_pretrained.cuda()

Downloading:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/884 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.0M [00:00<?, ?B/s]

#### Define a model-independent function that will return a prediction for a single text

In [6]:
from typing_extensions import Literal  
# from typing import Literal  # in Python 3.8+

def predict(
    model: AutoModelForSequenceClassification, 
    text: str, 
    return_type: Literal['label', 'score', 'proba'] = 'label',
):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
    if return_type == 'label':
        return model.config.id2label[proba.argmax()]
    elif return_type == 'score':
        return proba.dot([-1, 0, 1])
    return proba

#### Run a couple of simple tests

In [7]:
text = 'Какая гадость эта ваша заливная рыба!'
# classify the text
print(predict(model_pretrained, text, 'label'))  # negative
# score the text on the scale from -1 (very negative) to +1 (very positive)
print(predict(model_pretrained, text, 'score'))  # -0.5894946306943893
# calculate probabilities of all labels
print(predict(model_pretrained, text, 'proba'))  # [0.7870447  0.4947824  0.19755007]

negative
-0.5894946306943893
[0.7870447  0.4947824  0.19755007]


In [8]:
examples = [
    'Отличный телефон - сколько пользуюсь, столько и радуюсь',
    'Ужасный телефон, хуже некуда!',
    'Сегодня отличная погода!',
    'У Васи ужасное настроение.',
    'Эта модель основана на трансформерах.'
]
for example in examples:
    print(predict(model_pretrained, example))

positive
negative
positive
negative
neutral


#### Define a model-independent function for evaluation on the test set

In [11]:
from typing import Optional

from sklearn.metrics import classification_report
from tqdm.auto import tqdm

def evaluate_model(
    model: AutoModelForSequenceClassification, 
    subset: Optional[int] = None,
):
    y_pred = []
    
    if subset is None:
        subset = X_test.shape[0]
    
    for x in tqdm(X_test[:subset]):
        prediction = predict(model, x)
        y_pred.append(prediction)
        
    print(classification_report(y_test[:subset], y_pred))

#### Evaluate the pre-trained model on the test set

In [12]:
%%time

evaluate_model(model_pretrained)  # takes about 15-18 min on CPU, 2 min on T100 GPU

  0%|          | 0/39411 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.40      0.55      0.46      6742
     neutral       0.00      0.00      0.00         0
    positive       0.96      0.45      0.62     32669

    accuracy                           0.47     39411
   macro avg       0.45      0.34      0.36     39411
weighted avg       0.86      0.47      0.59     39411

CPU times: user 2min 1s, sys: 471 ms, total: 2min 1s
Wall time: 2min 1s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
(0.46 + 0.62) / 2

0.54

The results are not so good, because the model was trained on other domains than mobile phones.

### 3. Fine-tune the pre-trained model on our dataset

#### Split the training data into train and dev

In [14]:
from datasets import Dataset, DatasetDict

n_train_examples = 10000
n_dev_examples = 5000
train_df = pd.DataFrame(
    {
        'text': X_train[:n_train_examples],
        'label': y_train[:n_train_examples],
    }
)
dev_df = pd.DataFrame(
    {
        'text': X_train[n_train_examples:n_train_examples + n_dev_examples],
        'label': y_train[n_train_examples:n_train_examples + n_dev_examples],
    }
)

data = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df[['text', 'label']].reset_index(drop=True)),
        'dev': Dataset.from_pandas(dev_df[['text', 'label']].reset_index(drop=True)),
    }
)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

#### Tokenize the data

In [15]:
all_labels = ['negative', 'neutral', 'positive']
data_tokenized = data.map(
    lambda row: tokenizer(row['text'], truncation=True), batched=True, remove_columns=['text']
)
data_tokenized = data_tokenized.map(
    lambda row: {'label': [all_labels.index(label) for label in row['label']]}, batched=True
)
data_tokenized

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    dev: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

#### Prepare for training

In [16]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m892.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [17]:
import os

os.environ["WANDB_DISABLED"] = "true"

In [18]:
import evaluate
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

data_collator = DataCollatorWithPadding(tokenizer)

training_args = TrainingArguments(
    output_dir='test_trainer', 
    evaluation_strategy='epoch',
)
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, 
        references=labels,
    )

trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=data_tokenized['train'],
    eval_dataset=data_tokenized['dev'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

#### Train and save model

In [19]:
trainer.train()  # GPU recommended

***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3750


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3167,0.312359,0.8802
2,0.2587,0.413976,0.88
3,0.1742,0.491137,0.8828


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/p

TrainOutput(global_step=3750, training_loss=0.25770050557454427, metrics={'train_runtime': 164.9778, 'train_samples_per_second': 181.843, 'train_steps_per_second': 22.73, 'total_flos': 152357844439008.0, 'train_loss': 0.25770050557454427, 'epoch': 3.0})

In [20]:
trainer.save_model('my_finetuned_model')

Saving model checkpoint to my_finetuned_model
Configuration saved in my_finetuned_model/config.json
Model weights saved in my_finetuned_model/pytorch_model.bin


#### Load the fine-tuned model

In [21]:
# checkpoint = 'test_trainer/checkpoint-3000'  # 500, 1000, 1500, 2000, 3000
checkpoint = 'my_finetuned_model'
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-sentiment-balanced')
model_finetuned = AutoModelForSequenceClassification.from_pretrained(checkpoint)
if torch.cuda.is_available():
    model_finetuned.cuda()

loading file https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/92c2b9e2fa0ff2385ddcfaa42bc3c80da2b518d8c1d06f818f81606932015085.77a9cd5f52c58bd231a1d3bc7390917dc6d0fadc0f17cee179994e8dfe382aba
loading file https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/beec8a371470ceff29166c53d40c56610e7e250be63a650526981481d04346e2.4fda28baf56969bde43ae6200f8299c1370bfd2992311ef8f737e7d619d8b5ed
loading file https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/4e57ac0d61c767e7d5db0f3b9e68f622ddedacc5730d4d01d5079eb011ff67ef.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd806

In [22]:
examples = [
    'Отличный телефон - сколько пользуюсь, столько и радуюсь',
    'Ужасный телефон, хуже некуда!',
    'Сегодня отличная погода!',
    'У Васи ужасное настроение.',
    'Эта модель основана на трансформерах.'
]
for example in examples:
    print(predict(model_finetuned, example))

positive
negative
positive
negative
positive


#### Evaluation

In [23]:
print(checkpoint)
evaluate_model(model_finetuned)  # GPU recommended

my_finetuned_model


  0%|          | 0/39411 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.69      0.62      0.65      6742
     neutral       0.00      0.00      0.00         0
    positive       0.92      0.94      0.93     32669

    accuracy                           0.89     39411
   macro avg       0.54      0.52      0.53     39411
weighted avg       0.88      0.89      0.89     39411



  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
(0.65 + 0.93) / 2

0.79

#### Things to try further
* balanced dataset
* larger training data
* hyperparameter tuning
* sentence transformers!