# Pretrained: Hugging Face models

In this notebook I try out a pretrained sentiment-classification model hosted on the Hugging Face Hub and measure its F1 score on a sample of the test reviews.

In [1]:
# Optional one-time environment setup, intentionally left commented out so that
# a normal run of the notebook does not reinstall packages. It upgrades
# torch/torchvision/torchaudio from the cu113 wheel index (presumably the
# CUDA 11.3 builds -- confirm against the local driver before uncommenting).
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --upgrade

## Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
# Single source of truth for randomness in this notebook.
SEED = 42

# Actually apply the seed: defining the constant alone does not make anything
# reproducible. torch.manual_seed covers the CPU and CUDA generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

## Paths

In [5]:
# Location of the shared "data" directory, expressed relative to the
# current working directory (four levels up).
relative_path = os.path.join(
    "../../../../",
    "data",
)

In [6]:
# Sub-directory of the data root that holds the sentiment-analysis files.
sentiment_analysis_data_path = os.path.join(
    relative_path,
    "3_sentiment_analysis",
)

## Data

### Loading data

In [7]:
# Read the reviews table (which carries a "fold" column used below for the
# train/test split) and print a summary of its columns.
reviews_file = os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
reviews = pd.read_parquet(reviews_file)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [8]:
# Partition the reviews by the value stored in the "fold" column.
# Note: only `test` is referenced by the cells further down.
train = reviews.loc[reviews["fold"].eq("train")]
test = reviews.loc[reviews["fold"].eq("test")]

In [25]:
# Dataset label -> common label vocabulary used for scoring.
sentiment_map = dict(good="positive", neutral="neutral", bad="negative")

# Model output label (upper-case) -> the same common vocabulary.
# NOTE(review): the variable name is misspelled ("setiment") but is kept
# because the label-decoding cell later in the notebook looks it up by this name.
setiment_map_model = {
    label.upper(): label for label in ("positive", "neutral", "negative")
}

In [10]:
# Review texts of the test fold as a plain Python list (fed to the tokenizer).
test_reviews = test["review"].tolist()

# Ground-truth labels of the test fold, translated from the dataset's
# vocabulary to the positive/neutral/negative names via `sentiment_map`.
test_sentiment = test["sentiment"].map(sentiment_map).values


## Modelling

### Parameters

In [30]:
# MAX_LENGTH: token limit passed to the tokenizer (longer reviews are truncated).
# BATCH_SIZE: number of leading test reviews that are tokenized and scored.
MAX_LENGTH, BATCH_SIZE = 512, 100

### Loading models

In [31]:
# Hugging Face Hub identifier of the checkpoint under evaluation.
model_name = "Tatyana/rubert-base-cased-sentiment-new"

# Tokenizer and classifier are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Move the weights to the device selected earlier in the notebook.
model = model.to(device)

### Evaluation

In [32]:
# Tokenize only the first BATCH_SIZE test reviews: pad to the longest item in
# the batch, truncate anything beyond MAX_LENGTH tokens, return PyTorch tensors.
inputs = tokenizer(
    test_reviews[:BATCH_SIZE],
    max_length=MAX_LENGTH,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
# Place the encoded batch on the same device as the model.
inputs = inputs.to(device)

In [33]:
# Forward pass without gradient tracking; only the raw class scores are kept.
with torch.no_grad():
    forward_result = model(**inputs)
    logits = forward_result.logits

In [34]:
# Index of the highest-scoring class for each row, copied to host memory
# as a NumPy array.
predicted_class_ids = logits.argmax(dim=1).detach().cpu().numpy()

In [35]:
# Step 1: class index -> label string as declared in the model's config.
pred_labels_raw = [model.config.id2label[class_id] for class_id in predicted_class_ids]

# Step 2: model label string -> the common positive/neutral/negative vocabulary.
# A label missing from `setiment_map_model` raises KeyError here.
pred_labels = [setiment_map_model[raw_label] for raw_label in pred_labels_raw]

In [36]:
# Averaging strategy handed to scikit-learn's f1_score.
averaging = "micro"

# NOTE(review): the score covers only the first BATCH_SIZE test reviews (the
# ones that were tokenized and classified above), not the whole test fold.
f1 = f1_score(
    y_true=test_sentiment[:BATCH_SIZE],
    y_pred=pred_labels,
    average=averaging,
)

In [37]:
# Report the score rounded to three decimals. The built-in round() is used
# instead of the NumPy-scalar-only `.round()` method so this cell works whether
# f1_score returns a NumPy scalar or a plain Python float; the printed text is
# the same in both cases.
print(f"F1 score with {averaging}-averaging is {round(f1, 3)}")

F1 score with micro-averaging is 0.4
