# Pretrained: HuggingFace models

In this notebook I'm going to try pretrained models hosted on Hugging Face

In [1]:
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --upgrade

## Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
SEED = 42

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

## Functions

In [4]:
def generate_batches(X, y, batch_size):
    assert len(X) == len(y)

    np.random.seed(42)

    X = np.array(X)
    y = np.array(y)

    perm = np.random.permutation(len(X))

    for batch_start in range(0, len(X), batch_size):
        selection = perm[batch_start : batch_start + batch_size]
        X_batch = X[selection]
        y_batch = y[selection]

        yield X_batch, y_batch


## Paths

In [5]:
relative_path = os.path.join("../../../../", "data")

In [6]:
sentiment_analysis_data_path = os.path.join(relative_path, "3_sentiment_analysis")

## Data

### Loading data

In [7]:
reviews = pd.read_parquet(
    os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [8]:
train = reviews[reviews["fold"] == "train"]
test = reviews[reviews["fold"] == "test"]

In [9]:
sentiment_map = {"good": "positive", "neutral": "neutral", "bad": "negative"}
setiment_map_model = {
    "POSITIVE": "positive",
    "NEUTRAL": "neutral",
    "NEGATIVE": "negative",
}

In [10]:
test_reviews = test["review"].values.tolist()
test_sentiment = test["sentiment"].map(sentiment_map).values


In [11]:
test["review"] = test["review"].str.replace("<p>", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review"] = test["review"].str.replace("<p>", " ")


## Modelling

### Parameters

In [12]:
MAX_LENGTH = 512
BATCH_SIZE = 168

### Loading models

In [13]:
model_name = "Tatyana/rubert-base-cased-sentiment-new"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

### Evaluation

In [15]:
pred_labels = []

for test_reviews_batch, _ in generate_batches(test_reviews, test_sentiment, BATCH_SIZE):
    inputs = tokenizer(
        list(test_reviews_batch),
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=MAX_LENGTH,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_ids = logits.argmax(axis=1).cpu().detach().numpy()
    pred_labels_raw = [
        model.config.id2label[predicted_class_id]
        for predicted_class_id in predicted_class_ids
    ]
    pred_labels_batch = [
        setiment_map_model[pred_label_raw] for pred_label_raw in pred_labels_raw
    ]
    pred_labels.extend(pred_labels_batch)

    print(f"{len(pred_labels)}/{len(test_sentiment)}")

168/20674
336/20674
504/20674
672/20674
840/20674
1008/20674
1176/20674
1344/20674
1512/20674
1680/20674
1848/20674
2016/20674
2184/20674
2352/20674
2520/20674
2688/20674
2856/20674
3024/20674
3192/20674
3360/20674
3528/20674
3696/20674
3864/20674
4032/20674
4200/20674
4368/20674
4536/20674
4704/20674
4872/20674
5040/20674
5208/20674
5376/20674
5544/20674
5712/20674
5880/20674
6048/20674
6216/20674
6384/20674
6552/20674
6720/20674
6888/20674
7056/20674
7224/20674
7392/20674
7560/20674
7728/20674
7896/20674
8064/20674
8232/20674
8400/20674
8568/20674
8736/20674
8904/20674
9072/20674
9240/20674
9408/20674
9576/20674
9744/20674
9912/20674
10080/20674
10248/20674
10416/20674
10584/20674
10752/20674
10920/20674
11088/20674
11256/20674
11424/20674
11592/20674
11760/20674
11928/20674
12096/20674
12264/20674
12432/20674
12600/20674
12768/20674
12936/20674
13104/20674
13272/20674
13440/20674
13608/20674
13776/20674
13944/20674
14112/20674
14280/20674
14448/20674
14616/20674
14784/20674
14952/20

In [16]:
averaging = "micro"
f1 = f1_score(test_sentiment, pred_labels, average=averaging)

In [17]:
print(f"F1 score with {averaging}-averaging is {f1.round(3)}")

F1 score with micro-averaging is 0.346
