# Pretrained: HuggingFace models

In this notebook I'm going to try pretrained models hosted on Hugging Face

In [1]:
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --upgrade


## Imports

In [2]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from typing import List
from typing import Any

In [3]:
SEED = 42


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Functions

In [5]:
def generate_batches(X: List[Any], y: List[Any], batch_size: int):
    assert len(X) == len(y)

    np.random.seed(42)

    X = np.array(X)
    y = np.array(y)

    perm = np.random.permutation(len(X))

    for batch_start in range(0, len(X), batch_size):
        selection = perm[batch_start : batch_start + batch_size]
        X_batch = X[selection]
        y_batch = y[selection]

        yield X_batch, y_batch


In [6]:
def tokenize(
    tokenizer,
    texts: List[str],
    max_length: int,
    padding: bool = True,
    truncation: bool = True,
) -> List[str]:

    inputs = tokenizer(
        texts,
        padding=padding,
        truncation=truncation,
        return_tensors="pt",
        max_length=max_length,
    ).to(device)

    return inputs

In [7]:
def get_logits(model, inputs):
    with torch.no_grad():
        logits = model(**inputs).logits

    return logits


In [8]:
def correct_labels(labels: List[str], correction_map: dict) -> List[str]:
    corrected_labels = [correction_map[label] for label in labels]

    return corrected_labels

In [9]:
def logits_to_labels(model, logits) -> List[str]:
    predicted_class_ids = logits.argmax(axis=1).cpu().detach().numpy()

    pred_labels = [
        model.config.id2label[predicted_class_id]
        for predicted_class_id in predicted_class_ids
    ]

    return pred_labels

In [33]:
def texts_to_sentiments(
    model, tokenizer, texts: List[str], max_length: int, correction_map: dict
) -> List[str]:

    inputs = tokenize(tokenizer, texts, max_length)
    logits = get_logits(model, inputs)
    pred_labels = logits_to_labels(model=model, logits=logits)
    pred_labels = correct_labels(pred_labels, correction_map)

    return pred_labels

## Paths

In [11]:
relative_path = os.path.join("../../../../", "data")


In [12]:
sentiment_analysis_data_path = os.path.join(relative_path, "3_sentiment_analysis")


## Data

### Loading data

In [13]:
reviews = pd.read_parquet(
    os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
)
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [14]:
train = reviews[reviews["fold"] == "train"]
test = reviews[reviews["fold"] == "test"]


In [15]:
sentiment_map = {"good": "positive", "neutral": "neutral", "bad": "negative"}


In [16]:
test_reviews = test["review"].values.tolist()
test_sentiment = test["sentiment"].map(sentiment_map).values

In [17]:
test["review"] = test["review"].str.replace("<p>", " ")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review"] = test["review"].str.replace("<p>", " ")


## Modelling

### Parameters

In [18]:
MAX_LENGTH = 512
BATCH_SIZE = 168


### Loading models

In [22]:
model_name = "Tatyana/rubert-base-cased-sentiment-new"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

correction_map = {
    "POSITIVE": "positive",
    "NEUTRAL": "neutral",
    "NEGATIVE": "negative",
}

### Evaluation

In [34]:
pred_labels = []

for test_reviews_batch, _ in generate_batches(test_reviews, test_sentiment, BATCH_SIZE):
    pred_labels.extend(
        texts_to_sentiments(
            model=model,
            tokenizer=tokenizer,
            texts=list(test_reviews_batch),
            max_length=MAX_LENGTH,
            correction_map=correction_map,
        )
    )

    print(f"{len(pred_labels)}/{len(test_sentiment)}")

168/20674
336/20674
504/20674

KeyboardInterrupt: 

In [16]:
averaging = "micro"
f1 = f1_score(test_sentiment, pred_labels, average=averaging)


In [17]:
print(f"F1 score with {averaging}-averaging is {f1.round(3)}")


F1 score with micro-averaging is 0.346
