# Pretrained: HuggingFace models

In this notebook I'm going to try pretrained models hosted on Hugging Face

In [1]:
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --upgrade

## Imports

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from typing import List
from typing import Any

In [2]:
from shows_analysis.code.sentiment_analysis.huggingface_pipeilne import \
    InferencePipeline

In [3]:
SEED = 42

## Functions

## Paths

In [4]:
relative_path = os.path.join("../../../", "data")

In [5]:
sentiment_analysis_data_path = os.path.join(relative_path, "3_sentiment_analysis")

## Data

### Loading data

In [6]:
reviews = pd.read_parquet(
    os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [7]:
train = reviews[reviews["fold"] == "train"]
test = reviews[reviews["fold"] == "test"]

In [8]:
test_reviews = test["review"].values.tolist()
test_sentiment = test["sentiment"].values


In [9]:
test["review"] = test["review"].str.replace("<p>", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review"] = test["review"].str.replace("<p>", " ")


## Modelling

### Listing models

In [11]:
MODEL_NAME = "Tatyana/rubert-base-cased-sentiment-new"
pipeline = InferencePipeline(model_name=MODEL_NAME)

### Evaluation

In [None]:
pred_labels = []

for test_reviews_batch, _ in pipeline.generate_batches(
    test_reviews, test_sentiment
):
    pred_labels.extend(
        pipeline.texts_to_sentiments(
            texts=list(test_reviews_batch),
        )
    )

    print(f"{len(pred_labels)}/{len(test_sentiment)}")

In [16]:
averaging = "micro"
f1 = f1_score(test_sentiment, pred_labels, average=averaging)

In [17]:
print(f"F1 score with {averaging}-averaging is {f1.round(3)}")

F1 score with micro-averaging is 0.346
