# Pretrained: HuggingFace models

In this notebook I'm going to try pretrained models hosted on Hugging Face

In [1]:
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --upgrade
# !pip3 install ipywidgets==7.7.2

## Imports

In [2]:
import os
from collections import defaultdict
from typing import Any, List

import numpy as np
import pandas as pd
import torch

In [3]:
from shows_analysis.code.sentiment_analysis.huggingface_pipeilne import (
    InferencePipeline,
)

In [4]:
SEED = 42


## Functions

## Paths

In [5]:
relative_path = os.path.join("../../../", "data")


In [6]:
sentiment_analysis_data_path = os.path.join(relative_path, "3_sentiment_analysis")


## Data

### Loading data

In [7]:
reviews = pd.read_parquet(
    os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
)
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [8]:
train = reviews[reviews["fold"] == "train"]
test = reviews[reviews["fold"] == "test"]


In [9]:
test["review"] = test["review"].str.replace("<p>", " ")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review"] = test["review"].str.replace("<p>", " ")


In [10]:
test_reviews = test["review"].values.tolist()
test_sentiment = test["sentiment"].values

## Modelling

### Listing models

In [11]:
MODELS = [
    "Tatyana/rubert-base-cased-sentiment-new",
    "blanchefort/rubert-base-cased-sentiment",
]

SCORES = defaultdict(float)

In [12]:
MODELS[1:]

['blanchefort/rubert-base-cased-sentiment']

### Evaluation

In [13]:
for model in MODELS[1:]:
    pipeline = InferencePipeline(
        texts=test_reviews, class_labels=test_sentiment, model_name=model
    )
    pipeline.batch_inference()
    SCORES[model] = pipeline.get_f1_score(pipeline.class_labels, pipeline.pred_labels)

Inferencing using blanchefort/rubert-base-cased-sentiment model


  0%|          | 0/124 [00:00<?, ?it/s]

In [14]:
print(SCORES)

defaultdict(<class 'float'>, {'blanchefort/rubert-base-cased-sentiment': 0.043})
