# Pretrained: Dostoevsky

In this notebook I try out a simple pretrained sentiment model from the `dostoevsky` package.

In [2]:
# One-time setup, kept disabled: uncomment to install the pinned dostoevsky release.
# %pip install dostoevsky==0.6.0

In [5]:
# One-time setup, kept disabled: uncomment to download the pretrained model files
# (presumably the weights that FastTextSocialNetworkModel needs - confirm).
# !python -m dostoevsky download fasttext-social-network-model

## Imports

In [6]:
import os

import numpy as np
import pandas as pd
from dostoevsky.models import FastTextSocialNetworkModel
from dostoevsky.tokenization import RegexTokenizer
from sklearn.metrics import f1_score

In [7]:
# Seed for any randomised step; no cell visible in this notebook reads it.
SEED = 42

## Paths

In [8]:
# Relative location of the data directory: four levels up from the current
# working directory, then into "data".
relative_path = os.path.join("../../../../", "data")

In [9]:
# Directory inside the data folder that holds the sentiment-analysis files
# (the notebook reads split_reviews.parquet from it).
sentiment_analysis_data_path = os.path.join(relative_path, "3_sentiment_analysis")

## Data

### Loading data

In [11]:
# Read the reviews table and print its schema (columns, dtypes, non-null counts).
split_reviews_file = os.path.join(sentiment_analysis_data_path, "split_reviews.parquet")
reviews = pd.read_parquet(split_reviews_file)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206737 entries, 0 to 206736
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   sentiment  206737 non-null  category
 1   review     206737 non-null  object  
 2   fold       206737 non-null  object  
dtypes: category(1), object(2)
memory usage: 3.4+ MB


In [12]:
# Partition the reviews according to the value stored in their `fold` column.
train = reviews.loc[reviews["fold"].eq("train")]
test = reviews.loc[reviews["fold"].eq("test")]

## Modelling

### Evaluation

In [None]:
# Instantiate the pretrained Dostoevsky classifier, handing it the regex tokenizer
# it should use on the input texts.
tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)



In [53]:
# Renames the dataset's sentiment values (keys) to the label names that the
# predictions are compared against (values).
sentiment_map = dict(good="positive", neutral="neutral", bad="negative")

In [27]:
# Texts that will be passed to the model.
test_reviews = test["review"].values
# Reference labels for the same rows, renamed through `sentiment_map` so they
# use the same label names as the predictions.
test_sentiment = test["sentiment"].map(sentiment_map).values

In [47]:
# Score every test review; each prediction is handled below as a
# {label: score} mapping (k=5 is passed through to the model unchanged).
pred_labels_raw = model.predict(test_reviews, k=5)
# Keep only the best-scoring label per review. `max` finds it in one pass
# instead of sorting the whole mapping, and on ties it returns the first
# entry, exactly like taking element 0 of the descending sort.
pred_labels = [max(scores, key=scores.get) for scores in pred_labels_raw]

In [57]:
# Labels that exist in the reference data; any other label the model emits is
# folded into "neutral" so predictions and targets share one label set.
known_labels = sentiment_map.values()
pred_labels = [
    label if label in known_labels else "neutral"
    for label in pred_labels
]

In [58]:
# Show how many reviews were assigned to each predicted label.
# (The stray `()` that was being passed as the `normalize` argument is dropped.)
pd.Series(pred_labels).value_counts()

neutral     14803
negative     3821
positive     2050
dtype: int64

In [59]:
# Averaging strategy handed to scikit-learn's F1 implementation.
# NOTE(review): with one label per sample, micro-averaged F1 should coincide
# with plain accuracy - confirm this is the metric intended.
averaging = "micro"
f1 = f1_score(
    y_true=test_sentiment,
    y_pred=pred_labels,
    average=averaging,
)

In [60]:
# Report the score rounded to three decimals. The built-in `round` is used so
# the cell does not depend on `f1` being a NumPy scalar with a `.round` method.
print(f"F1 score with {averaging}-averaging is {round(f1, 3)}")

F1 score with micro-averaging is 0.233
