# [Model Source](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment)


# Model training data:
|Language |   Number of reviews|
|---------|---------|
|English |    150k|
|Dutch |      80k|
|German |     137k|
|French |     140k|
|Italian |    72k|
|Spanish |    50k|

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from imblearn.metrics import macro_averaged_mean_absolute_error
from torch.nn import Softmax
from lxml import html
from datasets import Dataset

In [None]:

def strip_html(s):
    """Return the plain-text content of an HTML fragment.

    Args:
        s: String that may contain HTML markup.

    Returns:
        The text content of ``s`` as a ``str``. Empty or whitespace-only
        input is returned unchanged.
    """
    # lxml refuses to parse an empty document (ParserError), and there is
    # nothing to strip from a blank string anyway.
    if not s.strip():
        return s
    return str(html.fromstring(s).text_content())
# Checkpoint shared by the tokenizer and the classification model.
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # uses subword-based tokenization
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)
# Alternative: let the pipeline load the checkpoint by name,
# i.e. pipeline("sentiment-analysis", model=MODEL_NAME).

# Pin langdetect's seed so repeated runs detect the same languages.
DetectorFactory.seed = 0


In [None]:

# Load the labelled reviews and discard every row that has a missing value.
data = pd.read_csv('data/finegrained_sentiment_analysis.csv').dropna()

# Quick look at the dataset size and at how the `score` values are distributed.
print(data.shape)
print(data['score'].value_counts())

# Class imbalance
|Class|N|
|---|---|
|5  |  17574|
|4  |   3936|
|1  |   2571|
|3  |   2078|
|2  |   1463|


In [None]:
# Drop every review whose text occurs more than once. keep=False removes
# all copies, including the first occurrence.
data = data[~data['review'].duplicated(keep=False)]
print(data.shape)
print(data)

# Work on the first 5000 rows only. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of the
# de-duplicated one (avoids pandas' SettingWithCopyWarning).
data = data.iloc[:5000].copy()

# Remove HTML markup from both text columns.
data['review_summary'] = data['review_summary'].apply(strip_html)
data['review'] = data['review'].apply(strip_html)

In [None]:
def detect_language(x):
    """Detect the language of a row's review text.

    Args:
        x: Row-like mapping with a ``'review'`` entry.

    Returns:
        The value returned by ``detect`` for the review, or ``np.nan`` when
        the review is missing, empty, or ``detect`` raises
        ``LangDetectException``.
    """
    review = x['review']
    # NaN never compares equal to itself, so missing values must be tested
    # with pd.isna rather than `== np.nan`.
    if pd.isna(review) or review == '':
        return np.nan
    try:
        return detect(review)
    except LangDetectException as e:
        print(f'Insufficient input text: {x["review"]} - {e}')
        return np.nan
# data['language'] = data.apply(detect_language, axis=1)
# print(data['language'].value_counts())

|language code|N|Language|
|----|-----|-----|
|en   | 25272  |  english   |
|de   |     6  |  german    |
|es   |     5  |  spanish   |
|pt   |     1  |  portuguese    |
|fr   |     1  |  french    |
|cy   |     1  |  welsh |
|tl   |     1  |  tagalog   |
|it   |     1  |  italian   |
|nl   |     1  |  dutch |
|af   |     1  |  afrikaans |

In [None]:
#%%
# Score a copy so the prediction columns added below do not end up in `data`.
d=data.copy()

def f(x):
    """Predict sentiment for one row from its review and review summary.

    The review and the summary are classified separately with the
    module-level ``tokenizer`` and ``model``; a third, combined prediction is
    derived from both sets of logits.

    Args:
        x: Row-like mapping with text entries ``'review'`` and
            ``'review_summary'``.

    Returns:
        pd.Series with, for the combined prediction, the review alone and the
        summary alone, the predicted score (the leading integer of the
        model's label) and the softmax probability of that class.
    """
    def _encode(text):
        # Single text in, so the result carries a leading batch dimension.
        return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    tokens_review = _encode(x['review'])
    tokens_review_summary = _encode(x['review_summary'])

    # input_ids is (batch, seq_len): the token count is the last dimension.
    # len() would return the batch size, which is always 1 here.
    # NOTE(review): with truncation=True the count is capped at the
    # tokenizer's maximum length (presumably 512 for this checkpoint —
    # confirm), so this threshold may never trigger.
    n_tokens = tokens_review.input_ids.shape[-1]
    if n_tokens > 545:
        print(n_tokens)

    with torch.no_grad():
        logits_review = model(**tokens_review, return_dict=True).logits
        logits_review_summary = model(**tokens_review_summary, return_dict=True).logits

    probs_review = logits_review.softmax(-1)[0]
    probs_review_summary = logits_review_summary.softmax(-1)[0]
    # Combine by summing logits: this equals multiplying the two class
    # distributions and renormalising. Multiplying the raw logits instead
    # would turn two negative logits into a large positive score, i.e. favour
    # a class that both inputs reject.
    probs_combination = (logits_review + logits_review_summary).softmax(-1)[0]

    labels = model.config.id2label

    def _top(probs):
        # Most likely class -> (leading integer of its label, probability).
        class_id = probs.argmax().item()
        return int(labels[class_id].split(' ')[0]), probs[class_id].item()

    score_combination, conf_combination = _top(probs_combination)
    score_review, conf_review = _top(probs_review)
    score_summary, conf_summary = _top(probs_review_summary)

    return pd.Series({
        'sentiment_score': score_combination,
        'classification_confidence': conf_combination,
        'sentiment_score_review': score_review,
        'classification_confidence_review': conf_review,
        'sentiment_score_review_summary': score_summary,
        'classification_confidence_review_summary': conf_summary,
    })
# data = Dataset.from_pandas(d)
# data = data.map(lambda e: f(e), batched=True)

# Columns produced by f(), in the order of the Series it returns.
prediction_columns = [
    'sentiment_score',
    'classification_confidence',
    'sentiment_score_review',
    'classification_confidence_review',
    'sentiment_score_review_summary',
    'classification_confidence_review_summary',
]
d[prediction_columns] = d.apply(f, axis=1)

# Absolute difference between the labelled score and the combined prediction.
d['error'] = (d['score'] - d['sentiment_score']).abs()
d


# NOTE:
`ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.`


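--> Raised by `macro_averaged_mean_absolute_error` when one of the classes it averages over has zero samples among the true labels, which can happen when an imbalanced dataset is subsampled and a rare class is predicted but no longer present in `score`.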

In [None]:
# Distinct labels on each side, sorted for display and comparison.
true_labels = np.sort(d['score'].unique())
predicted_labels = np.sort(d['sentiment_score'].unique())

print(d['score'].value_counts(), d['sentiment_score'].value_counts())
print(true_labels, predicted_labels)

# Warn when a predicted label never occurs among the true labels. The metric
# averages a per-class error over the rows whose true label is that class, so
# such a class has zero rows and triggers the "0 sample(s)" ValueError.
# NOTE(review): based on imblearn's implementation — confirm for the
# installed version.
if np.setdiff1d(predicted_labels, true_labels).size > 0:
    print('Warning, macro averaged will not work.')
macro_averaged_mean_absolute_error(d['score'].values, d['sentiment_score'].values, sample_weight=d['votes'])
