In [None]:
from transformers import AutoTokenizer, pipeline, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

import numpy as np
import pandas as pd
import datasets
import pyarrow as pa
from datasets import load_metric

In [None]:
# App reviews export; the relative path is resolved from the notebook's working directory.
APP_REVIEWS_PATH = "../app_reviews/reviews.json"
app_df = pd.read_json(APP_REVIEWS_PATH)

In [None]:
# Histogram of the raw "score" column, to eyeball how the ratings are distributed.
app_df["score"].hist()

In [None]:
def to_sentiment(rating):
  """Map a review rating to a sentiment class id.

  The rating is coerced with ``int()`` first. Values of 2 or less map to 0,
  exactly 3 maps to 1, and anything above 3 maps to 2.
  """
  stars = int(rating)
  if stars == 3:
    return 1
  return 0 if stars < 3 else 2

# Derive the sentiment class id of every review from its score.
app_df["sentiment"] = app_df["score"].apply(to_sentiment)

In [None]:
# Second labelled source, read from CSV; the relative path is resolved from the working directory.
SENTZER_LABELS_PATH = "../sentzer_labels_v1.csv"
sentzer_df = pd.read_csv(SENTZER_LABELS_PATH)

In [None]:
# Human-readable class names. NOTE(review): the order appears to line up with the
# ids returned by to_sentiment (0, 1, 2); this list is not referenced by the other
# cells visible here.
class_names = ['negative', 'neutral', 'positive']

In [None]:
# Histogram of the derived "sentiment" column, to check the class balance of the app reviews.
app_df.hist("sentiment")

In [None]:
# One histogram per numeric column of sentzer_df.
sentzer_df.hist()

In [None]:
# Training pool: a fixed-size random sample of the app reviews plus every row of
# sentzer_df. The sample is seeded so that a Restart & Run All trains on the same
# rows each time, and the index is rebuilt so the two sources do not contribute
# overlapping index labels.
APP_SAMPLE_SIZE = 1000
APP_SAMPLE_SEED = 42
app_sample = app_df.sample(n=APP_SAMPLE_SIZE, random_state=APP_SAMPLE_SEED)
joined_df = pd.concat([app_sample, sentzer_df], ignore_index=True)

In [None]:
# Inspect the combined frame before it is turned into a dataset.
joined_df

In [None]:
# Hub id of the pretrained checkpoint; the same id is used to load both the
# tokenizer here and the classification model later on.
model_checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def build_dataset(label_df, test_percentage=0.2, seed=42):
    """Turn a labelled DataFrame into a train/test split ready for tokenisation.

    Args:
        label_df: DataFrame with a "sentiment" column (becomes "label") and a
            "content" column (becomes "text"). Other columns and the index
            are dropped.
        test_percentage: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Seed for the shuffle performed by ``train_test_split`` so the
            split is reproducible across runs. Pass ``None`` for an unseeded
            split.

    Returns:
        The result of ``Dataset.train_test_split``, indexable by "train" and
        "test".

    Raises:
        KeyError: If ``label_df`` lacks the "sentiment" or "content" column.
    """
    # Select-and-rename keeps the column order (label, text) without relying
    # on index alignment between successive column assignments.
    renamed = label_df[["sentiment", "content"]].rename(
        columns={"sentiment": "label", "content": "text"}
    )
    dataset = datasets.Dataset.from_pandas(renamed, preserve_index=False)
    return dataset.train_test_split(test_size=test_percentage, seed=seed)

In [None]:
# Build the train/test split from the combined frame, using build_dataset's defaults.
dataset = build_dataset(joined_df)

In [None]:
def preprocess_function(ex):
    """Tokenize the "text" field of a batch, truncating over-long inputs."""
    texts = ex["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits the same way, in batched mode.
tokenized_train, tokenized_test = (
    dataset[split].map(preprocess_function, batched=True)
    for split in ("train", "test")
)

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer to assemble batches with padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Label vocabulary: the position in the list is the integer class id.
labels = ["negative", "neutral", "positive"]
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
# Load the pretrained checkpoint with a sequence-classification head and attach
# the id <-> label maps to its configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
)

In [None]:

# Metric objects are loaded lazily and then reused. The Trainer calls
# compute_metrics on every evaluation pass, so reloading all four metrics each
# time is repeated work with no benefit.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called ``name``, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Unpacked as ``logits, labels``. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        dict with keys "precision", "recall", "accuracy" and "f1". Precision,
        recall and F1 are macro-averaged.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def macro(name):
        # Macro-averaged variant of the named metric.
        result = _get_metric(name).compute(
            predictions=predictions, references=labels, average="macro"
        )
        return result[name]

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {
        "precision": macro("precision"),
        "recall": macro("recall"),
        "accuracy": accuracy,
        "f1": macro("f1"),
    }

In [None]:
# Log in to Weights & Biases from the shell; the training arguments below report to "wandb".
!wandb login 

In [None]:
# Output directory for checkpoints; the final model is saved to the same path later.
repo_name = "model_sentzer_app"

training_args = TrainingArguments(
    output_dir=repo_name,
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate every 20 steps, checkpoint once per epoch
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="epoch",
    # Experiment tracking
    report_to="wandb",
)

In [None]:

# Wire the model, data, tokenisation helpers and metric function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Save the model to the repo_name directory so the pipeline below can load it by path.
trainer.save_model(repo_name)

In [None]:
# Inference pipeline that loads the model saved under repo_name and reuses the
# tokenizer already in memory.
trained_model = pipeline(
    task="sentiment-analysis",
    model=repo_name,
    tokenizer=tokenizer,
)

In [None]:
# Sample input, roughly: "Why is iFood so expensive, my God"
trained_model("Pq ifood é tão caro meu deus")

In [None]:
# Sample input, roughly: "@ravlls What I want is everything from iFood"
trained_model("@ravlls Quero é tudo do ifood")

In [None]:
# Sample input, roughly: "I ordered fries on iFood for 1 real for dinner. I love you @iFood"
trained_model("Pedi batata no ifood por 1 real para jantar. Te amo @iFood")

In [None]:
# Sample input, roughly: "This new iFood interface is horrible"
trained_model("Essa nova interface do IFood tá horrível")

In [None]:
# Sample input, roughly: "I told my girlfriend I was sad and didn't want to cook dinner;
# she told me to order iFood and gave me her card"
trained_model("Disse pra minha namorada que tava triste e não queria fazer janta, ela mandou eu pedir ifood e meu deu o cartão")