In [1]:
import os
import warnings
import pandas
import torch
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [24]:
def get_pipeline():
    """Build the sentiment-analysis pipeline used by this notebook.

    Loads the ``gchhablani/bert-base-cased-finetuned-sst2`` checkpoint as a
    sequence-classification model, loads the matching tokenizer with
    ``do_lower_case=False``, and wraps both in a ``transformers``
    "sentiment-analysis" pipeline created with ``truncation=True``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    # Single source of truth so model and tokenizer always come from the
    # same checkpoint.
    checkpoint = "gchhablani/bert-base-cased-finetuned-sst2"

    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    cased_tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)

    return pipeline(
        "sentiment-analysis",
        model=classifier,
        tokenizer=cased_tokenizer,
        truncation=True,
    )


In [15]:
def get_dir(path):
    """Return the names of the CSV files in ``path`` that should be analysed.

    A directory entry is kept when its name ends with ".csv" and does not
    start with "list". Names are returned without the directory prefix, in
    whatever order ``os.listdir`` produces them.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: The matching entry names.
    """
    return [
        entry
        for entry in os.listdir(path)
        if entry.endswith(".csv") and not entry.startswith("list")
    ]

In [30]:
def get_reviews(df):
    """Extract the review texts from a scraped DataFrame.

    Args:
        df: DataFrame with an "Avaliações" column holding the reviews.

    Returns:
        list[str]: One entry per row, in row order. Every value is passed
        through ``str()``, so a missing value (NaN) becomes the string "nan".

    Raises:
        KeyError: If ``df`` has rows but no "Avaliações" column.
    """
    # A frame without rows yields no reviews, even when the column is absent.
    if len(df) == 0:
        return []

    # Read the values positionally. Indexing the Series with 0..n-1 is a
    # *label* lookup, which fails or returns rows in the wrong order when the
    # frame's index is not the default RangeIndex (e.g. after filtering).
    return [str(review) for review in df["Avaliações"].tolist()]

In [22]:
def get_sentiments(reviews, senti_pipeline):
    """Classify every review and collect the predicted labels.

    ``senti_pipeline`` is called once per review, in order. For each call the
    "label" entry of the first element of the result is converted with
    ``str()`` (a missing "label" key therefore yields the string "None").

    Args:
        reviews: Iterable of review texts.
        senti_pipeline: Callable returning a sequence of dict-like
            predictions for a single text.

    Returns:
        list[str]: One label per review, aligned with ``reviews``.
    """

    def label_of(text):
        first_prediction = senti_pipeline(text)[0]
        return str(first_prediction.get("label"))

    return list(map(label_of, reviews))

In [12]:
def export(sentiments, reviews, path, name):
    """Write reviews and their sentiment labels to a CSV file.

    The output directory is derived from ``path`` by replacing the
    "/../scrapes/" segment with "/"; ``name`` is appended to it as the file
    name. The directory is created when it does not exist yet. The CSV has
    the columns "sentiment" and "review" and no index column.

    NOTE: when ``path`` does not contain "/../scrapes/" it is used unchanged,
    so the output lands next to (and may overwrite) a file of the same name.

    Args:
        sentiments: Sequence of labels, one per review.
        reviews: Sequence of review texts, same length as ``sentiments``.
        path: Directory prefix (string); expected to end with a separator
            because ``name`` is concatenated to it directly.
        name: Output file name.
    """
    df = pandas.DataFrame({"sentiment": sentiments, "review": reviews})
    target = str(path.replace("/../scrapes/", "/")) + str(name)

    # to_csv does not create missing directories; make sure the rewritten
    # output location exists so a first run on a fresh checkout succeeds.
    out_dir = os.path.dirname(target)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(target, index=False)

In [17]:
def analyse_directory(path, senti_pipeline):
    """Run sentiment analysis over every selected CSV file in ``path``.

    For each file name returned by ``get_dir(path)`` the CSV is read as
    UTF-8, its reviews are extracted with ``get_reviews``, labelled with
    ``get_sentiments`` and written out through ``export`` under the same
    file name.

    Args:
        path: Directory prefix; file names are concatenated to it directly.
        senti_pipeline: Pipeline object forwarded to ``get_sentiments``.
    """
    for csv_name in get_dir(path):
        frame = pandas.read_csv(path + csv_name, encoding="utf-8")
        texts = get_reviews(frame)
        labels = get_sentiments(texts, senti_pipeline)
        export(labels, texts, path, csv_name)

In [25]:
# Build the pipeline once; the same object is passed to every
# analyse_directory call so the model is not reloaded per directory.
senti_pipeline = get_pipeline()

In [32]:
# Input CSVs are looked up under ../scrapes/ relative to the working directory.
current_dir = os.getcwd()
path = current_dir + "/../scrapes/"

# Analyse each scraped source in turn, reusing the one pipeline instance.
for scrape_subdir in (
    "booking/hotels/",
    "zomato/restaurantes/",
    "tripadvisor/hotels/",
    "tripadvisor/activities/",
    "tripadvisor/restaurants/",
):
    analyse_directory(path + scrape_subdir, senti_pipeline)