In [None]:
# Library imports
import os
import csv
import translators as ts
import translators.server as tss
import shutil
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Dataset processing - work from inside ./Dataset/ and list its entries
if "Dataset" not in os.getcwd():
    os.chdir("./Dataset/")
dataset = os.listdir()

In [None]:
# create csv files
def createCSV(files=None):
    """Convert tab-separated lexicon files into two-column CSV files.

    For each input file a sibling ``<stem>.csv`` is written, where ``<stem>``
    is the file name up to its first dot. The first line of the input is
    replaced by the header ``word,accuracy``; every following non-blank line
    is split on tabs and written as one CSV row.

    Args:
        files: Iterable of file paths to convert. When omitted, the
            module-level ``dataset`` listing is used.
    """
    if files is None:
        files = dataset
    for source_path in files:
        # Directory entries cannot be opened as text files.
        if not os.path.isfile(source_path):
            continue
        folder, base_name = os.path.split(source_path)
        # partition() keeps the whole name when it has no dot; slicing with
        # find() == -1 would silently drop the last character instead.
        target_path = os.path.join(folder, base_name.partition(".")[0] + ".csv")
        # On a re-run the listing already contains the generated .csv files;
        # opening one for reading and writing at once would truncate it.
        if os.path.abspath(target_path) == os.path.abspath(source_path):
            continue
        # The output is written as UTF-8 explicitly so it matches the input
        # encoding instead of depending on the platform default.
        with open(source_path, "r", encoding="utf-8", newline='') as infile, \
                open(target_path, 'w', encoding="utf-8", newline='') as outfile:
            writer = csv.writer(outfile)
            for idx, line in enumerate(infile):
                if idx == 0:
                    # The first source line is replaced by our own header.
                    writer.writerow(["word", "accuracy"])
                    continue
                stripped = line.strip()
                if not stripped:
                    # A blank line would become a one-field row.
                    continue
                writer.writerow(stripped.split('\t'))

In [None]:
# Translate the .csv files and reformat them as word,emotion,accuracy.
# First make sure we are working inside ./Dataset/ and refresh the listing.
if "Dataset" not in os.getcwd():
    os.chdir("./Dataset/")
dataset = os.listdir()

def listToString(s):
    """Join the strings in ``s`` into a single string, one item per line."""
    return "\n".join(s)

def translateFile(file, output, from_language='en', to_language='pt', block_size=400):
    """Translate the words of a ``word,accuracy`` CSV and save them with an emotion label.

    The emotion label is the input file name up to ``.csv``. Words are sent to
    the translator in blocks of ``block_size`` joined by newlines, and the
    result is written to ``output`` with the header ``word,emotion,accuracy``.

    Args:
        file: Path of the input CSV (header row followed by word,accuracy rows).
        output: Path of the CSV to write.
        from_language: Source language code passed to the translator.
        to_language: Target language code passed to the translator.
        block_size: Number of words sent per translation request.

    Raises:
        ValueError: If a translated block does not contain exactly one line
            per input word, which would pair words with the wrong accuracy.
    """
    suffix_pos = file.find(".csv")
    # Without the guard a missing ".csv" (find() == -1) drops the last character.
    emotion = file[:suffix_pos] if suffix_pos != -1 else file

    words = []
    accuracies = []
    with open(file, "r", encoding="utf-8", newline='') as infile:
        # csv.reader is the inverse of the csv.writer that produced the file,
        # so quoted fields containing commas are parsed correctly.
        reader = csv.reader(infile)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 2:
                # Blank or malformed line: there is no accuracy to pair with.
                continue
            words.append(row[0].strip())
            accuracies.append(row[1].strip())

    data = []
    # Every block is translated, including the final (possibly shorter) one;
    # previously the loop broke out before translating the last block.
    for start in range(0, len(words), block_size):
        block = words[start:start + block_size]
        translated_text = tss.google(listToString(block), reset_host_url=None, from_language=from_language, to_language=to_language)
        pieces = translated_text.split("\n")
        if len(pieces) != len(block):
            raise ValueError(
                "translation of %r returned %d lines for %d words (block starting at row %d)"
                % (file, len(pieces), len(block), start)
            )
        for position, piece in enumerate(pieces):
            # NOTE(review): the slicing below drops the last character of every
            # line and the first character of every line but the first. That
            # presumably removes padding the translator adds around the line
            # breaks - confirm against real output, since it would truncate
            # words if the translator returns clean lines.
            if position != 0:
                data.append(piece[1:len(piece)-1])
            else:
                data.append(piece[:len(piece)-1])

    with open(output, 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["word","emotion","accuracy"])
        for translated_word, accuracy in zip(data, accuracies):
            writer.writerow([translated_word, emotion, accuracy])

In [None]:
# Translate each per-emotion CSV into its "<emotion>_PT.csv" counterpart
for emotion in ("anger", "antecipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"):
    translateFile(emotion + ".csv", emotion + "_PT.csv")

In [None]:
# Rename files: drop the "_PT" marker so each translated file takes the name
# of the CSV it was translated from (e.g. anger_PT.csv -> anger.csv).
dataset = os.listdir()
for file in dataset:
    # Match the exact suffix so unrelated names that merely contain "_PT"
    # are left alone.
    if file.endswith("_PT.csv"):
        # The target already exists (it was the translation input).
        # os.replace overwrites it on every platform, whereas os.rename
        # raises FileExistsError on Windows in that case.
        os.replace(file, file[:-len("_PT.csv")] + ".csv")

In [None]:
# Clean dataset: merge every per-emotion CSV into one table.
# The header row is kept from the first file only.
mergedCSV = []
headers = True
dataset = os.listdir()
totalLines = 0
# Sorted so the merged row order does not depend on the directory listing order.
for file in sorted(dataset):
    # Skip non-CSV entries and this cell's own outputs, so that re-running
    # the cell does not merge earlier results back into the dataset.
    if not file.endswith(".csv") or file in ("dataset.csv", "dataFinal.csv"):
        continue
    with open(file, 'r', encoding="utf-8", newline='') as fileData:
        csvData = list(csv.reader(fileData))
    if not csvData:
        # An empty file contributes nothing and must not consume the header slot.
        continue
    totalLines = totalLines + len(csvData)
    if headers:
        headers = False
    else:
        csvData = csvData[1:]
    mergedCSV.extend(csvData)

if not mergedCSV:
    raise FileNotFoundError("no per-emotion .csv files found in " + os.getcwd())

with open('dataset.csv', 'w', newline='', encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    writer.writerows(mergedCSV)
print(len(mergedCSV), totalLines)

# `data` was used here without being defined by any earlier cell, so a fresh
# kernel raised NameError. Build it from the merged rows instead.
# NOTE(review): any extra cleaning the original `data` went through is not
# visible in this notebook - confirm nothing else is expected in dataFinal.csv.
data = pd.DataFrame(mergedCSV[1:], columns=mergedCSV[0])
data.to_csv('dataFinal.csv', index=False)

In [None]:
# Load the merged word/emotion table
data = pd.read_csv("dataFinal.csv", sep=",")

# No text preprocessing (tokenization, stemming, stop-word removal) is applied yet.
# Inputs are the words coerced with str() so every cell is valid text for the
# vectorizer; the target is the emotion label.
X = [str(word) for word in data['word']]
y = data['emotion']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features: the vectorizer is fitted on the training split only and
# then reused unchanged on the test split
tfidf_vectorizer = TfidfVectorizer(max_df=0.2, max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a linear-kernel SVM on the training features
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Report per-class metrics on the held-out split
y_pred = svm_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Define the text to analyze
text = "Eu amo essa cidade. Ela é linda e cheia de vida."

# Tokenize with the Portuguese model; word_tokenize defaults to English
tokens = word_tokenize(text, language='portuguese')

# Remove stopwords. The comparison is case-insensitive so capitalized
# sentence-initial words such as "Eu" and "Ela" are filtered too; the tokens
# that remain keep their original casing.
stop_words = set(stopwords.words('portuguese'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Perform sentiment analysis using VADER
# NOTE(review): VADER's lexicon is built for English text, so Portuguese
# words are unlikely to match and the scores will tend towards neutral.
# Confirm whether a Portuguese lexicon or the classifier trained in this
# notebook should be used here instead.
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(' '.join(filtered_tokens))

# Print the sentiment scores
print(scores)

[0.4724978 0.5275023]
dict_values(['negative', 'positive'])
