In [None]:
# Load the English-to-Spanish translation model (MarianMT)
import csv, os, threading, torch, unidecode
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm

# Hugging Face checkpoint used for English -> Spanish translation.
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer that matches the checkpoint.
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Run on the GPU when torch reports CUDA support, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the model weights, then move the model onto the selected device.
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)

In [2]:
def translate(data):
    """Translate the text column of one dataset row with the loaded model.

    Args:
        data: A three-item row ``(a, original, c)``. ``original`` is the text
            handed to the tokenizer; ``a`` and ``c`` must be convertible with
            ``int()``.

    Returns:
        ``[int(a), spanish, int(c)]`` where ``spanish`` is the decoded first
        sequence returned by ``model.generate``.

    Raises:
        ValueError: If ``data`` does not unpack into exactly three items, or
            ``a`` / ``c`` cannot be converted to ``int``.
    """
    a, original, c = data
    # truncation=True clips over-long inputs to the tokenizer's maximum length
    # instead of sending more tokens than the model can accept.
    encoded = tokenizer(
        original, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    translated = model.generate(**encoded)
    spanish = tokenizer.decode(translated[0], skip_special_tokens=True)
    return [int(a), spanish, int(c)]

In [3]:
# Make sure the dataset directory exists; exist_ok avoids the
# check-then-create race of a separate exists()/mkdir() pair.
os.makedirs("../dataset/", exist_ok=True)

# Read the whole source CSV into memory. The `with` block closes the file,
# so no explicit close() is needed afterwards.
with open("../dataset/emotions.csv", "r", encoding='utf-8', newline='') as file:
    rows = list(csv.reader(file, delimiter=',', quotechar='"'))

# The first row is kept as the header; an empty file yields an empty header
# instead of raising IndexError.
head = rows[0] if rows else []

# Row count, header row included.
total = len(rows)

In [None]:
# Translate dataset test v1: split the rows across worker threads

num_threads = 4
# rows[0] is the header: it is written verbatim below and must not be sent
# through translate(), which calls int() on the first and last cell.
data_rows = rows[1:]
chunk_size = len(data_rows) // num_threads
threads = []
# (row, exception) pairs for rows that could not be translated.
failed_rows = []
# The csv writer is shared by every worker, so writes are serialized.
write_lock = threading.Lock()
progress_bar = tqdm(total=len(data_rows), desc="Traduciendo dataset")
with open("../dataset/emociones.csv", "w", encoding='utf-8', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"')
    writer.writerow(head)

    def translate_chunk(chunk, progress_bar):
        """Translate each row of `chunk` and write the result to the output CSV.

        A row that raises is recorded in `failed_rows` and skipped so that
        the remaining rows of the chunk are still processed.
        """
        for row in chunk:
            try:
                t = translate(row)
            except Exception as error:
                with write_lock:
                    failed_rows.append((row, error))
                    progress_bar.update()
                continue
            with write_lock:
                writer.writerow(t)
                progress_bar.update()

    for i in range(num_threads):
        start_idx = i * chunk_size
        # The last worker also takes the rows left over by the integer division.
        end_idx = start_idx + chunk_size if i != num_threads - 1 else len(data_rows)
        chunk = data_rows[start_idx:end_idx]

        thread = threading.Thread(target=translate_chunk, args=(chunk, progress_bar))
        threads.append(thread)
        thread.start()

    # Keep the file open until every worker has finished writing.
    for thread in threads:
        thread.join()

    progress_bar.close()

# Report skipped rows instead of letting a worker die silently mid-chunk.
if failed_rows:
    print(f"{len(failed_rows)} row(s) could not be translated; first error: {failed_rows[0][1]!r}")


In [None]:
# Cut or combine datasets: collect (text, label) pairs from every CSV in
# ../dataset/ and write them to emociones.csv.
# NOTE(review): the output file lives inside the scanned directory, so an
# existing emociones.csv is read back as an input too — confirm this is intended.
rows = []
archivos_csv = [archivo for archivo in os.listdir("../dataset/") if archivo.endswith(".csv")]
for archivo in archivos_csv:
    print(f"../dataset/{archivo}")
    # The `with` block closes the file; no explicit close() is needed.
    with open(f"../dataset/{archivo}", "r", encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter=',', quotechar='"')
        try:
            for row in reader:
                # Blank lines and single-column rows carry no text/label pair;
                # skip them instead of abandoning the rest of the file.
                if len(row) < 2: continue
                # Drop entries whose text is longer than 200 characters.
                if len(str(row[0])) > 200: continue
                # Lower-case the text and transliterate it to plain ASCII.
                text = unidecode.unidecode(str(row[0]).lower())
                rows.append([text, row[1]])
        except Exception as e:
            # Best effort: report read/parse errors and move on to the next file.
            print(e)

with open("../dataset/emociones.csv", "w", encoding='utf-8', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"')
    writer.writerows(rows)

In [None]:
# Sleep: shut the machine down, falling back to a suspend request
SLEEP = True
if SLEEP:
    # os.system reports a failed command through its return value rather than
    # by raising, so the fallback is driven by the exit status.
    if os.system('shutdown /s /t 0') != 0:
        os.system("rundll32.exe powrprof.dll,SetSuspendState 0,1,0")