# Преобразование CEDR

In [1]:
import json
import csv

# Mapping of label codes to emotion categories
label_map = {0: "joy", 1: "sadness", 2: "surprise", 3: "fear", 4: "anger"}
all_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

def convert_dataset_CEDR(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["text"] + all_labels)  # Write CSV header
        
        for line in infile:
            data = json.loads(line)
            text = data["text"].strip()
            labels = data["label"]
            
            # Initialize label presence vector
            label_vector = [0] * len(all_labels)
            
            for label in labels:
                if label in label_map:
                    emotion = label_map[label]
                    if emotion in all_labels:
                        label_vector[all_labels.index(emotion)] = 1
                    
            # If no labels, set neutral to 1
            if not labels:
                label_vector[-1] = 1
            
            writer.writerow([text] + label_vector)



In [2]:
# Example usage
convert_dataset_CEDR("train.jsonl", "output.csv")

In [4]:
# Example usage
convert_dataset_CEDR("test.jsonl", "test.csv")

# преобразование ISEAR

In [1]:
!pip install transformers




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
import tqdm
#Определим устройство для обучения
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
from transformers import pipeline
en_ru_translator = pipeline("translation_en_to_ru", "Helsinki-NLP/opus-mt-en-ru",device=device)

def translate_en2ru(text):
    answer = en_ru_translator(text)
    return answer[0]['translation_text']

In [9]:
import csv
from tqdm import tqdm

# Mapping of emotion labels to required format
valid_labels = {"anger": 0,"disgust":1, "fear": 2, "joy": 3, "sadness": 4, "surprise": 5}
all_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

def convert_label_text_dataset(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", newline="", encoding="utf-8") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        writer.writerow(["text"] + all_labels)  # Write CSV header
        

        
        for row in tqdm(reader):
            if len(row) < 2:
                continue  # Skip malformed lines
            
            emotion, text = row[0], row[1]  # Use second and third columns
            text = translate_en2ru(text)
            
            # Initialize label presence vector
            label_vector = [0] * len(all_labels)
            
            if emotion in valid_labels:
                label_vector[valid_labels[emotion]] = 1
            else:
                label_vector[-1] = 1  # Neutral if not in valid labels
            
            writer.writerow([text] + label_vector)




In [None]:
# Example usage
convert_label_text_dataset("isear-train.csv", "isear.csv")

5341it [21:16,  4.19it/s]


# XED

In [21]:
import csv

# Mapping of emotion labels to required format
valid_labels = {"anger": 1, "disgust": 2, "fear": 3, "joy": 4, "sadness": 5, "surprise": 6, "neutral": 7}
all_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

def convert_label_text_dataset(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", newline="", encoding="utf-8") as outfile:
        reader = csv.reader(infile, delimiter="\t")
        writer = csv.writer(outfile)
        writer.writerow(["text"] + all_labels)  # Write CSV header
        
        for row in tqdm(reader):
            if len(row) < 2:
                continue  # Skip malformed lines
            
            text = row[0]
            text = translate_en2ru(text)
            labels = row[1].strip().split(",")
            
            # Initialize label presence vector
            label_vector = [0] * len(all_labels)
            
            has_valid_label = False
            for label in labels:
                label = label.strip()
                if label.isdigit():
                    label = int(label)
                    adjusted_label = label if label < 2 else label - 1  # Shift labels above 2 down by one
                    if adjusted_label in valid_labels.values():
                        label_vector[adjusted_label - 1] = 1  # Adjust index (1-based to 0-based)
                        has_valid_label = True
            
            if has_valid_label:
                writer.writerow([text] + label_vector)



In [None]:
# Example usage
convert_label_text_dataset("en-annotated.tsv", "XED.csv")

17528it [32:05,  9.10it/s]


: 

In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('train.parquet')

In [3]:
df.to_csv('1.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 38 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ru_text               211225 non-null  object 
 1   text                  211225 non-null  object 
 2   id                    211225 non-null  object 
 3   author                211225 non-null  object 
 4   subreddit             211225 non-null  object 
 5   link_id               211225 non-null  object 
 6   parent_id             211225 non-null  object 
 7   created_utc           211225 non-null  float32
 8   rater_id              211225 non-null  int32  
 9   example_very_unclear  211225 non-null  bool   
 10  admiration            211225 non-null  int32  
 11  amusement             211225 non-null  int32  
 12  anger                 211225 non-null  int32  
 13  annoyance             211225 non-null  int32  
 14  approval              211225 non-null  int32  
 15  

In [8]:
df.sample(30)

Unnamed: 0,ru_text,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
161256,Какое замечательное слово,What a great word,edugt9g,no_re-entry,LifeProTips,t3_aevoe8,t1_edt5qni,1547246000.0,18,False,...,0,0,0,0,0,0,0,0,0,0
17449,"ну... в детстве его не было, но сейчас мне нра...",well.. didn't have it when I was a kids but I ...,eff8d1t,asiawide,korea,t3_almsyz,t3_almsyz,1548918000.0,40,False,...,0,0,0,0,0,0,0,0,0,1
158008,"Звучит эпично, чувак",Sounds epic man,eek6ni3,Soaptimusprime,Blackops4,t3_ahy59u,t1_eek6e3j,1548014000.0,39,False,...,0,0,0,0,0,0,0,0,0,0
133278,На этом сайте об этом постоянно говорят. О чём...,It is being talked about all the time on this ...,ef6xvgp,Plasmic_Socialist,socialism,t3_akpxhy,t3_akpxhy,1548700000.0,7,False,...,0,0,0,0,0,0,0,0,0,0
33368,"Пожалуйста, не разрушайте нашу самоуничижитель...",Please don't ruin our self-deprecating-humor c...,edashk7,subhuman-male,IncelsWithoutHate,t3_accfsr,t1_ed9949p,1546697000.0,16,False,...,0,0,0,0,0,0,0,0,0,0
156881,Это странный способ написания Тобина.,That’s a weird way of spelling Tobin.,ef40m8c,VoidWaIker,fireemblem,t3_ak9g5z,t1_ef3l6w4,1548609000.0,43,False,...,0,0,0,0,0,0,0,0,0,0
883,"Если не считать травмы, [ИМЯ] встанет не поздн...","Barring an injury, [NAME] will be up by May at...",edll0o3,StanfordFox,NewYorkMets,t3_ae0yj2,t1_edlkwgr,1546994000.0,81,False,...,0,0,0,0,0,0,0,0,0,0
209117,"По моему мнению, показывать людям кадры с боен...",Confronting people with footage from slaughter...,ed02kga,SuperRandoBoi,philosophy,t3_ab9b3d,t1_ecyrdw0,1546330000.0,63,False,...,0,0,0,0,0,0,0,1,0,0
193140,>[ИМЯ] >не [ИМЯ] Вы как будто просите проиграть,>[NAME] >not [NAME] It's as if you're asking t...,eea7fdf,Ayd305,MortalKombat,t3_agz3s3,t3_agz3s3,1547744000.0,26,False,...,0,0,0,0,0,0,0,0,0,1
210644,"Баха-ха-ха, это золото.",Bahahaha this is gold.,edti0op,Ryant12,Blackops4,t3_aeppfz,t3_aeppfz,1547226000.0,39,False,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Определяем маппинг эмоций
emotion_mapping = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    "joy": ["joy", "amusement", "approval", "excitement", "gratitude", "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["surprise", "realization", "confusion", "curiosity"],
    "neutral": ["neutral"]
}

# Загружаем датасет
def convert_dataset(input_file, output_file):
    df = pd.read_parquet(input_file)
    
    # Оставляем только нужные столбцы
    selected_columns = ["text"] + list(emotion_mapping.keys())
    new_df = pd.DataFrame(columns=selected_columns)
    new_df["text"] = df["ru_text"]
    
    # Заполняем эмоции бинарными значениями
    for target_emotion, source_emotions in emotion_mapping.items():
        new_df[target_emotion] = df[source_emotions].max(axis=1)
    
    # Сохраняем в CSV
    new_df.to_csv(output_file, index=False)



In [7]:
# Пример использования
convert_dataset('train.parquet', "goemo.csv")

In [None]:
#слияние

In [9]:
import pandas as pd


df1 = pd.read_csv('goemo.csv')
df2 = pd.read_csv('cedr.csv')
df3 = pd.read_csv('XED.csv')

In [10]:
df = pd.concat([df1,df2,df3])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 236281 entries, 0 to 17527
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      236281 non-null  object
 1   anger     236281 non-null  int64 
 2   disgust   236281 non-null  int64 
 3   fear      236281 non-null  int64 
 4   joy       236281 non-null  int64 
 5   sadness   236281 non-null  int64 
 6   surprise  236281 non-null  int64 
 7   neutral   236281 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 16.2+ MB


In [16]:
df.to_csv('all.csv')