In [42]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
# Directory containing the PolEmo files read by this notebook; the processed
# CSV files are written back into the same directory.
PATH_TO_POLEMO_CONLL = "../datasets/polemo/dataset_conll"
# Directory where the generated histogram images are saved.
PATH_TO_PLOTS = "../reports/plots"

In [53]:
# Compare the text-length distributions of the sentence-level and text-level
# training files by saving one histogram image per file.
files_to_compare = ["all.sentence.train.txt",
                    "all.text.train.txt"]

# Kept as a module-level name for backward compatibility; a plain alias of
# ``len`` replaces the original lambda wrapper.
get_length_func = len

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PATH_TO_PLOTS, exist_ok=True)

for file_to_compare in files_to_compare:
    labels = []
    texts = []
    # Only the parsing needs the file handle; it is closed before plotting.
    with open(os.path.join(PATH_TO_POLEMO_CONLL, file_to_compare), "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the missing label marker.
            if not line.strip():
                continue
            # A non-blank line without the marker still raises ValueError.
            marker_pos = line.index("__label__")
            texts.append(line[:marker_pos])
            # Strip only the newline: slicing with [:-1] would cut the last
            # label character when the final line has no trailing newline.
            labels.append(line[marker_pos:].rstrip("\n"))

    df = pd.DataFrame(data={"text": texts, "label": labels})
    df['length'] = df['text'].map(get_length_func)

    # Explicit figure/axes so each plot is isolated and closed deterministically.
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.histplot(df['length'], binwidth=50, ax=ax)
    ax.set_title(f"File - {file_to_compare}")
    ax.set_xlabel("Length of text")
    fig.savefig(os.path.join(PATH_TO_PLOTS, f"pol_emo_{file_to_compare}_length.png"))
    plt.close(fig)

In [28]:
# Sentence-level splits whose raw lines are loaded for further processing.
files_to_extract = [f"all.sentence.{split}.txt" for split in ("train", "test", "dev")]

# Maps each file name to the list of its raw lines (newlines preserved).
lines_for_dataset = {}
for file_to_extract in files_to_extract:
    source_path = os.path.join(PATH_TO_POLEMO_CONLL, file_to_extract)
    with open(source_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    lines_for_dataset[file_to_extract] = lines

In [29]:
# Split every raw line into its text part and its "__label__..." part and
# build one DataFrame (columns: "text", "label") per dataset file.
dataframes = {}
for dataset_file, lines in lines_for_dataset.items():
    labels = []
    texts = []
    for line in lines:
        # Tolerate blank lines (e.g. a trailing empty line) instead of
        # failing on the missing label marker.
        if not line.strip():
            continue
        # Locate the marker once; a non-blank line without it still raises
        # ValueError, so malformed records are not silently dropped.
        marker_pos = line.index("__label__")
        texts.append(line[:marker_pos])
        # Strip only the newline: slicing with [:-1] would cut the last label
        # character when the final line has no trailing newline.
        labels.append(line[marker_pos:].rstrip("\n"))

    dataframes[dataset_file] = pd.DataFrame(data={"text": texts, "label": labels})


In [30]:
# Gather every distinct label value that occurs in any of the parsed frames.
all_labels = set()
for df in dataframes.values():
    all_labels = all_labels | set(df['label'].unique())

In [32]:
# Print the per-label row counts of each parsed frame, one frame after another.
for df in dataframes.values():
    label_counts = df['label'].value_counts()
    print(label_counts)

__label__z_minus_m    16911
__label__z_plus_m     12293
__label__z_zero       11255
__label__z_amb         5515
Name: label, dtype: int64
__label__z_minus_m    2123
__label__z_plus_m     1522
__label__z_zero       1419
__label__z_amb         681
Name: label, dtype: int64
__label__z_minus_m    2122
__label__z_plus_m     1509
__label__z_zero       1427
__label__z_amb         689
Name: label, dtype: int64


In [31]:
# Display the set of distinct label values collected from the frames in `dataframes`.
all_labels

{'__label__z_amb',
 '__label__z_minus_m',
 '__label__z_plus_m',
 '__label__z_zero'}

In [33]:
# Mapping from the raw "__label__..." tags found in the files to readable
# sentiment names.
replace_dict = dict(
    __label__z_amb="ambiguous",
    __label__z_minus_m="negative",
    __label__z_plus_m="positive",
    __label__z_zero="neutral",
)

In [57]:
# Replace the raw label tags with readable names and export every frame as
# "<original name without extension>_processed.csv" next to the source files.
# Labels that are already readable are accepted so the cell is safe to re-run.
known_labels = set(replace_dict) | set(replace_dict.values())
for dataset_file, df in dataframes.items():
    # Fail loudly instead of silently exporting labels the mapping does not cover.
    unknown_labels = set(df['label'].unique()) - known_labels
    if unknown_labels:
        raise ValueError(
            f"Unmapped labels in {dataset_file}: {sorted(unknown_labels)}"
        )

    # The frame stored in `dataframes` is relabelled in place, as before.
    df['label'] = df['label'].replace(replace_dict)

    # splitext drops only the final extension and, unlike split(".")[:-1],
    # does not collapse to an empty stem for a name without an extension.
    stem = os.path.splitext(dataset_file)[0]
    df.to_csv(os.path.join(PATH_TO_POLEMO_CONLL, stem + "_processed.csv"), index=False)