In [7]:
import os
import sys

import pandas as pd

from tqdm.notebook import tqdm_notebook
# register tqdm's progress-bar variants on pandas objects
# (this is what makes `df.progress_apply` available further down)
tqdm_notebook.pandas()

# expose the parent directory to the import system so project-local packages resolve
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.logging.logger import init_pandas_tqdm

# NOTE(review): the project helper is imported but its call is left disabled;
# presumably it is superseded by the direct `tqdm_notebook.pandas()` call in this cell — confirm.
# init_pandas_tqdm()

In [8]:
# raw annotation export; the content is tab-separated despite the `.csv` extension
data_path = os.path.join(
    '..', 'storage', 'annotations', 'raw_data', 'eng',
    'MWE-enWN-Rules-and-LR-and-simpleLR.csv',
)

# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which suggests the file
# carries a previously saved index as its first column — confirm whether that is intended.
df = pd.read_csv(data_path, delimiter='\t')

In [9]:
def get_final_label(row: pd.Series) -> int:
    """Derive the final integer label for a single annotation row.

    Parameters
    ----------
    row : pandas.Series (any mapping-like object works)
        Must provide the keys ``'predLR'`` and ``'simpleLR'``.

    Returns
    -------
    int
        * ``1``  if ``row['predLR'] == 'MWLU'``
        * ``0``  otherwise, if ``row['simpleLR'] == 'nonMWLU'``
        * ``-1`` in every remaining case (no decision could be derived)

    Notes
    -----
    The ``predLR`` check takes precedence: a row with ``predLR == 'MWLU'``
    is labelled ``1`` regardless of the value of ``simpleLR``.
    """
    if row['predLR'] == 'MWLU':
        return 1

    if row['simpleLR'] == 'nonMWLU':
        return 0

    # neither rule fired -> sentinel for "unspecified"
    return -1

In [10]:
# compute the label row by row (axis=1), with a tqdm progress bar
df['final_is_correct'] = df.progress_apply(get_final_label, axis=1)

df['final_is_correct']

  0%|          | 0/39406 [00:00<?, ?it/s]

0        1
1       -1
2        0
3       -1
4        1
        ..
39401    0
39402    0
39403   -1
39404    0
39405   -1
Name: final_is_correct, Length: 39406, dtype: int64

In [11]:
# filter out rows containing unspecified MWEs (label -1) and give the remaining
# columns their final names; built as one non-inplace chain so that the renamed
# frame is a fresh object rather than an in-place edit of a filtered selection
df = (
    df.loc[df['final_is_correct'] != -1]
    .rename(columns={'final_is_correct': 'is_correct', 'lemma': 'mwe'})
)

# sanity check: only the two definite labels may remain after filtering
assert df['is_correct'].isin([0, 1]).all()

df

Unnamed: 0,synset_id_PEWN,mwe,definition,PoS,variant,synonymy,cascDict,charLength,noOfSpaces,cosMPNet,predLR,confLR,rules,simpleLR,is_correct
0,417771,Illyrian movement,a pan-South-Slavist cultural and political cam...,n,2,1,1,16,1,0.425512,MWLU,0.432054,MWLU,MWLU,1
2,449392,bi-directional microphone,A microphone that receives sound equally from ...,n,1,1,0,24,1,0.736279,nonMWLU,0.680780,nonMWLU,nonMWLU,0
4,427444,bush tea,an infusion of the leaves of the rooibos plant...,n,1,1,0,7,1,0.390755,MWLU,0.479222,nonMWLU,MWLU,1
7,423462,glucosamine sulphate,Glucosamine sulfate (glucosamine sulphate) is ...,n,1,1,0,19,1,0.837984,nonMWLU,0.645533,nonMWLU,nonMWLU,0
9,424519,game controller,a device used with games or entertainment syst...,n,1,1,1,14,1,0.626435,MWLU,0.473589,MWLU,MWLU,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39399,7088917,gill raker,,n,1,1,1,9,1,0.118112,MWLU,0.388677,MWLU,MWLU,1
39400,7088918,occipital condyle,,n,1,1,1,16,1,0.029985,MWLU,0.428809,MWLU,MWLU,1
39401,7088918,condylus occipitalis,,n,1,1,0,19,1,0.029985,nonMWLU,0.560832,nonMWLU,nonMWLU,0
39402,7088920,radial styloid process,,n,1,1,0,20,2,0.055005,nonMWLU,0.593444,nonMWLU,nonMWLU,0


In [12]:
# save the DataFrame to TSV file
output_filepath = os.path.join('..', 'storage', 'annotations', 'preprocessed_data', 'eng', 'MWE-enWN-Rules-simpleLR_cleaned.tsv')

# `to_csv` does not create missing parent directories, so make sure the target folder exists
os.makedirs(os.path.dirname(output_filepath), exist_ok=True)

df.to_csv(output_filepath, sep='\t', index=False)