In [None]:
import pandas as pd
import collections
from pathlib import Path
import nltk
from nltk import sent_tokenize

# The project root is taken to be the parent of the current working directory;
# every dataset used below is read from / written to its `data` sub-folder.
current_path = Path().resolve().parent
data_path = current_path.joinpath('data')
print(current_path)

In [None]:
# Source data: the IBM stance dataset.
dataframe_path = data_path / 'ibm' / 'stance_ibm.csv'
stance_df = pd.read_csv(dataframe_path)

# Distribution of the raw (numeric) labels, then of the 'Set' column.
print(collections.Counter(stance_df['Label']))
print(collections.Counter(stance_df['Set']))

# Replace the numeric stance codes by their string names. A code that is not
# one of -1, 0, 1 raises KeyError instead of being passed through silently.
stance_names = {0: 'NEU', -1: 'CON', 1: 'PRO'}
stance_df['Label'] = stance_df['Label'].apply(lambda code: stance_names[code])
print(collections.Counter(stance_df['Label']))

# Keep the row index as an explicit identifier so that the copies of a row
# created later can be traced back to the same instance.
stance_df['id'] = stance_df.index

In [3]:
"""
Three versions of each row are created, one for each of the three possible predicted labels (PRO, CON, NEU).
"""
def expand_dataframe(stance_df):
    """Repeat every row of ``stance_df`` once per candidate prediction.

    Each input row appears three consecutive times in the result, with the
    new ``predicted`` column set to 'PRO', 'CON' and 'NEU' (in that order).
    If ``stance_df`` already has a ``predicted`` column it is overwritten.

    The expansion is done with a single positional take instead of a
    row-by-row ``iterrows`` loop, so the original column dtypes are kept and
    an empty input still yields a frame with all the expected columns.

    Args:
        stance_df: DataFrame to expand; it is not modified.

    Returns:
        A new DataFrame with ``3 * len(stance_df)`` rows, the input columns
        plus ``predicted``, and a fresh 0..n-1 index.
    """
    predicted_values = ['PRO', 'CON', 'NEU']
    n_rows = len(stance_df)

    # Positions 0,0,0,1,1,1,... : each source row once per candidate label.
    repeated_positions = [pos for pos in range(n_rows) for _ in predicted_values]
    expanded = stance_df.iloc[repeated_positions].reset_index(drop=True)

    # The label cycle PRO, CON, NEU lines up with the repeated positions above.
    expanded['predicted'] = predicted_values * n_rows
    return expanded

"""
Get the combination to sample next: among the combinations whose gold label is the given stance, pick the one that has been sampled the fewest times so far.
"""
def get_comb(gold_stance, combinations_counter):
    """Pick the least-used (gold, predicted) combination for a gold stance.

    Args:
        gold_stance: Gold label; only keys whose first element equals it are
            considered.
        combinations_counter: Mapping from ``(gold, predicted)`` tuples to the
            number of times each combination has been sampled so far.

    Returns:
        The matching key with the smallest count. Ties are resolved in favour
        of the key that comes first in the counter's iteration order.

    Raises:
        ValueError: If no key of the counter starts with ``gold_stance``.
    """
    candidates = [comb for comb in combinations_counter if comb[0] == gold_stance]
    if not candidates:
        raise ValueError(
            "no combination available for gold stance {!r}".format(gold_stance)
        )

    # min() returns the first minimal element, which gives the same
    # tie-breaking as a stable sort by count followed by taking the head.
    return min(candidates, key=combinations_counter.__getitem__)

"""
Defines a combination counter covering all possible combinations of correct and predicted label.
Then, for each instance, selects the combination to sample and adds the matching row to the new data.
Used for balancing the samples across combinations.
"""
def get_combined_subset(df):
    """Keep one row per ``id``, rotating through (gold, predicted) combinations.

    Instances are visited in order of first appearance of their ``id``. For
    each one, the combination with the lowest running count for its gold
    label is chosen (see ``get_comb``), the row carrying that ``predicted``
    value is kept, and the count is incremented. This spreads the instances
    of each gold label evenly over the three possible predictions.

    The instances are walked with a single ``groupby`` pass instead of
    re-filtering the whole frame for every id, and rows are selected by
    position so column dtypes are preserved.

    Args:
        df: DataFrame with at least the columns ``id``, ``Label`` and
            ``predicted``; it is not modified.

    Returns:
        A new DataFrame with the same columns as ``df``, one row per distinct
        ``id`` and a fresh 0..n-1 index.

    Raises:
        IndexError: If an instance has no row for the chosen combination.
    """
    combinations = [
        ('PRO', 'PRO'), ('PRO', 'CON'), ('PRO', 'NEU'),
        ('CON', 'PRO'), ('CON', 'CON'), ('CON', 'NEU'),
        ('NEU', 'PRO'), ('NEU', 'CON'), ('NEU', 'NEU')
    ]

    combinations_counter = {k: 0 for k in combinations}

    # With a 0..n-1 index, index labels double as row positions.
    frame = df.reset_index(drop=True)
    selected_positions = []

    # sort=False keeps the groups in order of first appearance, which matters
    # because the running counter makes the selection order-dependent.
    # NOTE: groupby skips rows whose id is NaN.
    for an_id, subdf in frame.groupby('id', sort=False):
        gold_stance = subdf['Label'].iloc[0]

        comb = get_comb(gold_stance, combinations_counter)

        is_match = (subdf['Label'] == comb[0]) & (subdf['predicted'] == comb[1])
        matches = subdf.index[is_match]
        if len(matches) == 0:
            raise IndexError(
                "id {!r} has no row for combination {!r}".format(an_id, comb)
            )

        combinations_counter[comb] += 1
        selected_positions.append(matches[0])

    return frame.iloc[selected_positions].reset_index(drop=True)

"""
add error category based on correct and predicted label.
"""
def get_error_type(row):
    """Classify a (gold label, predicted label) pair into an error category.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'Label'`` (gold) and ``'predicted'``.

    Returns:
        ``"correct"`` when both labels are equal, ``"flipped"`` for PRO<->CON,
        ``"neutralized"`` when a PRO/CON gold label is predicted as NEU,
        ``"polarized"`` when a NEU gold label is predicted as PRO/CON, and
        ``None`` for any other pair.
    """
    gold, guess = row['Label'], row['predicted']

    if gold == guess:
        return "correct"

    # Every recognised mismatch, keyed by (gold, predicted).
    mismatch_kinds = {
        ('PRO', 'CON'): "flipped",
        ('CON', 'PRO'): "flipped",
        ('PRO', 'NEU'): "neutralized",
        ('CON', 'NEU'): "neutralized",
        ('NEU', 'PRO'): "polarized",
        ('NEU', 'CON'): "polarized",
    }
    return mismatch_kinds.get((gold, guess))


In [None]:
# Train split: create the three (gold, predicted) candidates of every
# instance, then keep a single candidate per instance.
expanded_train_df = expand_dataframe(stance_df[stance_df['Set'] == 'train'])

combined_train_df = get_combined_subset(expanded_train_df)
print(combined_train_df.shape)

# Group by the combinations of 'Label' and 'predicted' and downsample every
# combination to the size of the rarest one.
grouped = combined_train_df.groupby(['Label', 'predicted'])
min_size = int(grouped.size().min())

# GroupBy.sample keeps the grouping columns in the result (newer pandas
# versions deprecate GroupBy.apply operating on them), and the fixed
# random_state makes the drawn subset reproducible across runs.
combined_train_df = grouped.sample(n=min_size, random_state=42).reset_index(drop=True)

combination_counts = combined_train_df.groupby(['Label', 'predicted']).size().reset_index(name='count')
print(combination_counts)

combined_train_df['error_type'] = combined_train_df.apply(get_error_type, axis=1)
print(collections.Counter(combined_train_df['error_type']))

In [None]:
# Dev split: create the three (gold, predicted) candidates of every
# instance, then keep a single candidate per instance.
expanded_dev_df = expand_dataframe(stance_df[stance_df['Set'] == 'dev'])

combined_dev_df = get_combined_subset(expanded_dev_df)
print(combined_dev_df.shape)

# Group by the combinations of 'Label' and 'predicted' and downsample every
# combination to the size of the rarest one.
grouped = combined_dev_df.groupby(['Label', 'predicted'])
min_size = int(grouped.size().min())

# GroupBy.sample keeps the grouping columns in the result (newer pandas
# versions deprecate GroupBy.apply operating on them), and the fixed
# random_state makes the drawn subset reproducible across runs.
combined_dev_df = grouped.sample(n=min_size, random_state=42).reset_index(drop=True)

combination_counts = combined_dev_df.groupby(['Label', 'predicted']).size().reset_index(name='count')
print(combination_counts)

combined_dev_df['error_type'] = combined_dev_df.apply(get_error_type, axis=1)
print(collections.Counter(combined_dev_df['error_type']))

In [None]:
# Test split: create the three (gold, predicted) candidates of every
# instance, then keep a single candidate per instance.
expanded_test_df = expand_dataframe(stance_df[stance_df['Set'] == 'test'])

combined_test_df = get_combined_subset(expanded_test_df)
print(combined_test_df.shape)

# Group by the combinations of 'Label' and 'predicted' and downsample every
# combination to the size of the rarest one.
grouped = combined_test_df.groupby(['Label', 'predicted'])
min_size = int(grouped.size().min())

# GroupBy.sample keeps the grouping columns in the result (newer pandas
# versions deprecate GroupBy.apply operating on them), and the fixed
# random_state makes the drawn subset reproducible across runs.
combined_test_df = grouped.sample(n=min_size, random_state=42).reset_index(drop=True)

combination_counts = combined_test_df.groupby(['Label', 'predicted']).size().reset_index(name='count')
print(combination_counts)

combined_test_df['error_type'] = combined_test_df.apply(get_error_type, axis=1)
print(collections.Counter(combined_test_df['error_type']))

In [None]:
# Put the three balanced splits back into a single frame; the 'Set' column
# still tells which split each row belongs to.
new_df = pd.concat([combined_train_df, combined_dev_df, combined_test_df])

# Shuffle the rows. The fixed random_state keeps the order (and therefore the
# saved file) identical across runs.
new_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(new_df.shape)
# Error-type distribution of each split.
for s in ['train', 'dev', 'test']:
    print(s, collections.Counter(new_df.loc[new_df['Set'] == s, 'error_type']))

In [8]:
# Write the generated sample next to the source data, without the index column.
output_path = data_path / 'ibm' / 'generated_ibm_sample1.csv'
new_df.to_csv(output_path, index=False)