# Experiment: Exp3A_1

In [1]:
import pandas as pd
# Let displayed DataFrames show up to 500 characters per cell before truncating
pd.options.display.max_colwidth=500

import os
import sys

## Experiment Setup

In [2]:
# Project root: two levels above the notebook's working directory
root_dir = '../../'

# Input dataset (MAGPIE) location
base_dir = f'{root_dir}data/magpie/'
data_file = f'{base_dir}processed_MAGPIE_filtered_split_typebased.csv'

# Idioms data, to replace the idioms with their single-token representations
idioms_file = f'{root_dir}data/token_files/option1_idioms.csv'

# NOTE: This notebook should ideally modify only the contents of this exp_dir.
exp_dir = './'
# Scratch directory that receives the generated train/dev/test files
tmp_dir = f'{exp_dir}tmp/'

In [3]:
# Create the scratch directory if it is missing. exist_ok=True makes this a
# single call with no check-then-create race, and it still raises if the path
# exists but is not a directory.
os.makedirs(tmp_dir, exist_ok=True)

In [4]:
# Load the full dataset; all splits live in this one file (see the 'split' column).
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# saved row index read back as data -- confirm. Later cells select columns by name.
df_data = pd.read_csv(data_file)
df_data

Unnamed: 0,sentence_0,idiom,confidence,label,split,variant_type,offsets
0,"For example , with fell running and mountain marathons gaining in popularity , how about some ideas for safe running off the beaten track ?",off the beaten track,1.000000,i,training,identical,"[[117, 120], [125, 131], [132, 137]]"
1,I 'd keep him well in the running .,in the running,0.770109,i,training,identical,"[[19, 21], [26, 33]]"
2,"He gives me the creeps , so I looked round , hmm hmm .",give someone the creeps,1.000000,i,training,combined-inflection,"[[3, 8], [9, 11], [16, 22]]"
3,"‘ He 's done us proud , as well,’ says Granville .",do someone proud,1.000000,i,training,combined-inflection,"[[8, 12], [13, 15], [16, 21]]"
4,"People quickly embraced formal democracy , but the tolerance and compromise that is at the heart of the democratic process took time to take root .",take root,1.000000,i,training,identical,"[[136, 140], [141, 145]]"
...,...,...,...,...,...,...,...
48390,Many also have second or third jobs to make ends meet .,make ends meet,0.854973,i,test,identical,"[[39, 43], [44, 48], [49, 53]]"
48391,"Take people to objections , take them to where you want them to be and bear in mind you 're always looking for an objection",bear in mind,1.000000,i,training,identical,"[[71, 75], [76, 78], [79, 83]]"
48392,"Indeed we are rarely aware of them as rules , until they are broken , since they are typical of the settings in which we received our moral training .",as a rule,1.000000,l,training,deletion-determiner,"[[35, 37], [38, 43]]"
48393,"Unlike in a firm that is a jack of all trades , the supplier is an independent business subject to market disciplines rather than another bit of a big bureaucracy .",jack of all trades,1.000000,i,training,identical,"[[27, 31], [32, 34], [35, 38], [39, 45]]"


In [5]:
# To convert string representation of 'offsets' into a valid list
import ast


def _parse_offsets(raw_offsets):
    """Return `raw_offsets` as a Python list of [start, end] pairs.

    The CSV stores each offsets list as its string repr
    (e.g. "[[19, 21], [26, 33]]"), which is parsed with `ast.literal_eval`.
    A value that is already a list is returned untouched, so re-running this
    cell on an already-converted column is a no-op instead of an error.
    Anything else is still handed to `ast.literal_eval` and fails as before.
    """
    if isinstance(raw_offsets, list):
        return raw_offsets
    return ast.literal_eval(raw_offsets)


# A named helper (rather than a lambda whose parameter was called `os`) avoids
# shadowing the imported `os` module inside the callback.
df_data['offsets'] = df_data['offsets'].map(_parse_offsets)

In [6]:
# Names of the dataset columns used by this experiment
columns = [
    'sentence_0',
    'idiom',
    'confidence',
    'label',
    'split',
    'variant_type',
    'offsets',
]

## Replace idioms with their Single Token Representation

In [7]:
# Load the idioms file: <idiom phrase,token> mapping
df_idioms = pd.read_csv(idioms_file).set_index('idiom')
# Idiom phrase -> its single-token representation (the 'idiom_token' column)
IDIOM_TOKEN_DICT = df_idioms['idiom_token'].to_dict()
IDIOM_TOKEN_DICT

{'off the beaten track': 'IDoffthebeatentrackID',
 'in the running': 'IDintherunningID',
 'give someone the creeps': 'IDgivesomeonethecreepsID',
 'do someone proud': 'IDdosomeoneproudID',
 'take root': 'IDtakerootID',
 'clean house': 'IDcleanhouseID',
 'make history': 'IDmakehistoryID',
 'go all the way': 'IDgoallthewayID',
 'chapter and verse': 'IDchapterandverseID',
 'break the bank': 'IDbreakthebankID',
 'head for the hills': 'IDheadforthehillsID',
 'in a fog': 'IDinafogID',
 'bring up the rear': 'IDbringuptherearID',
 'in the hole': 'IDintheholeID',
 'true to form': 'IDtruetoformID',
 'rags to riches': 'IDragstorichesID',
 'on the ball': 'IDontheballID',
 'stake a claim': 'IDstakeaclaimID',
 'up for grabs': 'IDupforgrabsID',
 'up and running': 'IDupandrunningID',
 'behind bars': 'IDbehindbarsID',
 'in my book': 'IDinmybookID',
 'in black and white': 'IDinblackandwhiteID',
 'go up in smoke': 'IDgoupinsmokeID',
 'up the river': 'IDuptheriverID',
 'make your blood boil': 'IDmakeyourbl

In [8]:
# Map the idiom phrase with token and then replace the phrase in the 'sentence_0' column
def map_and_replace_by_idiom_token(row):
    """Return the row's sentence with its idiom occurrence collapsed into one token.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'idiom'      -- phrase used as the key into IDIOM_TOKEN_DICT,
            'sentence_0' -- the sentence text,
            'offsets'    -- list of [start, end] character spans in the sentence.

    Returns:
        The sentence in which everything from the earliest span start to the
        latest span end is replaced by the idiom's token. Any text lying
        between the spans falls inside that range and is replaced as well.

    Raises:
        KeyError: if the idiom phrase has no entry in IDIOM_TOKEN_DICT.
        IndexError: if 'offsets' is empty.
    """
    # Get the token
    token = IDIOM_TOKEN_DICT[row['idiom']]

    sentence, offsets = row['sentence_0'], row['offsets']
    if not offsets:
        raise IndexError(
            f"no offsets given for idiom {row['idiom']!r}; cannot locate it in the sentence"
        )

    # Take min/max instead of first/last so the result does not depend on the
    # spans being listed in sentence order.
    start = min(span[0] for span in offsets)  # start of idiom phrase
    end = max(span[1] for span in offsets)    # end of idiom phrase

    # Replace the idiom phrase with a single token in the sentence
    return sentence[:start] + token + sentence[end:]

# The offsets index into the *original* sentence text. Keep an untouched copy
# and always substitute against it; otherwise re-running this cell would apply
# the offsets to already-replaced sentences and silently corrupt them.
if 'sentence_0_raw' not in df_data.columns:
    df_data['sentence_0_raw'] = df_data['sentence_0']

df_data['sentence_0'] = (
    df_data
    .assign(sentence_0=df_data['sentence_0_raw'])
    .apply(map_and_replace_by_idiom_token, axis=1)
)

## Prepare & save the train, dev & test sets

In [9]:
# Integer class id for each value of the 'label' column
# (presumably 'i' = idiomatic and 'l' = literal usage -- confirm against the dataset docs)
label_to_id = {
    'i': 0,
    'l': 1,
}

In [10]:
# Number of rows in each value of the 'split' column
df_data['split'].value_counts()

training       38715
test            4840
development     4840
Name: split, dtype: int64

In [11]:
# Keep only the columns needed to build the per-split datasets
df_tmp = df_data.loc[:, ['sentence_0', 'label', 'split']]

# Select the rows of each split by the value in the 'split' column
df_train = df_tmp.loc[df_tmp['split'].eq('training')]
df_dev = df_tmp.loc[df_tmp['split'].eq('development')]
df_test = df_tmp.loc[df_tmp['split'].eq('test')]

def clean_df(df):
    """Clean each of the datasets.

    Drops the 'split' column and converts the 'label' column to integer ids
    via `label_to_id`. The input frame is not modified.

    Args:
        df: DataFrame with at least the columns 'label' and 'split'.

    Returns:
        A new DataFrame without 'split' and with 'label' mapped to ids.

    Raises:
        ValueError: if any 'label' value has no entry in `label_to_id`.
            Without this check such rows would silently become NaN and turn
            the whole column into floats.
    """
    label_ids = df['label'].map(label_to_id)

    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f'label value(s) missing from label_to_id: {unknown}')

    return df.drop(columns=['split']).assign(label=label_ids)

# Clean the datasets: run clean_df over each split and rebind the three names
df_train, df_dev, df_test = map(clean_df, (df_train, df_dev, df_test))

In [12]:
# Save data to tmp files (one CSV per split, without the index column)
train_csv = f'{tmp_dir}train.csv'
df_train.to_csv(train_csv, index=False)

dev_csv = f'{tmp_dir}dev.csv'
df_dev.to_csv(dev_csv, index=False)

test_csv = f'{tmp_dir}test.csv'
df_test.to_csv(test_csv, index=False)

print(f'Saved the files to {tmp_dir}')

Saved the files to ./tmp/
