<a href="https://colab.research.google.com/github/AromaR/685_Project/blob/main/685_DataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import time
import os
import sys
from google.colab import output

# import threading
# from ipywidgets import interactive
# from IPython.display import clear_output
# from ipywidgets.widgets import Button, Layout, Label, HBox

## **Step 0: Load original TACRED data and probing task data**

In [None]:
# Mount Google Drive at /content/drive so the project files referenced below can be read.
# force_remount=True requests a fresh mount even when the Drive is already mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
## Manually upload files to Colab. Option not considered.
# from google.colab import files 
# uploaded = files.upload()                

In [None]:
# Read the original TACRED training split from the mounted Drive folder.
# ATTENTION: If pandas cannot load the file, make an identical folder in your own Drive.
train_df = None  # reset first, so a failed read does not leave a frame from an earlier run behind
train_df = pd.read_json(
    '/content/drive/My Drive/685_project/TACRED_original_data/train.json'
).reset_index(drop=False)  # drop=False keeps the previous row labels as an explicit 'index' column

In [None]:
# Row count of the training frame, followed by its column labels (one item per line).
print(len(train_df), train_df.columns, sep='\n')

68124
Index(['index', 'id', 'docid', 'relation', 'token', 'subj_start', 'subj_end',
       'obj_start', 'obj_end', 'subj_type', 'obj_type', 'stanford_pos',
       'stanford_ner', 'stanford_head', 'stanford_deprel'],
      dtype='object')


In [None]:
# Folder on the mounted Drive that holds the probing-task files.
# ATTENTION: If pandas cannot load, make an identical folder in your drive
drive_dir = '/content/drive/My Drive/685_project'


def _read_probing_file(file_name):
    """Read the first three columns of a tab-separated, header-less probing-task file in drive_dir."""
    return pd.read_csv(drive_dir + '/' + file_name, sep='\t', usecols=[0, 1, 2], header=None)


head_gr_df = _read_probing_file('argument_head_grammatical_role.txt')
tail_gr_df = _read_probing_file('argument_tail_grammatical_role.txt')
entity_exists = _read_probing_file('entity_exists_between_head_tail.txt')
sent_length = _read_probing_file('sentence_length.txt')
arg_order = _read_probing_file('argument_order.txt')

In [None]:
# Human-readable meanings of the integer values used by the probing tasks.
entity_dict = {
    0: "Entity doesn't exist between head and tail.",
    1: "Entity exists between head and tail.",
}
grammar_role_dict = {
    0: '0, other',
    1: '1, nsubj (nominal subject)',
    2: '2, dobj (direct object)',
    3: '3, iobj (indirect object)',
    4: '4, nsubjpass (nominal subject of passive clause)',
}

In [None]:
# Quick sanity check of the loaded probing frames: two row counts and two sets of column labels.
print(len(head_gr_df), head_gr_df.columns, len(sent_length), arg_order.columns, sep='\n')

105914
Int64Index([0, 1, 2], dtype='int64')
102897
Int64Index([0, 1, 2], dtype='int64')


# Define some useful functions

In [None]:
# Small demonstration of the blocking input() prompt; the answer is echoed back.
# NOTE(review): this cell waits for keyboard input, so "Run all" pauses here until something is typed.
a = input("What's your hobby? \n")
print(f"So, your hobby is {a}. Awesome!")

What's your hobby? 
football
So, your hobby is football. Awesome!


In [None]:
def Add_attribute(src_df, target_df, attr_name):
    """Copy a probing-task value onto the matching rows of the TACRED dataframe, in place.

    src_df: source dataframe imported from the probing task data; column 1 holds the
            example id and column 2 the value to copy.
    target_df: target dataframe imported from the original TACRED data; must have an
               'id' column. It is modified in place.
    attr_name: name of the column that is created (or overwritten) in target_df.

    Rows of target_df whose id does not occur in src_df receive ''. If an id occurs
    several times in src_df, the value of its last occurrence is used.
    Returns None.
    """
    # Build the id -> value lookup once (later duplicates overwrite earlier ones) instead of
    # scanning the whole target frame for every source row.
    id_to_value = dict(zip(src_df[1], src_df[2]))
    # A plain list keeps the original value types and uses '' for ids without a match.
    target_df[attr_name] = [id_to_value.get(example_id, '') for example_id in target_df['id']]
            
# Display examples that satisfy certain property
def Display(dataframe, num, attribute=None, value=None):
    """Print up to `num` randomly chosen examples from a TACRED dataframe.

    dataframe: the input dataframe. Each row must provide 'id', 'relation', 'token',
               'subj_start', 'subj_end', 'obj_start', 'obj_end' and the probing columns
               'Entity_exists', 'Head_gram_role', 'Tail_gram_role' and 'Sent_len'.
    num: the maximum number of examples to display.
    attribute: column used for filtering.
    value: value the attribute has to equal.
           Only when both attribute and value are given are the rows filtered;
           otherwise the examples are drawn from the whole dataframe.

    The module-level entity_dict and grammar_role_dict are used to translate the
    probing values into text. Returns None.
    """
    if attribute is not None and value is not None:
        candidates = dataframe.loc[dataframe[attribute] == value]
    else:
        candidates = dataframe

    # Sample only the rows that are going to be printed instead of shuffling the whole frame.
    n_shown = min(num, len(candidates))
    if n_shown <= 0:
        return

    for _, row in candidates.sample(n=n_shown).iterrows():
        print(f"ID: {row['id']}, relation: {row['relation']}.")
        token = row['token']
        sentence = ' '.join(token)
        # The span ends are inclusive, hence the +1 when slicing.
        subject = ' '.join(token[row['subj_start']: row['subj_end']+1])
        obj = ' '.join(token[row['obj_start']: row['obj_end']+1])
        # NOTE: subject and object are reported as they are; they are not mapped to
        # head/tail through the 'Arg_order' column here.

        # Wrap the text manually, 150 characters per line. Stepping over the start offsets
        # avoids an extra empty line when the length is an exact multiple of 150.
        for offset in range(0, len(sentence), 150):
            print(sentence[offset:offset + 150])

        entity = row['Entity_exists']
        sen_len = row['Sent_len']
        print(f"Subject: {subject} at {(row['subj_start'], row['subj_end'])}, object: {obj} at {(row['obj_start'], row['obj_end'])}. {entity_dict.get(entity, None)}")
        head_gr = row['Head_gram_role']
        tail_gr = row['Tail_gram_role']
        print(f"Head grammatical role: {grammar_role_dict.get(head_gr, None)}; tail grammatical role: {grammar_role_dict.get(tail_gr, None)}.")
        print(f"Sentence length belongs to the {sen_len}th bin. \n")

## Step 1: Add some probing task properties to a single dataframe 

In [None]:
# Add probing task properties to train_df
srt = time.time()
for probe_df, column_name, progress_msg in [
    (entity_exists, 'Entity_exists', "entity exists"),
    (head_gr_df, 'Head_gram_role', "head gram role"),
    (tail_gr_df, 'Tail_gram_role', "tail gram role"),
    (sent_length, 'Sent_len', "sent_len"),
    (arg_order, 'Arg_order', "arg_order"),
]:
    # Only the first 10000 rows of each probing frame are merged into train_df.
    Add_attribute(probe_df[:10000], train_df, column_name)
    print(progress_msg)
end = time.time()
print("Time elapsed {:.1f} secs.".format(end - srt))

entity exists
head gram role
tail gram role
sent_len
arg_order
Time elapsed 489.3 secs.


In [None]:
# Randomly display 5 samples whose given attribute ('Tail_gram_role') has the given value (3)
Display(train_df, num=5, attribute='Tail_gram_role', value=3)

ID: 61b3a65fb93253d7873b, relation: no_relation.
I suppose now Scheider can compare notes once again with Bob Fosse , the director who gave him the finest role of his career , and the performance that should have won him an Academy Award - the statue that year went to Dustin Hof zz zz in Kramer vs Kra zz zz .
Subject: Scheider at (3, 3), object: him at (17, 17). Entity exists between head and tail.
Head grammatical role: 1, nsubj (nominal subject); tail grammatical role: 3, iobj (indirect object).
Sentence length belongs to the 8th bin. 

ID: 61b3a65fb9015842004e, relation: no_relation.
The Asian Development Bank on Thursday said it expected to offer Pakistan a $ 2 billion emergency loan for help repairing roads , bridges and other infrastructure damaged by the record flooding in the country .
Subject: Asian Development Bank at (1, 3), object: Pakistan at (11, 11). Entity exists between head and tail.
Head grammatical role: 1, nsubj (nominal subject); tail grammatical role: 3, iobj (in

In [None]:
# Randomly display 5 samples taken from the first 10000 rows
Display(train_df.head(10000), num=5)

ID: 61b3a65fb9a02079219a, relation: no_relation.
It is notable that Zubaydah is not one of those charged on Monday , despite his alleged connection to the September 11 attacks .
Subject: his at (15, 15), object: one at (7, 7). Entity exists between head and tail.
Head grammatical role: 0, other; tail grammatical role: 0, other.
Sentence length belongs to the 2th bin. 

ID: 61b3a5c8c9129922723d, relation: org:founded_by.
1999 : President Lee Teng-hui confers the Order of the Brilliant Star with a Violet Grand Cordon on Samuel Noordhoff , founder of the Noordhoff Craniofacial Foundation , for his devoted service to local citizens over the past four decades .
Subject: Noordhoff Craniofacial Foundation at (24, 26), object: Samuel Noordhoff at (18, 19). Entity doesn't exist between head and tail.
Head grammatical role: 0, other; tail grammatical role: 0, other.
Sentence length belongs to the 6th bin. 

ID: 61b3a65fb91d549b91f5, relation: no_relation.
JetBlue has already matched Virgin Ameri

## Step 2: User Interface for Labeling (manual input)


In [None]:
def annotate(dataframe, start, end):
    """Main method to annotate data interactively.

    For every row position i in range(start, end) the example is printed and the user is
    asked for two integer labels, which are written to the 'Head_add' and 'Tail_add'
    columns of that same row.

    Columns of the expected dataframe:
      ['index', 'id', 'docid', 'relation', 'token', 'subj_start', 'subj_end',
       'obj_start', 'obj_end', 'subj_type', 'obj_type', 'stanford_pos',
       'stanford_ner', 'stanford_head', 'stanford_deprel', 'Entity_exists',
       'Head_gram_role', 'Tail_gram_role', 'Sent_len', 'Arg_order']

    dataframe: TACRED dataframe with data to be labeled; modified in place.
    start: position of the start sentence (inclusive).
    end: position of the end sentence (exclusive).

    Reads the module-level entity_dict and grammar_role_dict and uses the Colab `output`
    helper to clear the cell output. Returns None.
    """
    def _ask_label(prompt):
        # Keep asking until the answer parses as an integer, so that a typo or an empty
        # answer does not abort the labeling session.
        while True:
            answer = input(prompt)
            try:
                return int(answer)
            except ValueError:
                print(f"'{answer}' is not an integer label, please try again.")

    legend = "5 for appositional modifier, 6 for noun compound modifier, 0 for neither."
    print(legend)
    for i in range(start, end):
        print(f"\n Iteration {i}")
        row = dataframe.iloc[i]
        print(f"ID: {row['id']}, relation: {row['relation']}.")
        token = row['token']
        sentence = ' '.join(token)
        # The span ends are inclusive, hence the +1 when slicing.
        subject = ' '.join(token[row['subj_start']: row['subj_end']+1])
        obj = ' '.join(token[row['obj_start']: row['obj_end']+1])
        # Wrap the text manually, 150 characters per line, without a trailing empty line
        # when the length is an exact multiple of 150.
        for offset in range(0, len(sentence), 150):
            print(sentence[offset:offset + 150])
        head_gr = row['Head_gram_role']
        tail_gr = row['Tail_gram_role']
        entity = row['Entity_exists']
        # The entity status is printed once, with an explicit fallback for unknown values.
        print(f"Subject: {subject} at {(row['subj_start'], row['subj_end'])}, object: {obj} at {(row['obj_start'], row['obj_end'])}. {entity_dict.get(entity, 'Entity info unknown.')}")
        print(f"Head grammatical role: {grammar_role_dict.get(head_gr, None)}, tail grammatical role: {grammar_role_dict.get(tail_gr, None)}.")
        head_add = _ask_label("Additional grammatical role for the head argument? \n")
        tail_add = _ask_label("Additional grammatical role for the tail argument? \n")
        # The row was read by position, so write to the label found at that position;
        # writing to label `i` would hit a different row whenever the index is not 0..n-1.
        row_label = dataframe.index[i]
        dataframe.loc[row_label, 'Head_add'] = head_add
        dataframe.loc[row_label, 'Tail_add'] = tail_add
        if i % 5 == 0:
            # Clear the cell output periodically and show the legend again.
            output.clear()
            print(legend)
    print("End of loop.")

def new_annotate(dataframe, start, end):
    """New method to annotate data, which shall be used for dataframe loaded from excel.

    For every row position i in range(start, end) the example is printed and the user is
    asked for two integer labels, which are written to the 'Head_add' and 'Tail_add'
    columns of that same row.

    columns: ['index', 'id', 'relation', 'Head_gram_role', 'Tail_gram_role', 'sentence', 'subject', 'object', 'Head_add', 'Tail_add']
    dataframe: TACRED dataframe with data to be labeled; modified in place.
    start: position of the start sentence (inclusive).
    end: position of the end sentence (exclusive).

    Uses the Colab `output` helper to clear the cell output. Returns None.
    """
    def _ask_label(prompt):
        # Keep asking until the answer parses as an integer, so that a typo or an empty
        # answer does not abort the labeling session.
        while True:
            answer = input(prompt)
            try:
                return int(answer)
            except ValueError:
                print(f"'{answer}' is not an integer label, please try again.")

    legend = "5 for appositional modifier, 6 for noun compound modifier, 0 for neither."
    print(legend)
    for i in range(start, end):
        print(f"\n Iteration {i}")
        row = dataframe.iloc[i]
        print(f"ID: {row['id']}, relation: {row['relation']}.")
        sentence = row['sentence']
        subject = row['subject']
        obj = row['object']
        # Wrap the text manually, 150 characters per line, without a trailing empty line
        # when the length is an exact multiple of 150.
        for offset in range(0, len(sentence), 150):
            print(sentence[offset:offset + 150])
        head_gr = row['Head_gram_role']
        tail_gr = row['Tail_gram_role']
        print(f"Subject: {subject}, object: {obj}.")
        print(f"Head grammatical role: {head_gr}, tail grammatical role: {tail_gr}.")
        head_add = _ask_label("Additional grammatical role for the head argument? \n")
        tail_add = _ask_label("Additional grammatical role for the tail argument? \n")
        # The row was read by position, so write to the label found at that position;
        # writing to label `i` would hit a different row whenever the index is not 0..n-1.
        row_label = dataframe.index[i]
        dataframe.loc[row_label, 'Head_add'] = head_add
        dataframe.loc[row_label, 'Tail_add'] = tail_add
        if i % 5 == 0:
            # Clear the cell output periodically and show the legend again.
            output.clear()
            print(legend)
    print("End of loop.")

def convert(tr_df, start, end):
  """Convert train dataframe to a simpler dataframe, to be read and written directly.

  tr_df: the train dataframe to be converted. Needs the columns 'index', 'id', 'relation',
         'token', 'subj_start', 'subj_end', 'obj_start', 'obj_end', 'Head_gram_role' and
         'Tail_gram_role'. It is not modified.
  start: position of the start sentence (inclusive).
  end: position of the end sentence (exclusive).

  Returns a new dataframe with the columns ['index', 'id', 'relation', 'Head_gram_role',
  'Tail_gram_role', 'sentence', 'subject', 'object', 'Head_add', 'Tail_add'] that keeps the
  row labels of the selected rows. 'Head_add' and 'Tail_add' are initialised with ''.
  The grammatical roles are translated with the module-level grammar_role_dict
  ('NA' for unknown values).
  """
  import ast  # local import: only needed to parse token lists that were stored as text

  def _as_token_list(raw):
    # A token list that went through a spreadsheet comes back as its text representation;
    # turn it into a list again. Anything that cannot be parsed is used unchanged.
    if isinstance(raw, str):
      try:
        parsed = ast.literal_eval(raw)
      except (ValueError, SyntaxError):
        return raw
      if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return raw

  def _describe_span(tokens, first, last):
    # Span text followed by its (start, exclusive end) positions; `last` itself is inclusive.
    first, last = int(first), int(last)
    return ' '.join(tokens[first: last+1]) + " at" + str((first, last+1))

  selected = tr_df.iloc[start: end]
  # An explicit copy keeps the new columns out of tr_df and avoids SettingWithCopy issues.
  out_df = selected[['index', 'id', 'relation']].copy()

  head_roles, tail_roles, sentences, subjects, objects = [], [], [], [], []
  for _, row in selected.iterrows():
    tokens = _as_token_list(row['token'])
    sentences.append(' '.join(tokens))
    head_roles.append(grammar_role_dict.get(row['Head_gram_role'], 'NA'))
    tail_roles.append(grammar_role_dict.get(row['Tail_gram_role'], 'NA'))
    subjects.append(_describe_span(tokens, row['subj_start'], row['subj_end']))
    objects.append(_describe_span(tokens, row['obj_start'], row['obj_end']))

  # Whole columns are assigned by position, so the result does not depend on the row labels.
  out_df['Head_gram_role'] = head_roles
  out_df['Tail_gram_role'] = tail_roles
  out_df['sentence'] = sentences
  out_df['subject'] = subjects
  out_df['object'] = objects
  out_df['Head_add'] = ''
  out_df['Tail_add'] = ''
  print("Convert complete.")
  return out_df

In [None]:
# Label the rows at positions 3001 up to (but not including) 3010 of train_df.
annotate(dataframe=train_df, start=3001, end=3010)

5 for appositional modifier, 6 for compound modifier, 0 for neither.

 Iteration 3001
ID: 61b3a65fb992d6b51be2, relation: per:title.
Others included Sir George Young , chairman of the Standards and Privileges Committee , and Sir Stuart Bell , who is one of the senior MPs reviewing parliamentary expenses .
Subject: George Young at (3, 4), object: chairman at (6, 6). Entity doesn't exist between head and tail.
Head grammatical role: 2, dobj (direct object), tail grammatical role: 0, other.
Entity doesn't exist between head and tail.
Additional grammatical role for the head argument? 
5


In [None]:
# Display your work: id and both added labels for the first 100 rows
print(train_df[['id', 'Head_add', 'Tail_add']].iloc[0:100])

## Step 3: Output the Data



Attention: We still need to decide how to divide the data so that we don't do repetitive work. Be careful about exporting the data for now, to avoid overwriting others' work.


In [None]:
# change to your own file
# train_df[1500:3000].to_excel(r'labeled_your_name.xlsx', index=False)
# !cp labeled_your_name.xlsx '/content/drive/My Drive/685_project/REval/'

If your working file only contains these columns: ['index', 'id', 'relation', 'Head_gram_role', 'Tail_gram_role', 'sentence', 'subject', 'object', 'Head_add', 'Tail_add'], you can directly load data and annotate from here:

In [None]:
## Next time you label the data, you can directly read the excel file:
# train_df_KL = pd.read_excel('/content/drive/My Drive/685_project/REval/KL_df.xlsx')

In [None]:
# new_annotate(train_df_KL, 200, 220)

If your working file has many more columns, convert it into a simpler dataframe first:

In [None]:
# train_df_Bob = pd.read_excel('/content/drive/My Drive/685_project/REval/labeled_Bob.xlsx')
# tr_df_Bob = convert(train_df_Bob, 0, len(train_df_Bob))

In [None]:
# new_annotate(tr_df_Bob, 1000, 1020)