Build a dataframe of premise-claim pairs from the micro corpus, to be classified with the trained BERT model

In [1]:
import json
import pandas as pd
import random
import itertools
import pickle

In [2]:
def read_in_data(num):
    """
    Load one of the known corpora from "Data/<corpus name>.json" into a dataframe.

    The file is read as JSON Lines: every non-blank line is parsed as one JSON
    document and becomes one row of the result.

    :num: index into the corpus list
          (0: cmv-hidey, 1: usdeb, 2: essay_1, 3: micro_struc)
    :return: a pandas DataFrame with one row per JSON line; for the micro_struc
             corpus the "discourse" column is dropped
    :raises IndexError: if num is not a valid index into the corpus list
    """
    corpus_list=["cmv-hidey","usdeb","essay_1","micro_struc"]
    corpus_name = corpus_list[num]

    print("Choosen corpus is:",corpus_name)

    # JSON text is UTF-8 by specification; an explicit encoding keeps the result
    # independent of the platform's default locale.
    with open("Data/"+corpus_name+".json", encoding="utf-8") as f:
        # skip blank lines (e.g. a trailing empty line), which json.loads rejects
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Compare by name rather than by index so that negative indexing (num == -1)
    # is treated the same way as num == 3.
    if corpus_name == "micro_struc":
        df = df.drop("discourse", axis=1)

    return df

In [3]:
# Load the micro_struc corpus (entry 3 of corpus_list in read_in_data).
df=read_in_data(3)

Choosen corpus is: micro_struc


In [4]:
#make a dataframe containing the relations between units and the text belonging to them
# For every document two things are collected:
#   text_list[k] - the flat list of token surface forms of document k
#   df_list[k]   - one row per relation, extended with the role and the
#                  (first token, last token) span of both participating units
df_list=[]
text_list=[]

for num in range(len(df)):
    # NOTE(review): the token parts (column 3) and the annotation (column 4) are
    # addressed by position, which depends on the column order of df - confirm.
    text = [
        word["surface"]
        for part_list in df.iloc[num,3]
        for word in part_list["tokens"]
    ]

    annotation = df.iloc[num,4]
    units = annotation["units"]
    arguments = annotation["arguments"]

    #look up the text id and the role of each unit
    #the text id is given as the first and last word position of the unit
    # The dictionary is keyed by the unit's position in `units`.
    # NOTE(review): the lookups below use the values of the unit1/unit2 columns
    # as these positional keys - confirm that this matches the corpus format.
    units_dict = {
        position: ((unit["tokens"][0], unit["tokens"][-1]), unit["attributes"]["role"])
        for position, unit in enumerate(units)
    }

    #make a dataframe
    unitframe = pd.DataFrame(arguments)

    # Resolve both ends of every relation once. `.get` keeps a document without
    # any relation from failing on the missing column.
    unit1_info = [units_dict[key] for key in unitframe.get("unit1", [])]
    unit2_info = [units_dict[key] for key in unitframe.get("unit2", [])]

    # The columns are created directly from the looked-up values (instead of
    # being initialised with "" and filled cell by cell), so the tuple-valued id
    # columns do not depend on the dtype of a placeholder column.
    # The column order is kept because the following cell reads the id columns
    # by position.
    unitframe["unit1_role"] = [role for _, role in unit1_info]
    unitframe["unit2_role"] = [role for _, role in unit2_info]

    unitframe["unit1_id"] = [span for span, _ in unit1_info]
    unitframe["unit2_id"] = [span for span, _ in unit2_info]

    df_list.append(unitframe)
    text_list.append(text)

In [5]:
# Show the relation frame built for the first document as a sanity check.
df_list[0]

Unnamed: 0,unit1,unit2,rtype,unit1_role,unit2_role,unit1_id,unit2_id
0,4,2,supports,Premise,Premise,"(45, 61)","(28, 37)"
1,2,0,rebuts,Premise,Claim,"(28, 37)","(0, 19)"
2,1,0,supports,Premise,Claim,"(20, 27)","(0, 19)"
3,3,2,supports,Premise,Premise,"(38, 44)","(28, 37)"


In [6]:
#match the text to the unit ids
# Every frame in df_list gets two new columns, "unit1_text" and "unit2_text",
# holding the space-joined tokens of the respective unit. The frames are
# extended in place, so text_frame_list refers to the same objects as df_list.
text_frame_list=[]
for frame,text in zip(df_list,text_list):

    unit1_text_list=[]
    unit2_text_list=[]

    # The id columns are read by name rather than by column position.
    for pos_1,pos_2 in zip(frame["unit1_id"],frame["unit2_id"]):

        # Each id is (first token position, last token position) of the unit.
        # The last position belongs to the unit, so the slice has to end one
        # past it; a plain text[first:last] would drop the unit's final token.
        sent_1 = ' '.join(text[pos_1[0]:pos_1[1]+1])
        sent_2 = ' '.join(text[pos_2[0]:pos_2[1]+1])

        unit1_text_list.append(sent_1)
        unit2_text_list.append(sent_2)

    frame["unit1_text"]=unit1_text_list

    frame["unit2_text"]=unit2_text_list

    text_frame_list.append(frame)


In [7]:
def get_frame_big_essay(text_frame_list,even_distribution=True):
    """
    The function creates a dataframe of labeled text pairs from per-document relation frames.

    For every frame, each (unit1_text, unit2_text) row becomes a positive pair (Label 0).
    The negative pairs (Label 1) are all 2-combinations of the document's distinct unit
    texts that do not occur as a positive pair.

    :text_frame_list: a list of dataframes, each with the columns "unit1_text" and "unit2_text"
    :even_distribution: if True every class is randomly reduced to the size of the smallest
                        class and the result is shuffled
    :return: a dataframe with the columns "Text_prem", "Text_sup", "Label" (0 = related pair,
             1 = unrelated pair) and "text" (both texts joined by a single space); the index
             of the per-document frames is kept, so index labels repeat
    :raises ValueError: raised by pd.concat when no frame contributes any pair
    """
    columns = ["Text_prem", "Text_sup", "Label"]
    labeled_frames = []

    for relation_frame in text_frame_list:

        #get the text from the units
        unit_1_text = relation_frame['unit1_text'].tolist()
        unit_2_text = relation_frame['unit2_text'].tolist()

        #holds the real relation pairs
        # NOTE(review): every row is used as a positive pair, whatever its "rtype" is;
        # confirm that non-support relations are meant to be included.
        true_list = list(zip(unit_1_text, unit_2_text))
        true_set = set(true_list)

        #combine them all to get the distinct texts of all premises and claims
        # Sorting makes the candidate pairs independent of the set iteration order,
        # so the same input always yields the same set of negative pairs.
        complete_text = sorted(set(unit_1_text + unit_2_text))

        #all possible combinations of text pairs that are not a real pair
        # NOTE(review): only the exact (unit1, unit2) orientation is excluded, the
        # reversed pair of a real relation can still show up as a negative example.
        false_list = [
            pair for pair in itertools.combinations(complete_text, 2)
            if pair not in true_set
        ]

        #add labels to the text pairs
        # The labels are written as integers right away (true -> 0, false -> 1), so no
        # in-place replace on a column selection is needed afterwards.
        labeled_rows = [pair + (0,) for pair in true_list]
        labeled_rows += [pair + (1,) for pair in false_list]

        # a document without any relation contributes nothing
        if not labeled_rows:
            continue

        #make a dataframe with the text pairs and their labels
        random.shuffle(labeled_rows)
        labeled = pd.DataFrame(labeled_rows, columns=columns)
        labeled["text"] = labeled["Text_prem"] + " " + labeled["Text_sup"]
        labeled_frames.append(labeled)

    frame = pd.concat(labeled_frames)

    if even_distribution:
        #count examples per class
        class_sizes = frame["Label"].value_counts()
        smallest = int(class_sizes.min())

        #take the same number of random examples from each class that is present and shuffle
        balanced_parts = [
            frame[frame["Label"] == label].sample(n=smallest)
            for label in sorted(class_sizes.index)
        ]
        frame = pd.concat(balanced_parts).sample(frac=1)

    return frame


In [20]:
# Build the labeled pair frame with every class capped to the size of the
# smallest one, then display the number of examples per label.
frame=get_frame_big_essay(text_frame_list,even_distribution=True)
frame["Label"].value_counts()

1    464
0    464
Name: Label, dtype: int64

In [9]:
#optional: pickle the dataframe
# The with-block closes the file as soon as the dump has been written.
with open("match_bert_frame_et_micro.p", "wb") as pickle_file:
    pickle.dump(frame, pickle_file)