Builds a DataFrame of premise-claim text pairs from the small essay corpus, used to train the BERT model.

In [3]:
import pickle
import itertools
import pandas as pd
import random

In [4]:

# Load the per-essay entity and relation frames produced by an earlier step.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles
# that come from a trusted source.
# `with` closes each file handle instead of leaking it until garbage collection.
with open("pickles/ent_frame_list.p", "rb") as ent_file:
    ent_frame_list = pickle.load(ent_file)
with open("pickles/rel_frame_list.p", "rb") as rel_file:
    rel_frame_list = pickle.load(rel_file)

In [5]:
# Clean up the text stored in the first column of every entity frame by
# removing every occurrence of the literal "text=" marker, ")" and "'".
# NOTE(review): this also strips apostrophes and closing parentheses that
# belong to the essay text itself - confirm that this is acceptable.
for ent_frame in ent_frame_list:
    if ent_frame.empty:
        # nothing to clean; also avoids the .str accessor on a non-string empty column
        continue
    cleaned = ent_frame.iloc[:, 0]
    for junk in ("text=", ")", "'"):
        # regex=False: treat the pattern literally (")" alone is not a valid regex)
        cleaned = cleaned.str.replace(junk, "", regex=False)
    # write back positionally so a non-unique index cannot cause misalignment
    ent_frame.iloc[:, 0] = cleaned.to_numpy()
    

In [6]:
def _label_essay_pairs(ent_frame, rel_frame):
    """Return (source_text, supported_text, label) tuples for one essay; label 0 = true support pair, 1 = false pair."""
    # matching is only between premises and claims, so major claims are left out
    candidates = ent_frame.loc[ent_frame["Type"] != "MajorClaim", "text"].to_list()

    # Resolve the ids of each relation to their texts: after the two merges
    # text_x is the text of the "Targets" entity and text_y the text of the
    # "Supported" entity.
    merged = pd.merge(rel_frame, ent_frame, on="Targets")
    merged = pd.merge(merged, ent_frame, left_on="Supported", right_on="Targets")
    true_pairs = list(zip(merged["text_x"], merged["text_y"]))
    true_lookup = set(true_pairs)

    # Every unordered combination of candidate texts that is not a true pair
    # becomes a false example. dict.fromkeys removes duplicates while keeping
    # a deterministic order (a plain set would iterate in hash order).
    # NOTE(review): combinations follow text order while true pairs are
    # ordered (source, supported); a true relation that appears here in the
    # opposite order is therefore still emitted as a false example - confirm
    # that this is intended.
    false_pairs = [
        pair
        for pair in dict.fromkeys(itertools.combinations(candidates, 2))
        if pair not in true_lookup
    ]

    return [pair + (1,) for pair in false_pairs] + [pair + (0,) for pair in true_pairs]


def get_frame(ent_frame_list, rel_frame_list, even_distribution=True, seed=None):
    """
    Create a dataframe of text pairs labelled by whether they form an actual
    support relation in the essays.

    :ent_frame_list: list of dataframes, one per essay, with the columns
                     "text", "Type" and "Targets"
    :rel_frame_list: list of dataframes, one per essay, with the support
                     relations (columns "Targets" and "Supported")
    :even_distribution: if True, every class is down-sampled to the size of
                        the smallest class that is present
    :seed: optional integer seed that makes shuffling and sampling
           reproducible; None (default) keeps the unseeded behaviour
    :return: dataframe with the columns "Text_prem", "Text_sup", "Label" and
             "text" (the two texts concatenated). "Label" is numeric because
             training needs numeric values: 0 = true support relation,
             1 = no support relation. The frame is empty if no pairs exist.
    """
    columns = ["Text_prem", "Text_sup", "Label", "text"]
    # fall back to the module-level generator when no seed is given
    rng = random.Random(seed) if seed is not None else random

    per_essay = []
    for essay_idx, ent_frame in enumerate(ent_frame_list):
        labeled = _label_essay_pairs(ent_frame, rel_frame_list[essay_idx])
        if not labeled:
            # an essay without any pair would otherwise yield a frame without
            # columns and break the column access below
            continue
        rng.shuffle(labeled)
        essay_frame = pd.DataFrame(labeled, columns=columns[:3])
        essay_frame["text"] = essay_frame["Text_prem"] + essay_frame["Text_sup"]
        per_essay.append(essay_frame)

    if not per_essay:
        return pd.DataFrame(columns=columns)

    # per-essay indices are kept, so index values may repeat across essays
    frame = pd.concat(per_essay)

    if even_distribution:
        # count examples per class and cap every class at the smallest count
        counts = frame["Label"].value_counts()
        cap = int(counts.min())
        balanced = [
            frame.loc[frame["Label"] == label].sample(n=cap, random_state=seed)
            for label in sorted(counts.index)
        ]
        # final shuffle so the classes are interleaved
        frame = pd.concat(balanced).sample(frac=1, random_state=seed)

    return frame




In [13]:
# Build the class-balanced training frame, then show how many examples each label has.
frame = get_frame(
    ent_frame_list=ent_frame_list,
    rel_frame_list=rel_frame_list,
    even_distribution=True,
)
frame.loc[:, "Label"].value_counts()

1    906
0    906
Name: Label, dtype: int64

In [10]:
# Display the resulting frame as the cell output.
frame

Unnamed: 0,Text_prem,Text_sup,Label,text
54,animals are friendly and vital for people,"if there are no animals in the world, the bal...",1,animals are friendly and vital for people if ...
108,capital punishment curbs the inspiration and ...,When a criminal mind knows the quantum of pun...,1,capital punishment curbs the inspiration and ...
88,employer always prefer to hire an employee of...,"""Working helps children be more independent a...",1,employer always prefer to hire an employee of...
16,people who watch exhibitions on TV or interne...,authentic exhibits cannot be completely displ...,1,people who watch exhibitions on TV or interne...
42,the crimes done by criminals should be brough...,"""when the background of the convicted is dug ...",0,the crimes done by criminals should be brough...
...,...,...,...,...
13,whether they can earn money or not will depen...,"when children take jobs, they tend to be more...",0,whether they can earn money or not will depen...
34,the introduction of international laws can be...,alternative forms of transportation and inter...,0,the introduction of international laws can be...
11,"by having CCTV cameras at workplace, crimes s...",CCTVs must be put in all workplace so that cr...,0,"by having CCTV cameras at workplace, crimes s..."
21,"""following fashion trends constantly may unde...",the world would look monotonous,0,"""following fashion trends constantly may unde..."


In [14]:
# Optional: pickle the dataframe.
# `with` guarantees the file is flushed and closed once the dump finishes.
with open("match_bert_frame_et_small.p", "wb") as out_file:
    pickle.dump(frame, out_file)