Build a DataFrame containing only the claims and their claim types (Fact, Value, or Policy) to feed into a model for claim-type prediction.


In [9]:
import pickle
import pandas as pd

In [10]:
# Load the pickled per-text frame lists (attributes, entities, relations).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The context managers make sure every file handle is closed after reading.
with open("pickles/att_frame_list.p", "rb") as pickle_file:
    att_frame_list = pickle.load(pickle_file)
with open("pickles/ent_frame_list.p", "rb") as pickle_file:
    ent_frame_list = pickle.load(pickle_file)
with open("pickles/rel_frame_list.p", "rb") as pickle_file:
    rel_frame_list = pickle.load(pickle_file)

In [11]:
# Clean up the text held in the first column of every entity frame:
# strip the "text=" marker, closing parentheses and single quotes.
ent_frame_list[0]
for entity_frame in ent_frame_list:
    for row, raw_text in enumerate(entity_frame.iloc[:, 0].tolist()):
        # apply the three removals in the same order, then write the cell once
        entity_frame.iloc[row, 0] = (
            raw_text.replace("text=", "").replace(')', '').replace('\'', '')
        )
    

In [12]:
# display the attribute frame of the first text in the list
att_frame_list[0]

Unnamed: 0,ID,Type,Targets,Values
0,A1,Stance,T3,For
1,A2,Stance,T7,Against
2,A3,Stance,T11,For
3,A4,PremiseType,T10,invented_instance
4,A5,Eloquence,T11,1
...,...,...,...,...
85,A88,Logos,T4,no
86,A89,ClaimType,T4,Value
87,A90,Ethos,T4,no
88,A91,Pathos,T4,no


In [13]:
def make_claim_frame(att_frame_list, ent_frame_list, binary=False,
                     even_distribution=True, random_state=None):
    """
    Create a dataframe containing all claims of the corpus together with
    their claim type and the corresponding text.

    Each attribute frame is merged with its entity frame on "Targets"; only
    the rows whose attribute type is "ClaimType" are kept. The claim type in
    the "Values" column is encoded as Value -> 0, Fact -> 1, Policy -> 2.
    The input frames are not modified.

    :att_frame_list: list containing dataframes with the attributes for each
                     text in the corpus (needs "Type", "Targets", "Values")
    :ent_frame_list: list containing dataframes with the text for each text
                     in the corpus (needs "Type" and "Targets")
    :binary: if True, Policy is packed into the same class as Value (0), so
             the frame only distinguishes Fact (1) from not Fact (0)
    :even_distribution: if True, every class is randomly reduced to the size
                        of the smallest class and the rows are shuffled
    :random_state: optional seed passed to DataFrame.sample to make the
                   balancing and shuffling reproducible
    :return: dataframe containing claims, their type and the corresponding text
    :raises ValueError: raised by pd.concat when the input lists are empty
    """
    # for training numeric values are needed
    label_codes = {"Value": 0, "Fact": 1, "Policy": 0 if binary else 2}

    # make a combined frame with Targets, Text and ClaimType
    claim_frames = []
    for att_frame, ent_frame in zip(att_frame_list, ent_frame_list):
        # merge on Targets; both frames carry a "Type" column, so pandas
        # suffixes them and only the attribute one (Type_x) is kept
        merged = pd.merge(att_frame, ent_frame, on="Targets")
        merged = merged.drop(["Type_y"], axis=1)

        # training happens only over the claim type
        claims = merged.loc[merged["Type_x"] == "ClaimType"].copy()

        # Assign the encoded column back explicitly: an in-place replace on
        # a column selection is chained assignment and does not update the
        # frame under pandas Copy-on-Write. Unknown labels are left as is.
        claims["Values"] = claims["Values"].map(
            lambda label: label_codes.get(label, label)
        )
        claim_frames.append(claims)

    claim_type_frame = pd.concat(claim_frames, ignore_index=True)

    # even out the class distribution by reducing every class to the size
    # of the class with the fewest members
    if even_distribution and not claim_type_frame.empty:
        # count examples per class
        class_sizes = claim_type_frame["Values"].value_counts()
        smallest = int(class_sizes.min())
        # iterate over the labels that actually occur instead of assuming
        # they are exactly 0..k-1 (a class may be missing from the corpus)
        balanced = [
            claim_type_frame.loc[claim_type_frame["Values"] == label]
            .sample(n=smallest, random_state=random_state)
            for label in class_sizes.index
        ]
        # shuffle so the classes are not stored in consecutive runs
        claim_type_frame = pd.concat(balanced).sample(
            frac=1, random_state=random_state
        )

    return claim_type_frame

In [16]:
# build the claim frame with all three claim types and without balancing
claim_type_frame = make_claim_frame(
    att_frame_list,
    ent_frame_list,
    binary=False,
    even_distribution=False,
)



1    368
0    145
2     54
Name: Values, dtype: int64

In [17]:
#optional: pickle the dataframe
# the context manager closes the file, so the pickle is flushed to disk
with open("claim_type_frame_bf_ef.p", "wb") as pickle_file:
    pickle.dump(claim_type_frame, pickle_file)