Matches premises to claims on the small essay corpus with a Word2Vec model

In [86]:
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import pandas as pd
import statistics
from sklearn.metrics import f1_score
import random

In [87]:
#load dataframes
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read (the bare `open(...)` calls never closed it).
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# one entry per essay in each list (the lists are indexed in parallel by later cells)
ent_frame_list = _load_pickle("pickles/ent_frame_list.p")  # entities: text, Targets, Type
att_frame_list = _load_pickle("pickles/att_frame_list.p")  # attributes, joined on Targets later
rel_frame_list = _load_pickle("pickles/rel_frame_list.p")  # relations: Targets -> Supported

In [88]:
#clean up text
# Strip the export residue from the first column (the entity text) of every
# essay frame: the literal "text=" prefix, closing parentheses and single quotes.
# The frames are modified in place, as before; the substitutions are done
# column-wise instead of one scalar .iloc write per row.
TEXT_RESIDUE = ("text=", ")", "'")

for frame in ent_frame_list:
    if frame.empty:
        # nothing to clean; also avoids the .str accessor on a typeless empty column
        continue
    cleaned = frame.iloc[:, 0]
    for residue in TEXT_RESIDUE:
        # regex=False: ")" must be treated as a literal character, not a pattern
        cleaned = cleaned.str.replace(residue, "", regex=False)
    # assign positionally so the write does not depend on index alignment
    frame.iloc[:, 0] = cleaned.to_numpy()
    

In [89]:
# preview the entity frame of the first essay (text column after the clean-up above)
ent_frame_list[0]

Unnamed: 0,text,Targets,Type
0,we should attach more importance to cooperati...,T1,MajorClaim
1,"""a more cooperative attitudes towards life is...",T2,MajorClaim
2,"through cooperation, children can learn about...",T3,Claim
3,What we acquired from team work is not only h...,T4,Claim
4,"During the process of cooperation, children c...",T5,Premise
5,All of these skills help them to get on well ...,T6,Premise
6,competition makes the society more effective,T7,Claim
7,the significance of competition is that how t...,T8,Premise
8,when we consider about the question that how ...,T9,Claim
9,Take Olympic games which is a form of competi...,T10,Premise


In [90]:
# NOTE(review): these imports sit outside the notebook's main import cell;
# consider moving them there so all dependencies are visible in one place.
from nltk.corpus import stopwords
# NLTK's English stop-word list as a set.
# NOTE(review): `stop_words` is not referenced by any other cell visible here
# (the model cell uses gensim's remove_stopwords instead) - confirm before removing.
stop_words = set(stopwords.words('english'))
from gensim.parsing.preprocessing import remove_stopwords

In [91]:
#make a word2vec model for each text
# Training is stochastic. A fixed seed together with a single worker thread makes
# the embeddings repeatable within one interpreter session (per the gensim docs,
# repeatability across interpreter launches additionally needs PYTHONHASHSEED).
W2V_SEED = 42

model_list=[]
for essay_frame in ent_frame_list:

    # corpus for this essay: one token list per sentence, stop words removed
    # before tokenizing
    sentences = []
    for component_text in essay_frame['text'].tolist():
        for sentence in sent_tokenize(component_text):
            sentences.append(word_tokenize(remove_stopwords(sentence)))

    # skip-gram (sg=1), 50-dimensional vectors, every token kept (min_count=1)
    model = gensim.models.Word2Vec(sentences, min_count = 1,
    vector_size = 50, window = 20, sg=1, compute_loss=True,
    seed=W2V_SEED, workers=1)

    # NOTE(review): the constructor is already given the corpus and, per the
    # gensim docs, trains on it; this call adds 30 further epochs on top.
    # Kept as in the original - confirm that the two-stage training is intended.
    model.train(sentences, total_examples=model.corpus_count, epochs = 30, compute_loss=True)

    model_list.append(model)



In [92]:
# Inspect the embedding model trained on the first essay.
model=model_list[0]

# number of entries in the model's keyed vectors.
# NOTE: this value is not shown - only the last expression of a cell is rendered.
len(model.wv)
# the ten entries most similar to "cooperation", with their similarity scores
model.wv.most_similar("cooperation",topn=10)

[('members', 0.8606662154197693),
 ('occurred', 0.8433937430381775),
 ('Olympic', 0.8353438973426819),
 (',', 0.8351609110832214),
 ('gain', 0.8208107352256775),
 ('games', 0.8199472427368164),
 ('diet', 0.8074123859405518),
 ('care', 0.8023310899734497),
 ('compromise', 0.7992647886276245),
 ('win', 0.7862445116043091)]

Note about the support relationship types: The data allows for the following support relationships: premise-claim, claim-claim, premise-premise and claim-premise. In cases where the relation is not premise-claim, another claim will be supported further up in this chain. To handle this in our data, we ignored the indirect support relationships and focused on the direct support relations present in the data. Therefore, we ignored the types of the targets in the relation and labeled every relation as a premise-claim support relation. 

In [93]:
#make a list of dataframes containing every relation in the frame and the text belonging to the targets
# One frame per essay. Each row is one support relation with the text and ID of
# the supporting component (Text_Targets / Targets) and of the supported
# component (Text_Supported / Supported).
frame_list=[]

for idx, ent_frame in enumerate(ent_frame_list):

    # attribute rows that describe claims
    claim_attrs = att_frame_list[idx]
    claim_attrs = claim_attrs.loc[claim_attrs["Type"]=="ClaimType"]

    # attach the entity info of the supporting side, then of the supported side
    relations = pd.merge(rel_frame_list[idx], ent_frame, on="Targets")
    relations = pd.merge(relations, ent_frame, left_on="Supported", right_on="Targets")
    # left join: relations whose supported component has no claim attribute are kept
    relations = pd.merge(relations, claim_attrs, left_on="Supported", right_on="Targets", how="left")

    relations = relations.drop(["ID_x","Targets_y","Targets","Type","ID_y"], axis=1)
    relations = relations.rename(columns={
        "text_x": "Text_Targets",
        "text_y": "Text_Supported",
        "Targets_x": "Targets",
        "Type_x": "Type_Target",
        "Type_y": "Type_Supported",
    })

    #every support relationship is labeled as premise and claim (see cell above)
    # Assigned unconditionally: the former chain of three conditional updates
    # yields exactly this whenever the supporter type is "Claim" or "Premise",
    # which are the only types shown in the relation-type distribution further
    # down, and one of them wrote Type_Target where Type_Supported was
    # presumably meant.
    relations["Type_Target"] = "Premise"
    relations["Type_Supported"] = "Claim"

    frame_list.append(relations)

In [94]:
#show distribution of support relationship types
# Per essay: join each relation with the entity frame twice, once for the
# supporting component (suffix _x) and once for the supported one (suffix _y).
hold_list = [
    pd.merge(
        pd.merge(rel_frame_list[idx], ent_frame, on="Targets"),
        ent_frame,
        left_on="Supported",
        right_on="Targets",
    )
    for idx, ent_frame in enumerate(ent_frame_list)
]

# stack all essays and count the relations per (supporter type, supported type)
big_frame = pd.concat(hold_list)
big_frame.groupby(["Type_x","Type_y"]).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Stance,Targets_x,Supported,text_x,text_y,Targets_y
Type_x,Type_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Claim,Claim,183,183,183,183,183,183,183
Claim,Premise,16,16,16,16,16,16,16
Premise,Claim,602,602,602,602,602,602,602
Premise,Premise,105,105,105,105,105,105,105


In [95]:
#calculate the cosine similarity between each premise and claim example
#choose the pair with the highest value as prediction of which premise fits to which claim

# Score for a premise-claim pair without a single in-vocabulary word pair.
# -1.0 is the lowest possible cosine similarity, so such a pair is never
# preferred over a pair that could actually be scored. (Taking the mean of an
# empty list would raise statistics.StatisticsError.)
NO_OVERLAP_SCORE = -1.0


def _known_tokens(text, wv):
    """Tokenize `text` and keep only the tokens contained in the vectors `wv`.

    Dropping unknown tokens up front replaces catching the KeyError that
    `wv.similarity` raises for them.
    """
    return [token for token in word_tokenize(text) if token in wv]


def _mean_pairwise_similarity(premise_tokens, claim_tokens, wv):
    """Return the mean cosine similarity over all premise-word/claim-word pairs.

    Returns NO_OVERLAP_SCORE when either token list is empty.
    """
    pair_scores = [
        wv.similarity(word_prem, word_clm)
        for word_prem in premise_tokens
        for word_clm in claim_tokens
    ]
    if not pair_scores:
        return NO_OVERLAP_SCORE
    return statistics.mean(pair_scores)


cos_sim_all=[]   # per essay: flat score list [p1c1, p1c2, ..., p2c1, p2c2, ...]
acc_list=[]      # per essay with at least one relation: share of recovered pairs

for ex in range(len(model_list)):
    wv = model_list[ex].wv
    relations = frame_list[ex]

    #split the frame in information about the supporter(premise) and the supported(claim)
    prem_frame = relations[['Text_Targets', 'Targets', 'Type_Target']].copy()
    clm_frame = relations[['Text_Supported', 'Supported', 'Type_Supported']].copy()

    # tokenize every text once per essay instead of re-tokenizing each claim
    # for every premise
    premise_tokens = [_known_tokens(text, wv) for text in prem_frame['Text_Targets']]
    claim_tokens = [_known_tokens(text, wv) for text in clm_frame['Text_Supported']]

    # premise-major order: all claims for premise 1, then all claims for premise 2, ...
    cos_sim = [
        _mean_pairwise_similarity(premise, claim, wv)
        for premise in premise_tokens
        for claim in claim_tokens
    ]
    cos_sim_all.append(cos_sim)

    #some texts have no premises theses are ignored
    if len(prem_frame) == 0:
        continue

    # One chunk per premise, holding its scores against every candidate claim.
    # The stride is the number of claims (the inner dimension above); it equals
    # the number of premises here because both frames come from the same rows.
    n_claims = len(clm_frame)
    chunks = [cos_sim[start:start + n_claims] for start in range(0, len(cos_sim), n_claims)]

    #claims appear multiple times in the list this takes only the first highest value
    #assumption: two different claims are never equally likely
    id_list = [max(range(len(chunk)), key=chunk.__getitem__) for chunk in chunks]

    #get the claim id
    claim_list = [clm_frame.iloc[best, 1] for best in id_list]

    #make a list of tuple with the premise and the predicted claim
    prem_frame["claim"] = claim_list
    pred_list = list(zip(prem_frame.Targets, prem_frame.claim))

    #get a tuple list with the real premise claim pairs
    real_list = list(zip(relations.Targets, relations.Supported))

    # correctly predicted (premise, claim) pairs, relative to the number of relations
    rigth_set = set(pred_list) & set(real_list)

    acc = len(rigth_set) / len(real_list)
    acc_list.append(acc)
        


In [96]:
# mean per-essay accuracy; essays without any premise never add an entry to acc_list
print(statistics.mean(acc_list))

0.26609069035539623
