In [31]:
from collections import defaultdict 
#Class to represent a graph 
class Graph:
    """Helper that runs the Floyd-Warshall recurrence on a V x V weight matrix.

    ``V`` (set in ``__init__``) is the number of vertices; it bounds every
    loop in ``transitiveClosure``, so matrices passed there must be indexable
    on ``0 .. V-1`` in both dimensions.
    """

    def __init__(self, vertices):
        """Store the number of vertices the graph has."""
        self.V = vertices

    # Floyd Warshall algorithm
    def transitiveClosure(self, graph):
        """Return the all-pairs minimum path weights for ``graph``.

        Despite its name, this method does not produce a 0/1 reachability
        matrix: it applies the min-plus update
        ``d[i][j] = min(d[i][j], d[i][k] + d[k][j])``, so each output cell
        holds the smallest total weight of any path from ``i`` to ``j``.
        Callers encode "no edge" as a large sentinel weight and compare the
        results against thresholds.

        Parameters:
            graph: square matrix of weights, given as a sequence of rows
                (for example a list of lists or a 2-D NumPy array).

        Returns:
            A new list of row lists holding the minimum weights. ``graph``
            itself is never modified.
        """
        # list(row) always makes an independent copy. Slicing (row[:]) only
        # copies Python lists; for NumPy rows it returns a view, which made
        # the updates below write straight through into the caller's matrix.
        reach = [list(row) for row in graph]

        # After the iteration for a given k, reach[i][j] is the best weight
        # over paths whose intermediate vertices are all drawn from {0..k}.
        for k in range(self.V):
            via_k = reach[k]
            # Pick every vertex as source ...
            for i in range(self.V):
                row = reach[i]
                # ... and every vertex as destination for that source.
                for j in range(self.V):
                    # Keep the direct value unless the detour through k is cheaper.
                    row[j] = min(row[j], row[k] + via_k[j])
        return reach

In [53]:
from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET
import tqdm
import numpy as np

# Running tallies of the labels handed out by relation_getter across all files:
# PC -> "SuperSub", CP -> "SubSuper", CO -> "Coref", NO -> "NoRel".
num_PC = num_CP = num_CO = num_NO = 0

def relation_getter(i, j, graph, num_CO, num_PC, num_CP, num_NO):
    """Label the ordered pair (i, j) from a path-weight matrix and bump one counter.

    The first matching rule wins:
      * either direction below 100       -> "Coref"    (num_CO + 1)
      * graph[i][j] below 10000          -> "SuperSub" (num_PC + 1)
      * graph[j][i] below 10000          -> "SubSuper" (num_CP + 1)
      * otherwise                        -> "NoRel"    (num_NO + 1)

    Parameters:
        i, j: row/column indices into ``graph``.
        graph: matrix indexable as ``graph[a][b]``.
        num_CO, num_PC, num_CP, num_NO: current counter values.

    Returns:
        ``(label, num_CO, num_PC, num_CP, num_NO)`` where exactly one of the
        counters is one larger than the value passed in.
    """
    if graph[i][j] < 100 or graph[j][i] < 100:
        return "Coref", num_CO + 1, num_PC, num_CP, num_NO

    if graph[i][j] < 10000:
        return "SuperSub", num_CO, num_PC + 1, num_CP, num_NO

    if graph[j][i] < 10000:
        return "SubSuper", num_CO, num_PC, num_CP + 1, num_NO

    return "NoRel", num_CO, num_PC, num_CP, num_NO + 1

# Convert every annotation XML file under `mypath` into a .tsvx file holding the
# document text, one line per event mention, and one labelled line per event pair.
mypath = "LDC2016E47_IC_Domain_Event_Annotation_From_CMU_V1.0/data/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f)) and f[-3:] == "xml"]
print(len(onlyfiles))

#fname = "AFP_ENG_19950125.0289.src.xml.txt.blk.tok.stp.tbf.xml"

# Word attributes whose value names the related event id(s); several ids are
# joined with '+'. Loop-invariant, so it is built once rather than per file.
relations = ['coreference', 'subevent_of', 'in_reporting', 'member_of']

for fname in tqdm.tqdm(onlyfiles):
    tree = ET.parse(join(mypath, fname))
    root = tree.getroot()
    Text = ""
    event_dict = {}     # running number -> event record written to the output
    eventid2num = {}    # eventid attribute from the XML -> running number
    relation_dict = {}  # (eventid of the word, referenced eventid) -> relation name
    num = 1             # next running number; index 0 of the graph stays unused
    last_eventid = ''
    # NOTE: the ./IC_Processed/ directory must already exist.
    with open('./IC_Processed/' + fname + '.tsvx', 'w', encoding='utf8') as fp:
        # Children of the root are iterated as sentences, their children as words.
        for sentence in root:
            for word in sentence:
                token = word.get('wd')
                if not token:
                    continue
                start = len(Text)  # character offset of this token in Text
                Text += token + " "
                word_eventid = word.get('eventid')
                # Consecutive words sharing an eventid are recorded once, using
                # the first word as the mention.
                if word_eventid and last_eventid != word_eventid:
                    event_type = word.get('event_type') or 'event'
                    eventid2num[word_eventid] = num
                    event_dict[num] = {'eventid': str(num), 'mention': token, 'event_type': event_type, 'char_ID': start}
                    num += 1
                    last_eventid = word_eventid
                for relation in relations:
                    targets = word.get(relation)
                    if targets:
                        # Split unconditionally. The previous test used
                        # str.find('+') as a boolean, but find() returns -1
                        # (truthy) when '+' is absent and 0 (falsy) when it is
                        # the first character, so a value with a leading '+'
                        # was stored unsplit. split() on a value without '+'
                        # yields the single id unchanged.
                        for target_id in targets.split('+'):
                            relation_dict[(word_eventid, target_id)] = relation
            Text += ". "

        fp.write('Text\t' + Text + '\n')
        for event in event_dict.values():
            fp.write('\t'.join(['Event', event['eventid'], event['mention'],
                                event['event_type'], str(event['char_ID'])]) + '\n')

        # Re-key the relations from XML event ids to running numbers.
        relation_dict_fixed = {}
        for (source_id, target_id), rel in relation_dict.items():
            try:
                relation_dict_fixed[(eventid2num[source_id], eventid2num[target_id])] = rel
            except KeyError:
                # One endpoint was never registered as an event in this file:
                # report it and drop the relation. Only KeyError is caught so
                # that any other failure still surfaces.
                print(fname)
                print(source_id)
                print(target_id)

        # Weighted adjacency matrix: 10000 = no edge, 1 = coreference (both
        # directions), 100 = directed edge from the referenced event to the
        # event carrying a subevent_of/member_of attribute. These weights must
        # stay in step with the 100 / 10000 thresholds in relation_getter.
        # 'in_reporting' relations add no edge.
        g = Graph(num)
        graph = np.ones((num, num))
        graph = graph * 10000
        for (src, dst), rel in relation_dict_fixed.items():
            if rel == "coreference":
                graph[src][dst] = 1
                graph[dst][src] = 1
            elif rel in ['subevent_of', 'member_of']:
                graph[dst][src] = 100
        graph = g.transitiveClosure(graph)

        # One Relation line per unordered pair i < j. The running numbers are
        # exactly the keys of event_dict, so they index the graph directly.
        for i in range(1, num):
            for j in range(i + 1, num):
                relation, num_CO, num_PC, num_CP, num_NO = relation_getter(i, j, graph, num_CO, num_PC, num_CP, num_NO)
                fp.write('\t'.join(['Relation', event_dict[i]['eventid'], event_dict[j]['eventid'],
                                    relation, 'true',
                                    event_dict[i]["mention"], event_dict[j]["mention"]]) + '\n')

print("num_PC:", num_PC)
print("num_CP:", num_CP)
print("num_CO:", num_CO)
print("num_NO:", num_NO)

  1%|          | 1/100 [00:00<00:11,  8.54it/s]

100
APW_ENG_19961217.1333.src.xml.txt.blk.tok.stp.tbf.xml
19
27
APW_ENG_19961217.1333.src.xml.txt.blk.tok.stp.tbf.xml
24
22
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
12
45
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
13
45
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
15
45
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
17
45
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
20
19
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
22
19
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
26
25
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
27
25
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
29
28
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
30
28
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
34
33
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
35
33
APW_ENG_19970521.1259.src.xml.txt.blk.tok.stp.tbf.xml
37
33


  5%|▌         | 5/100 [00:00<00:08, 11.45it/s]

XIN_ENG_19960116.0110.src.xml.txt.blk.tok.stp.tbf.xml
16
15
XIN_ENG_19960116.0110.src.xml.txt.blk.tok.stp.tbf.xml
17
15
AFP_ENG_20030303.0023.src.xml.txt.blk.tok.stp.tbf.xml
4
2
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
7
55
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
13
57
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
19
57
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
20
57
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
30
29
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
38
57
AFP_ENG_20040813.0336.src.xml.txt.blk.tok.stp.tbf.xml
47
57
APW_ENG_19970713.0540.src.xml.txt.blk.tok.stp.tbf.xml
11
85
APW_ENG_19970713.0540.src.xml.txt.blk.tok.stp.tbf.xml
15
85
APW_ENG_19970713.0540.src.xml.txt.blk.tok.stp.tbf.xml
34
85
APW_ENG_19970713.0540.src.xml.txt.blk.tok.stp.tbf.xml
97
90


  9%|▉         | 9/100 [00:00<00:09,  9.39it/s]

XIN_ENG_20020806.0041.src.xml.txt.blk.tok.stp.tbf.xml
28
27
XIN_ENG_20020806.0041.src.xml.txt.blk.tok.stp.tbf.xml
29
27
XIN_ENG_20020806.0041.src.xml.txt.blk.tok.stp.tbf.xml
30
27


 12%|█▏        | 12/100 [00:01<00:11,  7.88it/s]

LTW_ENG_20050124.0091.src.xml.txt.blk.tok.stp.tbf.xml
5
59
LTW_ENG_20050124.0091.src.xml.txt.blk.tok.stp.tbf.xml
6
59
LTW_ENG_20050124.0091.src.xml.txt.blk.tok.stp.tbf.xml
14
59


 13%|█▎        | 13/100 [00:01<00:11,  7.68it/s]

APW_ENG_20020605.0452.src.xml.txt.blk.tok.stp.tbf.xml
96
94


 16%|█▌        | 16/100 [00:02<00:14,  5.62it/s]

XIN_ENG_20061121.0149.src.xml.txt.blk.tok.stp.tbf.xml
20
19
XIN_ENG_20061121.0149.src.xml.txt.blk.tok.stp.tbf.xml
21
19
XIN_ENG_20061121.0149.src.xml.txt.blk.tok.stp.tbf.xml
22
19
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
5
6
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
7
6
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
8
6
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
24
26
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
25
26
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
27
26
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
29
30
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
31
30
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
35
30
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
36
30
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
39
40
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
41
40
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.tbf.xml
56
55
APW_ENG_20051127.0031.src.xml.txt.blk.tok.stp.

 20%|██        | 20/100 [00:02<00:12,  6.66it/s]

UNK_ENG_20060712.0002.src.xml.txt.blk.tok.stp.tbf.xml
19
18
UNK_ENG_20060712.0002.src.xml.txt.blk.tok.stp.tbf.xml
56
55
UNK_ENG_20060712.0002.src.xml.txt.blk.tok.stp.tbf.xml
63
62
UNK_ENG_20060712.0002.src.xml.txt.blk.tok.stp.tbf.xml
93
92
UNK_ENG_20060712.0002.src.xml.txt.blk.tok.stp.tbf.xml
94
92


 27%|██▋       | 27/100 [00:03<00:09,  7.70it/s]

APW_ENG_19981101.0355.src.xml.txt.blk.tok.stp.tbf.xml
8
7


 31%|███       | 31/100 [00:04<00:08,  7.85it/s]

AFP_ENG_20060213.0671.src.xml.txt.blk.tok.stp.tbf.xml
35
34


 35%|███▌      | 35/100 [00:04<00:07,  9.03it/s]

AFP_ENG_19970509.0516.src.xml.txt.blk.tok.stp.tbf.xml
14
12


 41%|████      | 41/100 [00:05<00:04, 12.05it/s]

AFP_ENG_20050426.0211.src.xml.txt.blk.tok.stp.tbf.xml
14
13
AFP_ENG_20050426.0211.src.xml.txt.blk.tok.stp.tbf.xml
15
13
AFP_ENG_20041224.0117.src.xml.txt.blk.tok.stp.tbf.xml
3
8
AFP_ENG_20041224.0117.src.xml.txt.blk.tok.stp.tbf.xml
7
6


 43%|████▎     | 43/100 [00:05<00:04, 12.43it/s]

APW_ENG_20040204.0281.src.xml.txt.blk.tok.stp.tbf.xml
22
20
LTW_ENG_20040319.0128.src.xml.txt.blk.tok.stp.tbf.xml
47
46
LTW_ENG_20040319.0128.src.xml.txt.blk.tok.stp.tbf.xml
60
59
LTW_ENG_20040319.0128.src.xml.txt.blk.tok.stp.tbf.xml
61
59
LTW_ENG_20040319.0128.src.xml.txt.blk.tok.stp.tbf.xml
88
85
LTW_ENG_20040319.0128.src.xml.txt.blk.tok.stp.tbf.xml
86
85


 45%|████▌     | 45/100 [00:05<00:06,  7.86it/s]

AFP_ENG_20041020.0093.src.xml.txt.blk.tok.stp.tbf.xml
38
37
AFP_ENG_20041020.0093.src.xml.txt.blk.tok.stp.tbf.xml
16
37
AFP_ENG_20040823.0382.src.xml.txt.blk.tok.stp.tbf.xml
20
19
AFP_ENG_20040823.0382.src.xml.txt.blk.tok.stp.tbf.xml
23
19
AFP_ENG_20040823.0382.src.xml.txt.blk.tok.stp.tbf.xml
51
19


 49%|████▉     | 49/100 [00:06<00:05,  8.79it/s]

AFP_ENG_20040315.0734.src.xml.txt.blk.tok.stp.tbf.xml
3
1
AFP_ENG_20040810.0489.src.xml.txt.blk.tok.stp.tbf.xml
19
17


 53%|█████▎    | 53/100 [00:06<00:04, 10.01it/s]

AFP_ENG_20050328.0290.src.xml.txt.blk.tok.stp.tbf.xml
38
37
AFP_ENG_20050104.0385.src.xml.txt.blk.tok.stp.tbf.xml
3
30
AFP_ENG_20050104.0385.src.xml.txt.blk.tok.stp.tbf.xml
4
30
AFP_ENG_20050104.0385.src.xml.txt.blk.tok.stp.tbf.xml
5
30


 63%|██████▎   | 63/100 [00:07<00:03,  9.94it/s]

AFP_ENG_19950125.0289.src.xml.txt.blk.tok.stp.tbf.xml
31
35
AFP_ENG_19950125.0289.src.xml.txt.blk.tok.stp.tbf.xml
33
32


 65%|██████▌   | 65/100 [00:07<00:03, 10.39it/s]

AFP_ENG_20020303.0113.src.xml.txt.blk.tok.stp.tbf.xml
3
5
AFP_ENG_20020303.0113.src.xml.txt.blk.tok.stp.tbf.xml
21
22
NYT_ENG_20050312.0073.src.xml.txt.blk.tok.stp.tbf.xml
39
38
NYT_ENG_20050312.0073.src.xml.txt.blk.tok.stp.tbf.xml
40
38
NYT_ENG_20050312.0073.src.xml.txt.blk.tok.stp.tbf.xml
44
38
NYT_ENG_20050312.0073.src.xml.txt.blk.tok.stp.tbf.xml
70
19


 68%|██████▊   | 68/100 [00:08<00:05,  5.92it/s]

APW_ENG_19990716.0669.src.xml.txt.blk.tok.stp.tbf.xml
41
51
AFP_ENG_20040810.0551.src.xml.txt.blk.tok.stp.tbf.xml
25
24


 70%|███████   | 70/100 [00:08<00:04,  6.37it/s]

AFP_ENG_20041222.0453.src.xml.txt.blk.tok.stp.tbf.xml
25
23
AFP_ENG_20041222.0453.src.xml.txt.blk.tok.stp.tbf.xml
26
23


 76%|███████▌  | 76/100 [00:09<00:02,  9.35it/s]

NYT_ENG_19941213.0208.src.xml.txt.blk.tok.stp.tbf.xml
14
13
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
2
5
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
3
5
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
4
5
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
11
66
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
12
66
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
13
65
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
44
42
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
57
56
AFP_ENG_19950321.0171.src.xml.txt.blk.tok.stp.tbf.xml
59
56


 78%|███████▊  | 78/100 [00:09<00:02, 10.68it/s]

APW_ENG_20030303.0093.src.xml.txt.blk.tok.stp.tbf.xml
36
59
AFP_ENG_20050805.0248.src.xml.txt.blk.tok.stp.tbf.xml
29
55
AFP_ENG_20050805.0248.src.xml.txt.blk.tok.stp.tbf.xml
43
44


 83%|████████▎ | 83/100 [00:10<00:02,  8.31it/s]

XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
28
27
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
29
27
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
30
27
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
31
27
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
32
34
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
33
34
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
37
36
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
39
36
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
41
40
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
42
40
XIN_ENG_20060712.0237.src.xml.txt.blk.tok.stp.tbf.xml
43
40
AFP_ENG_20050117.0001.src.xml.txt.blk.tok.stp.tbf.xml
3
1
AFP_ENG_20050117.0001.src.xml.txt.blk.tok.stp.tbf.xml
8
1


 85%|████████▌ | 85/100 [00:10<00:01,  9.37it/s]

APW_ENG_20040406.0337.src.xml.txt.blk.tok.stp.tbf.xml
14
89
APW_ENG_20040406.0337.src.xml.txt.blk.tok.stp.tbf.xml
16
89
APW_ENG_20040406.0337.src.xml.txt.blk.tok.stp.tbf.xml
17
89


 87%|████████▋ | 87/100 [00:10<00:01,  7.90it/s]

AFP_ENG_20060209.0571.src.xml.txt.blk.tok.stp.tbf.xml
34
55
AFP_ENG_20060209.0571.src.xml.txt.blk.tok.stp.tbf.xml
42
44
NYT_ENG_20050125.0022.src.xml.txt.blk.tok.stp.tbf.xml
26
28


 93%|█████████▎| 93/100 [00:11<00:01,  6.58it/s]

XIN_ENG_20030919.0267.src.xml.txt.blk.tok.stp.tbf.xml
1
12
XIN_ENG_20030919.0267.src.xml.txt.blk.tok.stp.tbf.xml
13
12
XIN_ENG_20030919.0267.src.xml.txt.blk.tok.stp.tbf.xml
48
47
XIN_ENG_20030919.0267.src.xml.txt.blk.tok.stp.tbf.xml
50
47
XIN_ENG_20030919.0267.src.xml.txt.blk.tok.stp.tbf.xml
51
47


 98%|█████████▊| 98/100 [00:11<00:00,  9.21it/s]

NYT_ENG_20011116.0315.src.xml.txt.blk.tok.stp.tbf.xml
24
21
AFP_ENG_19950407.0079.src.xml.txt.blk.tok.stp.tbf.xml
31
30
AFP_ENG_20020724.0397.src.xml.txt.blk.tok.stp.tbf.xml
31
30


100%|██████████| 100/100 [00:11<00:00,  8.35it/s]

num_PC: 2248
num_CP: 2338
num_CO: 2353
num_NO: 81887





In [51]:
%debug

> [0;32m<ipython-input-50-8fdc7416698e>[0m(97)[0;36m<module>[0;34m()[0m
[0;32m     95 [0;31m            [0;32mfor[0m [0mj[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mi[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0mnum[0m[0;34m+[0m[0;36m1[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m                [0;32mif[0m [0mi[0m [0;34m<[0m [0mj[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 97 [0;31m                    [0mrelation[0m[0;34m,[0m [0mnum_CO[0m[0;34m,[0m [0mnum_PC[0m[0;34m,[0m [0mnum_CP[0m[0;34m,[0m [0mnum_NO[0m [0;34m=[0m [0mrelation_getter[0m[0;34m([0m[0mint[0m[0;34m([0m[0mevent_dict[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m[[0m[0;34m'eventid'[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mint[0m[0;34m([0m[0mevent_dict[0m[0;34m[[0m[0mj[0m[0;34m][0m[0;34m[[0m[0;34m'eventid'[0m[0;34m][0m[0;34m)[0m[0;34m,[0m [0mgraph[0m[0;34m,[0m [0mnum_CO[0m[0;34m,[0m [0mnum_