In [1]:
import re
import os
import string
from tqdm import tqdm

## Extracting Wikidata entries per company

In [2]:
# Entity numbers
# NOTE: SPLP could not be found on Wikidata
name_to_num = {}
num_to_name = {}
entities = set()
with open('./links.csv') as file:
    next(file)
    for line in file:
        name, url = line.split(',')
        entity_num = int(re.sub('\D', '', url.split(',')[-1]))
        name_to_num[name] = entity_num
        num_to_name[entity_num] = name
        entities.add(entity_num)

In [13]:
# Extract entities (DON'T RERUN)
pattern = re.compile("wd:Q[0-9]+ a wikibase:Item ;")
counter = 0
extractions = 0
extracted = set()

with open('./wikidata-20150720-all-BETA.ttl') as file:
    line = next(file)
    while line:
        
        if counter % 10000000 == 0:
            print(counter, f'{extractions}/88 extracted', end='\r')
            
        counter += 1
        
        if pattern.match(line):
            entity_num = int(re.sub('\D', '', line))
            
            if entity_num not in entities:
                line = next(file)
                continue
            
            company = num_to_name[entity_num]
            
            with open(f'./wikidata_entries/{company}_wikidata.txt', 'w') as outfile:
                extractions += 1
                extracted.add(entity_num)
                print('---HEADER---')
                print('NUMBER', entity_num)
                print(line, '\n')
                
                outfile.write(line)

                line = next(file)
                while not pattern.match(line):
#                     print(line)
                    outfile.write(line)
                    line = next(file)
        else:
            line = next(file)

---HEADER---cted
NUMBER 1
wd:Q1 a wikibase:Item ;
 

---HEADER---
NUMBER 7414
wd:Q7414 a wikibase:Item ;
 

---HEADER---
NUMBER 35476
wd:Q35476 a wikibase:Item ;
 

---HEADER---
NUMBER 38076
wd:Q38076 a wikibase:Item ;
 

---HEADER---8 extracted
NUMBER 152057
wd:Q152057 a wikibase:Item ;
 

---HEADER---
NUMBER 154950
wd:Q154950 a wikibase:Item ;
 

---HEADER---
NUMBER 173395
wd:Q173395 a wikibase:Item ;
 

---HEADER---
NUMBER 190464
wd:Q190464 a wikibase:Item ;
 

---HEADER---
NUMBER 192314
wd:Q192314 a wikibase:Item ;
 

---HEADER---8 extracted
NUMBER 319642
wd:Q319642 a wikibase:Item ;
 

---HEADER---
NUMBER 464092
wd:Q464092 a wikibase:Item ;
 

---HEADER---
NUMBER 483551
wd:Q483551 a wikibase:Item ;
 

---HEADER---
NUMBER 489921
wd:Q489921 a wikibase:Item ;
 

---HEADER---
NUMBER 503182
wd:Q503182 a wikibase:Item ;
 

---HEADER---88 extracted
NUMBER 1359568
wd:Q1359568 a wikibase:Item ;
 

---HEADER---/88 extracted
NUMBER 14662364
wd:Q14662364 a wikibase:Item ;
 

---HEADER---/88 e

StopIteration: 

In [54]:
for e_id in entities - extracted:
    print('missing:', num_to_name[e_id])

missing: CODI
missing: AGFS


 ## Build graph

### First order relations

In [3]:
# Building first-order relationships
graph = {}
for name in name_to_num.keys():
    graph[name] = []
graph['SPLP'] = []

for filename in os.listdir('./wikidata_entries/'):
    with open('./wikidata_entries/' + filename) as file:
        if filename == '.DS_Store': continue
            
        orig_entity = filename.split('_')[0]
        print('\nSearching entity', orig_entity)
        
        for e_id in entities:
            file.seek(0, 0)
            if str(e_id) in file.read():
                print(f'{orig_entity} contains entity {num_to_name[e_id]}')
                graph[orig_entity].append(num_to_name[e_id])


Searching entity SLB
SLB contains entity GOOG
SLB contains entity AMZN
SLB contains entity SLB

Searching entity SNY
SNY contains entity BA
SNY contains entity GOOG
SNY contains entity AMZN
SNY contains entity AAPL
SNY contains entity SNY

Searching entity GD
GD contains entity BA
GD contains entity GOOG
GD contains entity GD
GD contains entity AMZN

Searching entity BA
BA contains entity BA
BA contains entity GOOG
BA contains entity INTC
BA contains entity AMZN

Searching entity BUD
BUD contains entity GOOG
BUD contains entity BUD
BUD contains entity INTC

Searching entity WFC
WFC contains entity BA
WFC contains entity GOOG
WFC contains entity WFC
WFC contains entity INTC
WFC contains entity AMZN

Searching entity PG
PG contains entity BA
PG contains entity GOOG
PG contains entity AMZN
PG contains entity FB
PG contains entity PG

Searching entity RDS-B
RDS-B contains entity BA
RDS-B contains entity GOOG
RDS-B contains entity AMZN
RDS-B contains entity RDS-B
RDS-B contains entity FB



In [5]:
# Make graph non-directional

for co1 in graph.keys():
    co2s = graph[co1]
    for co2 in co2s:
        if co1 not in graph[co2]:
            graph[co2].append(co1)

In [6]:
with open('./wikidata_adjacency_list.txt', 'w') as file:
    for key in graph.keys():
        file.write(key + ':' + str(graph[key]) + '\n')

### Second order relations

In [31]:
# Build graph mapping companies to all their related entities
entity_regex = re.compile(".+Q[0-9].+")

company_to_entities = {}
for name in name_to_num.keys():
    company_to_entities[name] = []
company_to_entities['SPLP'] = []

for filename in os.listdir('./wikidata_entries/'):
    with open('./wikidata_entries/' + filename) as file:
        if filename == '.DS_Store': continue
            
        orig_entity = filename.split('_')[0]
        print('\nSearching entity', orig_entity, name_to_num[orig_entity])
        
        for line in file:
            if entity_regex.match(line):
                try:
                    q_index = line.index('Q')
                    s_index = line[q_index:].index(' ') + q_index
                    related_entity = line[q_index + 1:s_index]
                    
                    if '-' not in related_entity:
                        print('>', related_entity)
                        if related_entity not in company_to_entities[orig_entity]:
                            company_to_entities[orig_entity].append(related_entity)
                except ValueError:
                    print('substring err')


Searching entity SLB 1425316
> 1425316
> 783794
> 8707371
> 106736
> 13677
> 1425316
> 1425316
> 783794
> 1425316
> 8707371
> 1425316
> 1425316
> 106736
> 1425316
> 13677
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425316
> 1425320
> 1425320

Searching entity SNY 158205
> 158205
> 783794
> 8703490
> 90
> 5489637
> 13677
> 158205
> 783794
> 158205
> 158205
> 8703490
> 158205
> 158205
> 90
> 158205
> 158205
> 5489637
> 158205
> 13677
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 158205
> 2>
> 158208
> 158208

Searching entity GD 502940
> 502940
> 783794
> 960949
> 8218166
> 408744
> 474200
> 13677
> 502940
> 783794
> 502940
> 960949
> 502940
> 82

> 54173
> 54173
> 54173
> 54173
> 8482065
> 54173
> 54173
> 218671
> 54173
> 225670
> 54173
> 54173
> 54173
> 754635
> 54173
> 54173
> 54173
> 54173
> 13677
> 54173
> 54173
> 7429887
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54173
> 54186
> 54186

Searching entity MO 445007
> 445007
substring err
substring err
> 1545718
> 13677
> 341639
> 30
> 891723
> 7875793
substring err
substring err
substring err
> 5087545
> 505275
> 445007
> 445007
> 1726300
> 445007
> 327751
> 445007
> 1545718
> 445007
> 13677
> 445007
> 445007
> 445007
> 341639
> 445007
> 445007
> 30
> 445007
> 891723
>

> 3614083
> 7850593
substring err
> 4934
> 4934
> 694178
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
substring err
> 164203
> 82059
substring err
substring err
> 12070726
substring err
substring err
> 133928
> 507
> 783794
> 4004268
substring err
> 3323451
> 17290934
> 364
> 15614763
> 95
> 3614083
> 95
> 95
> 95
> 95
> 7850593
> 95
> 92764
> 95
> 4934
> 95
> 4934
> 95
> 92747
> 95
> 694178
> 95
> 95
> 1318441
> 95
> 1053674
> 95
> 1816282
> 95
> 1114200
> 95
> 866
> 95
> 140258
> 95
> 942327
> 95
> 171186
> 95
> 15732990
> 95
> 2298325
> 95
> 4997299
> 95
> 2119882
> 95
> 15733006
> 95
> 7950747
> 95
> 164203
> 95
> 82059
> 95
> 95
> 75
> 95
> 880371
> 95
> 12070726
> 95
> 13462819
> 95
> 1377855
> 95
> 133928
> 95
> 507
> 95
> 783794
> 95
> 95
> 4004268
> 95
> 95
> 95
> 95
> 95
> 95
> 95
> 95
> 858637
> 95
> 3323451
> 95
> 95
> 17290934
> 17315765


> 219508
> 783794
> 13677
> 6423089
> 4636099
> 219508
> 219508
> 219508
> 783794
> 219508
> 13677
> 219508
> 219508
> 6423089
> 219508
> 219508
> 4636099
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 219508
> 1985727>
> 219509
> 219509

Searching entity DUK 1264404
> 1264404
> 13677
> 783794
> 8391707
> 16565
> 5120885
> 13677
> 1264404
> 783794
> 1264404
> 1264404
> 8391707
> 1264404
> 1264404
> 16565
> 1264404
> 5120885
> 1264404
> 1264404
> 1264404
> 1264404
> 1264404
> 1264404
> 1264404
> 1264404
> 1264404
> 1264405
> 1264405

Searching entity FB 380
> 380
substring err
substring err
substring err
substring err
> 4757939
> 82059
> 891723
> 74195
substri

> 159433
> 990856
> 159433
> 7056578
> 159433
> 159433
> 937629
> 159433
> 30
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159433
> 159442
> 159442

Searching entity KO 3295867
> 3295867
> 13677
substring err
> 1252971
> 744866
> 184253
> 23556
> 30
> 1094880
> 9139329
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 13677
> 3295867
> 783794
> 3295867
> 1252971
> 3295867
> 744866
> 3295867
> 184253
> 2329
> 3295867
> 3295867
> 23556
> 3295867
> 3295867
> 3295867
> 3295867
> 30
> 3295867
> 1094880
> 3295867
> 3295867
> 9139329
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 3295867
> 32958

In [36]:
# Build second-order relations
graph_2 = {}
for name in name_to_num.keys():
    graph_2[name] = set()
graph_2['SPLP'] = set()

def common_member(a, b): 
    a_set = set(a) 
    b_set = set(b) 
    if len(a_set.intersection(b_set)) > 0: 
        return(True)  
    return(False)  

for company in company_to_entities.keys():
    for other_company in company_to_entities.keys():
        if common_member(company_to_entities[company], company_to_entities[other_company]):
            graph_2[company].add(other_company)

In [40]:
# Make graph non-directional

for co1 in graph_2.keys():
    co2s = graph_2[co1]
    for co2 in co2s:
        if co1 not in graph_2[co2]:
            graph[co2].add(co1)

## First and second order combined

In [42]:
graph_1_2 = {}
for name in name_to_num.keys():
    graph_1_2[name] = set()
graph_1_2['SPLP'] = set()

for company in graph_1_2.keys():
    graph_1_2[company].update(graph[company])
    graph_1_2[company].update(graph_2[company])

In [45]:
# Output
with open('./wikidata_adjacency_list_first_and_second_order.txt', 'w') as file:
    for key in graph_1_2.keys():
        file.write(key + ':' + str(graph_1_2[key]) + '\n')

In [47]:
for c in graph.keys():
    print(c, len(graph[c]))

AAPL 5
ABB 4
ABBV 2
AEP 4
AGFS 0
AMGN 3
AMZN 5
BA 4
BABA 3
BAC 4
BBL 0
BCH 4
BHP 3
BP 6
BRK-A 3
BSAC 2
BUD 3
C 4
CAT 5
CELG 4
CHL 5
CHTR 2
CMCSA 5
CODI 0
CSCO 5
CVX 4
D 3
DHR 3
DIS 6
DUK 3
EXC 3
FB 6
GD 4
GE 5
GMRE 3
GOOG 5
HD 3
HON 5
HRG 4
HSBC 4
IEP 2
INTC 4
JNJ 4
JPM 5
KO 5
LMT 4
MA 4
MCD 4
MDT 4
MMM 5
MO 4
MRK 3
MSFT 7
NEE 2
NGG 3
NVS 6
ORCL 5
PCG 4
PCLN 1
PEP 4
PFE 3
PG 5
PICO 1
PM 4
PPL 1
PTR 4
RDS-B 5
REX 2
SLB 3
SNP 4
SNY 5
SO 3
SRE 1
T 2
TM 4
TOT 5
TSM 4
UL 0
UN 5
UNH 3
UPS 4
UTX 3
V 3
VZ 4
WFC 5
WMT 5
XOM 5
SPLP 0
