## TrumpWorld - analysing companies by country

I'll try to cross-reference the TrumpWorld dataset (specifically, the org-org table) with OpenCorporates.com to gather as much national data as possible for these companies and assign each of them a country.

In [3]:
import sys
import json
import requests
import urllib2 as url
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

In [4]:
# Load the two TrumpWorld tables straight from data.world:
# organisation-to-organisation links and person-to-organisation links.
ORGORG_CSV_URL = 'https://query.data.world/s/94g9v4tj3pzz3ir495y8s3esl'
PERORG_CSV_URL = 'https://query.data.world/s/50ivsaqkos6vzdf4m51ntq7qc'

tw_orgorg = pd.read_csv(ORGORG_CSV_URL)
tw_perorg = pd.read_csv(PERORG_CSV_URL)

In [5]:
# Distinct organisation names found on either side of an org-org link
org_a_uni = tw_orgorg['Organization A'].unique()
org_b_uni = tw_orgorg['Organization B'].unique()

# Distinct organisations linked directly to Donald J. Trump
is_trump = tw_perorg.Person == 'DONALD J. TRUMP'
org_c_uni = tw_perorg.loc[is_trump, 'Organization'].unique()

# Pool the three lists, de-duplicate through a set, and return them sorted
pooled_names = np.concatenate((org_a_uni, org_b_uni, org_c_uni))
org_uni = np.sort(np.array(list(set(pooled_names))))

In [6]:
# Now for each of them run a query on OpenCorporates.com
query_header = 'https://api.opencorporates.com/v0.4/companies/search'
ocorp_data = {}


def _pick_country(org_name, companies):
    """Return the registered country of the search hit that best matches org_name.

    Parameters
    ----------
    org_name : str
        Organisation name as it appears in the TrumpWorld data.
    companies : list
        The ``results -> companies`` list from an OpenCorporates search
        response; each item is a dict with a ``'company'`` sub-dict.

    Returns
    -------
    The ``country`` value of the chosen company's ``registered_address``, or
    None when there is no usable hit or no country information.
    """
    if len(companies) > 1:
        # With several candidates, discard the dissolved ones. A lone hit is
        # kept even if dissolved, since it is the only information available.
        companies = [c for c in companies
                     if c['company'].get('dissolution_date') is None]
    if not companies:
        # Nothing found, or every candidate was dissolved.
        return None

    if len(companies) == 1:
        best = companies[0]
    else:
        # Fuzzy-match names; ties resolve to the first hit.
        # NOTE: needs improvement as sometimes multiple matches can have the
        # same exact name...
        scores = [SequenceMatcher(None, org_name, c['company'].get('name') or '').ratio()
                  for c in companies]
        best = companies[int(np.argmax(scores))]

    # But do we have address info?
    address = best['company'].get('registered_address')
    if not address:
        return None
    return address.get('country')


for i, ou in enumerate(org_uni):
    sys.stdout.write("\rOrganisation {0:2d}/{1}".format(i+1, len(org_uni)))
    sys.stdout.flush()
    # Normalise whitespace only. requests URL-encodes the parameter itself, so
    # pre-joining the words with '+' would send literal plus signs (%2B).
    ou_q = ' '.join(ou.lower().split())
    try:
        r = requests.get(query_header, params={'q': ou_q}, timeout=30)
        r.raise_for_status()
        resp = json.loads(r.text)
    except (requests.RequestException, ValueError):
        # Network failure, HTTP error status or invalid JSON: skip this one.
        # KeyboardInterrupt is deliberately not caught so the loop can be stopped.
        continue
    try:
        comp_list = resp['results']['companies']
    except (KeyError, TypeError):
        # Well-formed JSON without the expected structure (e.g. an error payload)
        continue
    country = _pick_country(ou, comp_list)
    if country is None:
        continue
    ocorp_data[ou] = country

Organisation  2/965

### Distance analysis

In order to better interpret the results, let's estimate how far each company is from Donald J. Trump, measured as the number of links on the shortest path between them in the holdings graph.

In [78]:
import networkx as nx

# Undirected graph: one node for Donald J. Trump plus one per organisation
holdings_graph = nx.Graph()
holdings_graph.add_node('DONALD J. TRUMP')
holdings_graph.add_nodes_from(org_uni)

# Edges from Donald J. Trump to every organisation listed against him
trump_rows = tw_perorg[tw_perorg.Person == 'DONALD J. TRUMP']
holdings_graph.add_edges_from(
    ('DONALD J. TRUMP', org) for org in trump_rows.Organization
)

# Edges for every organisation-to-organisation link
holdings_graph.add_edges_from(
    zip(tw_orgorg['Organization A'], tw_orgorg['Organization B'])
)

In [80]:
# Now compute distances
# A single breadth-first search from the Trump node gives the hop count to
# every reachable node, instead of one shortest-path search per organisation.
# dict() is applied because some networkx releases return an iterator of
# (node, length) pairs rather than a dict.
reachable = dict(
    nx.single_source_shortest_path_length(holdings_graph, 'DONALD J. TRUMP')
)

# Organisations with no path to Trump are absent from `reachable` and map to None
dist_table = {ou: reachable.get(ou) for ou in org_uni}