In [1]:
import os
import pandas as pd
import urllib.request
import json 
from operator import itemgetter
import numpy as np
import requests
import pyreadr
import re
from sentence_transformers import SentenceTransformer
from fact_checking import fact_check_and_add
import graph_utils

  from tqdm.autonotebook import tqdm, trange


Connection to graph database established.


## Neo4J AuraDB Setup

### Resetting database

We reset the database to start with a blank slate.

In [2]:
# Removes all nodes and relationships
graph_utils.reset_graph()

In [3]:
# Removes all indexes and constraints
graph_utils.reset_constraints()

## Schema Constraints

In this section, we define constraints and indexes for the various nodes in our graph.

Key constraints and uniqueness constraints will ensure that nodes can be matched unambiguously.
We also employ semantic indexes for the purposes of node disambiguiation:
1. Full-text indexes allow keywords to be matched (e.g. country name aliases can be used to match to different aliases of a particular country)
2. Vector indexes allow similarity based search (e.g. industry description can be used to match to specific industry nodes)

### Region Node

In [4]:
# m49 is key
graph_utils.execute_query('''
CREATE CONSTRAINT region_m49_key IF NOT EXISTS
FOR (r:Region) REQUIRE r.m49 IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x125c9d790>, keys=[])

In [5]:
# name is unique
graph_utils.execute_query('''
CREATE CONSTRAINT region_name_unique IF NOT EXISTS
FOR (r:Region) REQUIRE r.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610677d0>, keys=[])

In [6]:
# name fulltext index
graph_utils.execute_query('''
CREATE FULLTEXT INDEX region_name_index IF NOT EXISTS
FOR (r:Region) ON EACH [r.name]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16109c5d0>, keys=[])

### Country Node

In [7]:
# iso3 is key
graph_utils.execute_query('''
CREATE CONSTRAINT country_iso3_key IF NOT EXISTS
FOR (c:Country) REQUIRE c.iso3 IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161086bd0>, keys=[])

In [8]:
# iso2 is unique
graph_utils.execute_query('''
CREATE CONSTRAINT country_iso2_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.iso2 IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16109e650>, keys=[])

In [9]:
# name is unique
graph_utils.execute_query('''
CREATE CONSTRAINT country_name_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610a1390>, keys=[])

In [10]:
# aliases fulltext index
graph_utils.execute_query('''
CREATE FULLTEXT INDEX country_aliases_index IF NOT EXISTS
FOR (c:Country) ON EACH [c.aliases]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610a3fd0>, keys=[])

### Sector Node

In [11]:
# gics is key
graph_utils.execute_query('''
CREATE CONSTRAINT sector_gics_key IF NOT EXISTS
FOR (s:Sector) REQUIRE s.gics IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610a1e90>, keys=[])

In [12]:
# name is unique
graph_utils.execute_query('''
CREATE CONSTRAINT country_name_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610b9710>, keys=[])

### Industry Node

In [13]:
# gics is key
graph_utils.execute_query('''
CREATE CONSTRAINT industry_gics_key IF NOT EXISTS
FOR (i:Industry) REQUIRE i.gics IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610b8a90>, keys=[])

In [14]:
# name is unique
graph_utils.execute_query('''
CREATE CONSTRAINT industry_name_unique IF NOT EXISTS
FOR (i:Industry) REQUIRE i.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610a2590>, keys=[])

In [15]:
# description vector index
graph_utils.execute_query('''
CREATE VECTOR INDEX industry_description_index IF NOT EXISTS
FOR (i:Industry)
ON i.embedding
OPTIONS { indexConfig: {
 `vector.quantization.enabled`: false
}}''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16107cfd0>, keys=[])

### Company Node

In [16]:
# ticker is key
graph_utils.execute_query('''CREATE CONSTRAINT company_ticker_key IF NOT EXISTS
FOR (c:Company) REQUIRE c.ticker IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16107ea10>, keys=[])

In [17]:
# names fulltext index
graph_utils.execute_query('''CREATE FULLTEXT INDEX company_names_index IF NOT EXISTS
FOR (c:Company) ON EACH [c.names]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610b9f50>, keys=[])

## Adding Initial Data

### Region Nodes

Our country and region nodes, as well as the IS_IN relationships from countries to regions, are sourced from [UNSD](https://unstats.un.org/unsd/methodology/m49/overview/).

In [18]:
df_m49 = pd.read_csv('data/UNSD_m49.csv', sep=';')

In [19]:
continents = df_m49[['Region Code', 'Region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Region Code': 'm49',
                        'Region Name': 'name'
                    })

In [20]:
subregions = df_m49[['Sub-region Code', 'Sub-region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Sub-region Code': 'm49',
                        'Sub-region Name': 'name'
                    })

In [21]:
itdregions = df_m49[['Intermediate Region Code', 'Intermediate Region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Intermediate Region Code': 'm49',
                        'Intermediate Region Name': 'name'
                    })

In [22]:
regions = pd.concat([continents, subregions, itdregions], ignore_index=True)\
            .astype({'m49': int})

In [23]:
param_dicts = regions.to_dict('records')
param_dicts[:5]

[{'m49': 2, 'name': 'Africa'},
 {'m49': 19, 'name': 'Americas'},
 {'m49': 142, 'name': 'Asia'},
 {'m49': 150, 'name': 'Europe'},
 {'m49': 9, 'name': 'Oceania'}]

In [24]:
graph_utils.execute_query_with_params("MERGE (:Region{m49: $m49, name: $name})",
                                      *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610ca410>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610cac10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610cb550>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610cbf90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610d0c10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610d1850>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610d2450>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610d30d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610d3dd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Country Nodes

In [25]:
countries = df_m49[['ISO-alpha3 Code', 'ISO-alpha2 Code', 'Country or Area']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'ISO-alpha3 Code': 'iso3',
                        'ISO-alpha2 Code': 'iso2',
                        'Country or Area': 'name'
                    })

In [26]:
param_dicts = countries.to_dict('records')
param_dicts[:5]

[{'iso3': 'DZA', 'iso2': 'DZ', 'name': 'Algeria'},
 {'iso3': 'EGY', 'iso2': 'EG', 'name': 'Egypt'},
 {'iso3': 'LBY', 'iso2': 'LY', 'name': 'Libya'},
 {'iso3': 'MAR', 'iso2': 'MA', 'name': 'Morocco'},
 {'iso3': 'SDN', 'iso2': 'SD', 'name': 'Sudan'}]

In [27]:
graph_utils.execute_query_with_params("MERGE (:Country{iso3: $iso3, name: $name, iso2: $iso2})",
                                      *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610ff590>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610ffe90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170c850>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170d2d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170d990>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170e590>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170f1d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16170fe10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161710a50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Country IS_IN Region Relationship

In [28]:
country_continent = df_m49[['ISO-alpha3 Code', 'Region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Region Code': 'm49'
                            })

In [29]:
country_subregion = df_m49[['ISO-alpha3 Code', 'Sub-region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Sub-region Code': 'm49'
                            })

In [30]:
country_itdregion = df_m49[['ISO-alpha3 Code', 'Intermediate Region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Intermediate Region Code': 'm49'
                            })

In [31]:
country_region = pd.concat([country_continent, country_subregion, country_itdregion], ignore_index=True)

In [32]:
param_dicts = country_region.to_dict('records')
param_dicts[:5]

[{'iso3': 'DZA', 'm49': 2.0},
 {'iso3': 'EGY', 'm49': 2.0},
 {'iso3': 'LBY', 'm49': 2.0},
 {'iso3': 'MAR', 'm49': 2.0},
 {'iso3': 'SDN', 'm49': 2.0}]

In [33]:
graph_utils.execute_query_with_params('''
MATCH
    (c:Country{iso3: $iso3}),
    (r:Region{m49: $m49})
MERGE (c)-[:IS_IN]->(r)''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610fcbd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1610fd790>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161853e50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161867510>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161870610>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1618716d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161872750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161873750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16187c7d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Country Aliases Property

Alternative names for countries. Source: [Kaggle](https://www.kaggle.com/datasets/wbdill/country-aliaseslist-of-alternative-country-names)/[Wikipedia](https://en.wikipedia.org/wiki/List_of_alternative_country_names)

In [34]:
df_alias = pd.read_csv('data/country_aliases.csv')

In [35]:
def split_alias(row):
    '''
    Splits a row if Alias contains multiple aliases seperated by " or "
    '''
    if ' or ' in row['Alias']:
        values = row['Alias'].split(' or ')
        return pd.DataFrame({'iso3': [row['iso3']] * len(values), 'Alias': values})
    return pd.DataFrame({'iso3': [row['iso3']], 'Alias': [row['Alias']]})

In [36]:
aliases = pd.concat([split_alias(row) for _, row in df_alias.iterrows()],
                  ignore_index=True)\
        .dropna()\
        .drop_duplicates()\
        .rename(columns={'Alias': 'alias'})

In [37]:
param_dicts = aliases.to_dict('records')
param_dicts[:5]

[{'iso3': 'AFG', 'alias': 'Afghanistan'},
 {'iso3': 'AFG', 'alias': 'Islamic Republic of Afghanistan'},
 {'iso3': 'AFG', 'alias': 'Da Afganistan Islami Jumhoryat'},
 {'iso3': 'AFG', 'alias': 'Jomhuriyyeh Eslamiyyeh Afganestan'},
 {'iso3': 'ALB', 'alias': 'Albania'}]

In [38]:
graph_utils.execute_query_with_params('''
MERGE (c:Country {iso3: $iso3})
SET c.aliases = 
    CASE
        WHEN c.aliases IS NULL THEN [$alias]
        WHEN NOT $alias IN c.aliases THEN c.aliases + $alias
        ELSE c.aliases
    END''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x125dc1e90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161855990>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161856290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161856c90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161857890>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1618644d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161865090>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161865c50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1618667d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Country Stats

Yearly stats for each country are sourced from [Worldbank](data.worldbank.org). Corporate Tax Rates sourced from [Tax Foundation](https://taxfoundation.org/data/all/global/corporate-tax-rates-by-country-2023).

In [39]:
def get_worldbank(indicator: str) -> pd.DataFrame:
    '''
    Get indicator data using worldbank API
    '''
    with urllib.request.urlopen(f"https://api.worldbank.org/v2/country/all/indicator/{indicator}?format=json&per_page=20000") as url:
        data = json.load(url)[1]
    ind = data[0]['indicator']['value']
    iso3 = map(itemgetter('countryiso3code'), data)
    year = map(itemgetter('date'), data)
    value = map(itemgetter('value'), data)
    return pd.DataFrame({
        'iso3': iso3,
        'year': year,
        ind: value
    }).replace('', np.nan)\
      .dropna()\
      .set_index(['iso3', 'year'])

In [40]:
population = get_worldbank('SP.POP.TOTL')

In [41]:
gdp = get_worldbank('NY.GDP.MKTP.CD')

In [42]:
pv = get_worldbank('PV.EST')

In [43]:
ctr = pd.read_excel('data/corp_tax_rate.xlsx')\
        .melt(id_vars='iso_3',
              value_vars=range(1980, 2024),
              var_name='year',
              value_name='corporate_tax_rate')\
        .rename(columns={'iso_3': 'iso3'})\
        .astype({'year': str})\
        .set_index(['iso3', 'year'])

In [44]:
stats = pd.concat([population, gdp, pv, ctr], axis=1).sort_index()\
          .reset_index()\
          .rename(columns={
              'Population, total': 'population',
              'GDP (current US$)': 'gdp',
              'Political Stability and Absence of Violence/Terrorism: Estimate': 'pv',
              'corporate_tax_rate': 'corporate_tax_rate'
          })

We use 2022 stats for now

In [45]:
param_dicts = stats[stats['year'] == '2022'].to_dict('records')
param_dicts[:3]

[{'iso3': 'ABW',
  'year': '2022',
  'population': 106445.0,
  'gdp': 3544707788.05664,
  'pv': 1.47468435764313,
  'corporate_tax_rate': 25.0},
 {'iso3': 'AFE',
  'year': '2022',
  'population': 720859132.0,
  'gdp': 1183962133998.87,
  'pv': nan,
  'corporate_tax_rate': nan},
 {'iso3': 'AFG',
  'year': '2022',
  'population': 41128771.0,
  'gdp': 14502158192.0904,
  'pv': -2.5508017539978,
  'corporate_tax_rate': 20.0}]

In [46]:
graph_utils.execute_query_with_params('''
MATCH (c:Country {iso3: $iso3})
SET
    c.population = $population,
    c.gdp = $gdp,
    c.pv = $pv,
    c.corporate_tax_rate = $corporate_tax_rate''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f6c290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f6cc10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f6d750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f6e290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f6f190>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161d33d10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161d329d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161d2ba90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161d2b6d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Sector Node


The data used for Sector/Industry nodes comes from [bautheac/GICS](https://github.com/bautheac/GICS) packages the Global Industry Classification Standards (GICS) dataset for consumption in R.  The GICS hierarchy begins with 11 sectors and is followed by 24 industry groups, 68 industries, and 157 sub-industries. 

In [47]:
url = 'https://github.com/bautheac/GICS/raw/0c2b0e4c0ca56a0e520301fd978fc095ed4fc328/data/standards.rda'
response = requests.get(url)

rda_file_path = './data/standards.rda'
with open(rda_file_path, 'wb') as file:
    file.write(response.content)

# Load the .rda file using pyreadr
result = pyreadr.read_r(rda_file_path)

print(result.keys())  

df = result[list(result.keys())[0]]  

# Save the DataFrame as a CSV file and remove the rda file
df.to_csv('./data/standards.csv', index=False)

os.remove(rda_file_path)

print("Data has been saved as standards.csv")

odict_keys(['standards'])
Data has been saved as standards.csv


In [48]:
# data wrangling for industry/sector

def wrangling(csv_path):
    df = pd.read_csv(csv_path)
    
    df = df.dropna()

    df = df.drop_duplicates()
    
    df = df.rename(columns={
        'sector id': 'sector_id',
        'sector name': 'sector_name',
        'industry group id': 'industry_group_id',
        'industry group name': 'industry_group_name',
        'industry id': 'industry_id',
        'industry name': 'industry_name',
        'subindustry id': 'subindustry_id',
        'subindustry name': 'subindustry_name',
        'description': 'primary_activity'
    })

    
    df['sector_id'] = df['sector_id'].astype('Int64')  
    df['industry_group_id'] = df['industry_group_id'].astype('Int64')
    df['industry_id'] = df['industry_id'].astype('Int64')
    df['subindustry_id'] = df['subindustry_id'].astype('Int64')

    df.reset_index(drop=True, inplace=True)
    df.index += 1

    return df

df_standards = wrangling("./data/standards.csv")

In [49]:
sector = df_standards[['sector_id', 'sector_name']] \
        .drop_duplicates() \
        .rename(columns={
            'sector_id': 'gics',
            'sector_name': 'name'
        })

In [50]:
param_dicts = sector.to_dict('records')
param_dicts[:5]

[{'gics': 10, 'name': 'Energy'},
 {'gics': 15, 'name': 'Materials'},
 {'gics': 20, 'name': 'Industrials'},
 {'gics': 25, 'name': 'Consumer Discretionary'},
 {'gics': 30, 'name': 'Consumer Staples'}]

In [51]:
graph_utils.execute_query_with_params("MERGE (:Sector{gics: $gics, name: $name})", *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x162744e10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1627441d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x162737cd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x162737410>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16285ecd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16285de90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16285d0d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16285c350>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x161f64450>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Industry Node

In [52]:
industry = df_standards[['subindustry_id', 'subindustry_name', 'primary_activity']] \
           .drop_duplicates() \
           .rename(columns={
               'subindustry_id': 'gics',
               'subindustry_name': 'name',
               'primary_activity': 'description'
            })

We use a sentence embeddings model to generate embeddings for the industry description.

In [53]:
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

In [54]:
industry_desc_embed = EMBEDDING_MODEL.encode(industry['description'].to_numpy())
industry['embedding'] = list(map(list, industry_desc_embed))

In [55]:
param_dicts = industry.to_dict('records')

In [56]:
graph_utils.execute_query_with_params("MERGE (:Industry{gics: $gics, name: $name, description: $description, embedding: $embedding})", *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da40690>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da73b50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da72e90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da724d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da71a50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da70b90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da70150>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da0f3d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16da0e7d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Industry PART_OF Sector Relationship

In [57]:
industry_sector = df_standards[['subindustry_id', 'sector_id']] \
                  .drop_duplicates() \
                  .rename(columns={
                      'subindustry_id': 'industry_gics',
                      'sector_id': 'sector_gics'
                  })

In [58]:
param_dicts = industry_sector.to_dict('records')
param_dicts[:5]

[{'industry_gics': 10101010, 'sector_gics': 10},
 {'industry_gics': 10101020, 'sector_gics': 10},
 {'industry_gics': 10102010, 'sector_gics': 10},
 {'industry_gics': 10102020, 'sector_gics': 10},
 {'industry_gics': 10102030, 'sector_gics': 10}]

In [59]:
graph_utils.execute_query_with_params('''
MATCH
    (i:Industry{gics: $industry_gics}),
    (s:Sector{gics: $sector_gics})
MERGE (i)-[:PART_OF]->(s)''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d8ae190>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d81c890>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d88f7d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d88e990>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d88d810>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d88c750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d87db50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d88c290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d8433d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

## Adding Company Data

### Import JSON files

In this part, import all the json files, `nasdaq_kg_schema.json`, `nasdaq_kg_schema_rank_1-10.json` and `nasdaq_kg_schema_rank_11-32.json`.

In [60]:
json_files = ['nasdaq_kg_schema.json',
              'nasdaq_kg_schema_rank_1-10.json',
              'nasdaq_kg_schema_rank_11-32.json',
              'nasdaq_kg_schema_rank_33-37.json',
              'nasdaq_kg_schema_rank_38-40.json']

def merge_json_files(json_list: list[str]) -> dict:

    merged_json = {
        "nodes": {},
        "relationships": {}
    }

    for file_path in json_list:
        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                
                if "nodes" in data:
                    for node_type, nodes in data["nodes"].items():
                        if node_type not in merged_json["nodes"]:
                            merged_json["nodes"][node_type] = []
                        merged_json["nodes"][node_type].extend(nodes)

               
                if "relationships" in data:
                    for relationship_type, relationships in data["relationships"].items():
                        if relationship_type not in merged_json["relationships"]:
                            merged_json["relationships"][relationship_type] = []
                        merged_json["relationships"][relationship_type].extend(relationships)

            except json.JSONDecodeError as e:
                print(f"Error decoding {file_path}: {e}")
                
            except FileNotFoundError:
                print("Error: The file was not found.")

    return merged_json

merged_json = merge_json_files(json_files)

### Data Validation

* This part is only for company nodes, where we want to ensure data integrity by checking the primary key ticker_code and ensuring other fields like company name adhere to specific formatting rules and constraints. 
* Validating ticker_code as a valid string consisting of 4 to 5 uppercase letters, is for accurate indexing in financial markets.
* The process identifies duplicates and keeps the more-info version of an entry over the other duplicates. 
* Standardizing company names into title case and eliminating special symbols maintains consistency in representation and usability.
* All data entry whose ticker_code and company_name does not meet these criteria will be removed to maintain a clean dataset for the reporting of company information.

In [61]:
def is_valid_ticker(ticker_code):
    """Helper function to check if the ticker code is valid (str, 4 to 5 letters, all upper case)."""
    return isinstance(ticker_code, str) and 4 <= len(ticker_code) <= 5 and ticker_code.isupper()

def remove_invalid_ticker_companies(data):
    """Remove companies whose ticker_code doesn't meet the 3, 4, or 5 letter criteria."""
    if isinstance(data, str):
        print("Warning: data is a string, attempting to load as JSON.")
        data = json.loads(data)  
    
    if "nodes" in data and "Company" in data["nodes"]:
        companies = data["nodes"]["Company"]
        filtered_companies = [company for company in companies if is_valid_ticker(company.get("ticker_code"))]
        data["nodes"]["Company"] = filtered_companies
    else:
        print("Warning: The expected structure is not found in the data.")

    return data

def is_more_comprehensive(entry1, entry2):
    """Helper function to determine which duplicate has more comprehensive details."""
    return sum(1 for v in entry1.values() if v) > sum(1 for v in entry2.values() if v)

def remove_duplicates(data):
    seen_tickers = {}
    for company in data["nodes"]["Company"]:   
        ticker = company.get("ticker_code", "")
        if ticker in seen_tickers:
            if is_more_comprehensive(company, seen_tickers[ticker]):
                seen_tickers[ticker] = company
        else:
            seen_tickers[ticker] = company 
    data["nodes"]["Company"] = list(seen_tickers.values())
    return data

def clean_names(name):
    name = name.title()
    name = re.sub(r'[^\w\s]', '', name)
    return name

def standarize_case(data):
    """function to standardize title case and no other special."""
    for company in data["nodes"]["Company"]:
        company["name"] = clean_names(company["name"])
    return data

In [62]:
def company_validate(data): 
    data = remove_invalid_ticker_companies(data)
    # data = remove_duplicates(data)
    data = standarize_case(data)
    print("Completed.")
    return data

validated_data = company_validate(merged_json)

Completed.


### Adding Company Nodes

In [63]:
companies = validated_data['nodes']['Company']
for company in companies:
    company['founded_year'] = company['founded_year'] or ""
companies[:5]

[{'name': 'Apple Inc', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Airpods', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Major League Soccer Mls',
  'ticker_code': 'MLFB',
  'founded_year': ''}]

In [64]:
graph_utils.execute_query_with_params('''
MERGE (c:Company {ticker: $ticker_code})
SET c.names = 
    CASE
        WHEN c.names IS NULL THEN [$name]
        WHEN NOT $name IN c.names THEN c.names + $name
        ELSE c.names
    END,
    c.founded_year = $founded_year''', *companies)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d60b810>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d60aad0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d609c90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d6085d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d5cc910>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16db910d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16db92850>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d5a1950>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16d5a07d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16

### Adding Company relationships

For the purposes of consistency checking later on, we prepare the company relationship data and store the edges in a list. We use the fulltext and vector indexes we created to match the data to the correct nodes.

In [65]:
all_edges = []

### Company-Industry relationship

One of the key relationships defined in the data is "IS_INVOLVED_IN", which helps in modeling how companies are categorized based on their primary activities and the sectors they contribute to.


"IS_INVOLVED_IN": [
            {
                "company_name": "McKinsey &#38; Company",
                "industry_name": "management consulting"}]

In [66]:
company_industries = validated_data['relationships']['IS_INVOLVED_IN']
company_industries[:5]

[{'company_name': 'Apple', 'industry_name': 'accounting'},
 {'company_name': 'Apple', 'industry_name': 'technology'},
 {'company_name': 'Apple', 'industry_name': 'information technology'},
 {'company_name': 'Brightstar Corporation',
  'industry_name': 'wireless device services'},
 {'company_name': 'Applied Materials, Inc.',
  'industry_name': 'semiconductor equipment'}]

In [67]:
for company_industry in company_industries:
    industry_name = company_industry['industry_name']
    company_industry['company_name'] = clean_names(company_industry['company_name'])
    company_industry['embedding'] = EMBEDDING_MODEL.encode(industry_name).tolist()

In [68]:
is_involved_in_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name)
    YIELD node AS c, score AS company_score
CALL db.index.vector.queryNodes('industry_description_index', 10, $embedding)
    YIELD node AS i, score AS industry_score
WHERE company_score > 1
AND industry_score > 0.7
RETURN
    c.ticker AS ticker,
    i.gics AS gics""", *company_industries)

for records, _, _ in is_involved_in_edges:
    for ticker, gics in records:
        all_edges.append((ticker, gics, "Company", "Industry", "IS_INVOLVED_IN", {}))

### Company-Country relationships

One of the key relationships defined in the data is "HEADQUARTERS_IN" and "OPERATES_IN", which helps in modeling how companies are categorized based on their countries.
* "HEADQUARTERS_IN" relationship signifies that a company has its headquarters located in a specific country. It helps in understanding the geographical base of the company and can have implications for regulatory compliance, market strategy, and more.
* "OPERATES_IN" relationship indicates that a company conducts its business operations in a specific country, which can include selling products, providing services, or having a physical presence (like branches or subsidiaries).

In [69]:
headquarters_data = validated_data['relationships']['HEADQUARTERS_IN']
for entry in headquarters_data:
    entry['company_name'] = clean_names(entry['company_name'])
headquarters_data[:5]

[{'company_name': 'Emea', 'country_name': 'Ireland'},
 {'company_name': 'Apple', 'country_name': 'United States'},
 {'company_name': 'Apple', 'country_name': 'United States'},
 {'company_name': 'Zhengzhou Technology Park', 'country_name': 'China'},
 {'company_name': 'Adobe', 'country_name': 'United States'}]

In [70]:
headquarters_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name)
    YIELD node AS company
CALL db.index.fulltext.queryNodes('country_aliases_index', $country_name)
    YIELD node AS country   
RETURN company.ticker AS ticker, country.iso3 AS iso3""", *headquarters_data)

for records, _, _ in headquarters_edges:
    for ticker, iso3 in records:
        all_edges.append((ticker, iso3, "Company", "Country", "HEADQUARTERS_IN", {}))

In [71]:
operates_data = validated_data['relationships']['OPERATES_IN_COUNTRY']
for entry in operates_data:
    entry['company_name'] = clean_names(entry['company_name'])
    entry['net_sales'] = entry.pop('net sales')
operates_data[:5]

[{'company_name': 'Apple',
  'country_name': 'Ireland',
  'headcount': 7343,
  'net_sales': 29831428},
 {'company_name': 'Apple',
  'country_name': 'Ireland',
  'headcount': 1665,
  'net_sales': -24931076},
 {'company_name': 'Apple',
  'country_name': 'United States',
  'headcount': 1585,
  'net_sales': -12538324},
 {'company_name': 'Apple',
  'country_name': 'United States',
  'headcount': 9447,
  'net_sales': -20940672},
 {'company_name': 'Emea',
  'country_name': 'Ireland',
  'headcount': 5825,
  'net_sales': -21823981}]

In [72]:
operates_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name)
    YIELD node AS company
CALL db.index.fulltext.queryNodes('country_aliases_index', $country_name)
    YIELD node AS country   
RETURN
    company.ticker AS ticker,
    country.iso3 AS iso3,
    $headcount AS headcount,
    $net_sales AS net_sales""", *operates_data)

for records, _, _ in operates_edges:
    for ticker, iso3, headcount, net_sales in records:
        all_edges.append((ticker, iso3, "Company", "Country", "OPERATES_IN",
                          {'headcount': headcount, 'net_sales': net_sales}))

### Company-Company relationships

The key relationships are "COMPETES_WITH" and "SUBSIDIARY_OF".
* "COMPETES_WITH" relationship is used to represent the competitive dynamics between companies. When two companies operate in the same industry or market and offer similar products or services, they are considered competitors.
* "SUBSIDIARY_OF" relationship indicates that one company is a subsidiary of another, meaning it is controlled or owned by a parent company. This is a common structure in corporate hierarchies.

In [73]:
company_competes = validated_data['relationships']['COMPETES_WITH']
for entry in company_competes:
    entry['company_name_1'] = clean_names(entry['company_name_1'])
    entry['company_name_2'] = clean_names(entry['company_name_2'])
company_competes[:5]

[{'company_name_1': 'Be Inc', 'company_name_2': 'Gassée', 'type': None},
 {'company_name_1': 'Gassée', 'company_name_2': 'Be Inc', 'type': None},
 {'company_name_1': 'Microsoft', 'company_name_2': 'Apple', 'type': None},
 {'company_name_1': 'Apple', 'company_name_2': 'Microsoft', 'type': None},
 {'company_name_1': 'Adobe', 'company_name_2': 'Shopify', 'type': None}]

In [74]:
competes_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_1)
    YIELD node AS company1
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_2)
    YIELD node AS company2
RETURN company1.ticker AS ticker1, company2.ticker AS ticker2""", *company_competes)

for records, _, _ in competes_edges:
    for ticker1, ticker2 in records:
        all_edges.append((ticker1, ticker2, "Company", "Company", "COMPETES_WITH", {}))

In [75]:
company_subsidiary = validated_data['relationships']['SUBSIDIARY_OF']
for entry in company_subsidiary:
    entry['company_name_1'] = clean_names(entry['company_name_1'])
    entry['company_name_2'] = clean_names(entry['company_name_2'])
company_subsidiary[:5]

[{'company_name_1': 'Flurry Analytics',
  'company_name_2': 'Verizon',
  'type': 'subsidiary'},
 {'company_name_1': 'Apple Energy Llc',
  'company_name_2': 'Apple Inc',
  'type': 'subsidiary'},
 {'company_name_1': 'Ace American Insurance Company',
  'company_name_2': 'Chubb Limited',
  'type': 'subsidiary'},
 {'company_name_1': 'Adp Indemnity',
  'company_name_2': 'Chubb Limited',
  'type': 'subsidiary'},
 {'company_name_1': 'Arm',
  'company_name_2': 'Arm Holdings',
  'type': 'subsidiary'}]

In [76]:
subsidiary_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_1)
    YIELD node AS company1
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_2)
    YIELD node AS company2
RETURN company1.ticker AS ticker1, company2.ticker AS ticker2""", *company_subsidiary)

for records, _, _ in subsidiary_edges:
    for ticker1, ticker2 in records:
        all_edges.append((ticker1, ticker2, "Company", "Company", "SUBSIDIARY_OF", {}))

In [77]:
company_supplies = validated_data['relationships']['PARTNERS_WITH']
for entry in company_supplies:
    entry['company_name_1'] = clean_names(entry['company_name_1'])
    entry['company_name_2'] = clean_names(entry['company_name_2'])
company_supplies[:5]

[{'company_name_1': 'Apple',
  'company_name_2': 'Nextstep',
  'type': 'suppliers'},
 {'company_name_1': 'Apple',
  'company_name_2': 'Lens Technology',
  'type': 'suppliers'},
 {'company_name_1': 'Apple',
  'company_name_2': 'Lens Technology',
  'type': 'suppliers'},
 {'company_name_1': 'Chubb Limited',
  'company_name_2': 'Adp Indemnity',
  'type': 'suppliers'},
 {'company_name_1': 'Chubb Limited',
  'company_name_2': 'Adp Indemnity',
  'type': 'suppliers'}]

In [78]:
subsidiary_edges = graph_utils.execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_1)
    YIELD node AS company1
CALL db.index.fulltext.queryNodes('company_names_index', $company_name_2)
    YIELD node AS company2
RETURN company1.ticker AS ticker1, company2.ticker AS ticker2""", *company_supplies)

for records, _, _ in subsidiary_edges:
    for ticker1, ticker2 in records:
        all_edges.append((ticker1, ticker2, "Company", "Company", "SUPPLIES_TO", {}))

### Consistency Checking

In this section, we perform consistency checking of the edges we have gathered before adding them to the graph. We define that an edge is consistent with the current graph, if there is a positive evidence (pattern) in the graph that supports the edge.

We make use of the Graph Fact Checking rules (GFCs) mining algorithm introduced by this [paper](https://github.com/001waiyan/GDRB/blob/master/2018-DASFAA-GFC-paper.pdf). We created a [fork](https://github.com/001waiyan/GDRB) of the paper's repository for the purposes of integration with the project. In this fork, we set up the FactChecker API, which performs the following:
1. Generate patterns (GFCs) for the given relation and input graph.
2. Checks each input edge against each found pattern.

The GFC mining algorithm relies on Principal Closed World Assumption (PCWA) of the input graph, i.e. if the graph has at least one edge (V1)-[:R]->(V2), we assume that we have complete information of all (V1)-[:R]->(Vx). For example if the graph has two competitors of Apple, (Apple)-[:COMPETES_WITH]->(Google) and (Apple)-[:Competes_With]->(Samsung), anything not in the graph (e.g. (Apple)-[:COMPETES_WITH]->(Meta)) is considered false.

In this section, we systematically fact check and add to the Neo4j database the list of edges, while maintaining PCWA of the graph. The edges to add are grouped by same source node and relation type (e.g. all COMPETES_WITH edges for the Apple node). Each group is fact checked using the FactChecker API and added one at a time to maintain PCWA.

Algorithm used:
1. Group the edges by the same source node and relation type.
2. Add all groups that contain duplicate edges to the graph, without fact checking. (Assuming that these edges are more likely to be consistent)
3. For the remaining groups, run the FactChecker API on each of them.
4. If an edge was matched by at least one found pattern, add that edge to the graph.
5. If no patterns were found for the group, add all its edges to the graph.

In [79]:
patterns = fact_check_and_add(all_edges, min_supp=0.5, min_conf=0.1, top_k=50, max_size=2)

Processing group: ('AAPL', 'Company', 'Industry', 'IS_INVOLVED_IN')
10 edges found in group.
Adding 5 unique edges.

Processing group: ('KLAC', 'Company', 'Industry', 'IS_INVOLVED_IN')
144 edges found in group.
Adding 57 unique edges.

Processing group: ('CMCSA', 'Company', 'Industry', 'IS_INVOLVED_IN')
126 edges found in group.
Adding 49 unique edges.

Processing group: ('AMAT', 'Company', 'Industry', 'IS_INVOLVED_IN')
109 edges found in group.
Adding 31 unique edges.

Processing group: ('ADBE', 'Company', 'Industry', 'IS_INVOLVED_IN')
8 edges found in group.
Adding 4 unique edges.

Processing group: ('AMZN', 'Company', 'Industry', 'IS_INVOLVED_IN')
105 edges found in group.
Adding 37 unique edges.

Processing group: ('NVDA', 'Company', 'Industry', 'IS_INVOLVED_IN')
62 edges found in group.
Adding 21 unique edges.

Processing group: ('AMGN', 'Company', 'Industry', 'IS_INVOLVED_IN')
5 edges found in group.
Adding 3 unique edges.

Processing group: ('GOOG', 'Company', 'Industry', 'IS_IN

#### Interpretation of Mined Patterns

The statement "Company A COMPETES_WITH Company B" is considered consistent if at least one of the following is true:
1. Company A and B headquarter and operate in the same country
2. Company A headquarters and operates in a country that company B is operating in
3. Company A and B are involved in the same industry
4. Company A operates in a country that company B is headquartering and operating in
5. Company A and B operate in the same country

In [80]:
patterns[('Company', 'Company', 'COMPETES_WITH')]

{Pattern((Company1)-[:HEADQUARTERS_IN]->(Country3),
 (Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:HEADQUARTERS_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:HEADQUARTERS_IN]->(Country3),
 (Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:IS_INVOLVED_IN]->(Industry3),
 (Company2)-[:IS_INVOLVED_IN]->(Industry3)),
 Pattern((Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:HEADQUARTERS_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3))}

The statement "Company A SUBSIDIARY_OF Company B" is considered consistent if at least one of the following is true:
1. Company A and B headquarter and operate in the same country
2. Company A headquarters and operates in a country that company B is operating in
3. Company A and B are involved in the same industry
4. Company A operates in a country that company B is headquartering and operating in
5. Company A and B operate in the same country

In [82]:
patterns[('Company', 'Company', 'SUBSIDIARY_OF')]

{Pattern((Company1)-[:HEADQUARTERS_IN]->(Country3),
 (Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:HEADQUARTERS_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:HEADQUARTERS_IN]->(Country3),
 (Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:IS_INVOLVED_IN]->(Industry3),
 (Company2)-[:IS_INVOLVED_IN]->(Industry3)),
 Pattern((Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:HEADQUARTERS_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3)),
 Pattern((Company1)-[:OPERATES_IN]->(Country3),
 (Company2)-[:OPERATES_IN]->(Country3))}