In [162]:
import dotenv
import os
from neo4j import GraphDatabase, EagerResult
import pandas as pd
import urllib.request, json 
from operator import itemgetter
import numpy as np
from collections.abc import Iterable
from itertools import repeat
import requests
import pyreadr
import re
from sentence_transformers import SentenceTransformer

## Neo4J AuraDB Setup

In [163]:
load_status = dotenv.load_dotenv("Neo4j-fccfe306-Created-2024-09-24.txt")
if load_status is False:
    raise RuntimeError('Environment variables not loaded.')

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")

Connection established.


### Helper functions

In [164]:
def execute_query(query: str) -> EagerResult:
    '''
    Executes a query without any parameters
    '''
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        result = driver.execute_query(query)
    
    return result

In [165]:
def execute_query_with_params(query: str,
                              *param_dicts: dict[str, str]) -> list[EagerResult]:
    '''
    Executes a given query with each param_dict in param_dicts.
    Transaction based - All queries must be successful for changes to be committed.
    '''
    results = []
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        with driver.session(database="neo4j") as session:
            with session.begin_transaction() as tx:
                for param_dict in param_dicts:
                    result = tx.run(query, param_dict)
                    results.append(result.to_eager_result())
                tx.commit()
    return results

### Resetting database

In [166]:
def reset_graph():
    '''
    Deletes all nodes and relationshipss
    '''
    execute_query("MATCH (n) DETACH DELETE n")

reset_graph()

In [167]:
def reset_constraints():
    '''
    Deletes all constraints and indexes
    '''
    execute_query("CALL apoc.schema.assert({},{},true) YIELD label, key RETURN *")

reset_constraints()

## Schema Constraints

### Region Node

In [168]:
# m49 is key
execute_query('''
CREATE CONSTRAINT region_m49_key IF NOT EXISTS
FOR (r:Region) REQUIRE r.m49 IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b989c450>, keys=[])

In [169]:
# name is unique
execute_query('''
CREATE CONSTRAINT region_name_unique IF NOT EXISTS
FOR (r:Region) REQUIRE r.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a00d0>, keys=[])

In [170]:
# name fulltext index
execute_query('''
CREATE FULLTEXT INDEX region_name_index IF NOT EXISTS
FOR (r:Region) ON EACH [r.name]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b9877150>, keys=[])

### Country Node

In [171]:
# iso3 is key
execute_query('''
CREATE CONSTRAINT country_iso3_key IF NOT EXISTS
FOR (c:Country) REQUIRE c.iso3 IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b989ca50>, keys=[])

In [172]:
# iso2 is unique
execute_query('''
CREATE CONSTRAINT country_iso2_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.iso2 IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a2b10>, keys=[])

In [173]:
# name is unique
execute_query('''
CREATE CONSTRAINT country_name_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a0410>, keys=[])

In [174]:
# aliases fulltext index
execute_query('''
CREATE FULLTEXT INDEX country_aliases_index IF NOT EXISTS
FOR (c:Country) ON EACH [c.aliases]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b989ea10>, keys=[])

### Sector Node

In [175]:
# gics is key
execute_query('''
CREATE CONSTRAINT sector_gics_key IF NOT EXISTS
FOR (s:Sector) REQUIRE s.gics IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a6e50>, keys=[])

In [176]:
# name is unique
execute_query('''
CREATE CONSTRAINT country_name_unique IF NOT EXISTS
FOR (c:Country) REQUIRE c.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b987cb90>, keys=[])

### Industry Node

In [177]:
# gics is key
execute_query('''
CREATE CONSTRAINT industry_gics_key IF NOT EXISTS
FOR (i:Industry) REQUIRE i.gics IS NODE KEY''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b9887190>, keys=[])

In [178]:
# name is unique
execute_query('''
CREATE CONSTRAINT industry_name_unique IF NOT EXISTS
FOR (i:Industry) REQUIRE i.name IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98af1d0>, keys=[])

In [179]:
# description vector index
execute_query('''
CREATE VECTOR INDEX industry_description_index IF NOT EXISTS
FOR (i:Industry)
ON i.embedding
OPTIONS { indexConfig: {
 `vector.quantization.enabled`: false
}}''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b6a5e910>, keys=[])

### Company Node

In [180]:
# ticker_code is unique
execute_query('''CREATE CONSTRAINT company_code_unique IF NOT EXISTS
FOR (c:Company) REQUIRE c.ticker_code IS UNIQUE''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a6210>, keys=[])

In [181]:
# names fulltext index
execute_query('''CREATE FULLTEXT INDEX company_names_index IF NOT EXISTS
FOR (c:Company) ON EACH [c.names]''')

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a3d10>, keys=[])

## Adding Initial Data

### Region Nodes

Our country and region nodes, as well as the IS_IN relationships from countries to regions, are sourced from [UNSD](https://unstats.un.org/unsd/methodology/m49/overview/).

In [182]:
df_m49 = pd.read_csv('data/UNSD_m49.csv', sep=';')

In [183]:
continents = df_m49[['Region Code', 'Region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Region Code': 'm49',
                        'Region Name': 'name'
                    })

In [184]:
subregions = df_m49[['Sub-region Code', 'Sub-region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Sub-region Code': 'm49',
                        'Sub-region Name': 'name'
                    })

In [185]:
itdregions = df_m49[['Intermediate Region Code', 'Intermediate Region Name']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'Intermediate Region Code': 'm49',
                        'Intermediate Region Name': 'name'
                    })

In [186]:
regions = pd.concat([continents, subregions, itdregions], ignore_index=True)\
            .astype({'m49': int})

In [187]:
param_dicts = regions.to_dict('records')
param_dicts[:5]

[{'m49': 2, 'name': 'Africa'},
 {'m49': 19, 'name': 'Americas'},
 {'m49': 142, 'name': 'Asia'},
 {'m49': 150, 'name': 'Europe'},
 {'m49': 9, 'name': 'Oceania'}]

In [188]:
execute_query_with_params("MERGE (:Region{m49: $m49, name: $name})",
                          *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b9887010>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98d3b50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98c8950>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a0750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a5390>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98a64d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98adbd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98ac110>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98ad9d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Country Nodes

In [189]:
countries = df_m49[['ISO-alpha3 Code', 'ISO-alpha2 Code', 'Country or Area']]\
                    .dropna()\
                    .drop_duplicates()\
                    .rename(columns={
                        'ISO-alpha3 Code': 'iso3',
                        'ISO-alpha2 Code': 'iso2',
                        'Country or Area': 'name'
                    })

In [190]:
param_dicts = countries.to_dict('records')
param_dicts[:5]

[{'iso3': 'DZA', 'iso2': 'DZ', 'name': 'Algeria'},
 {'iso3': 'EGY', 'iso2': 'EG', 'name': 'Egypt'},
 {'iso3': 'LBY', 'iso2': 'LY', 'name': 'Libya'},
 {'iso3': 'MAR', 'iso2': 'MA', 'name': 'Morocco'},
 {'iso3': 'SDN', 'iso2': 'SD', 'name': 'Sudan'}]

In [191]:
execute_query_with_params("MERGE (:Country{iso3: $iso3, name: $name, iso2: $iso2})",
                          *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f3410>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f3d90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98e6dd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98e5450>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f8c10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f95d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f9f50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98faad0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98fb6d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Country IS_IN Region Relationship

In [192]:
country_continent = df_m49[['ISO-alpha3 Code', 'Region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Region Code': 'm49'
                            })

In [193]:
country_subregion = df_m49[['ISO-alpha3 Code', 'Sub-region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Sub-region Code': 'm49'
                            })

In [194]:
country_itdregion = df_m49[['ISO-alpha3 Code', 'Intermediate Region Code']]\
                            .dropna()\
                            .drop_duplicates()\
                            .rename(columns={
                                'ISO-alpha3 Code': 'iso3',
                                'Intermediate Region Code': 'm49'
                            })

In [195]:
country_region = pd.concat([country_continent, country_subregion, country_itdregion], ignore_index=True)

In [196]:
param_dicts = country_region.to_dict('records')
param_dicts[:5]

[{'iso3': 'DZA', 'm49': 2.0},
 {'iso3': 'EGY', 'm49': 2.0},
 {'iso3': 'LBY', 'm49': 2.0},
 {'iso3': 'MAR', 'm49': 2.0},
 {'iso3': 'SDN', 'm49': 2.0}]

In [197]:
execute_query_with_params('''
MATCH
    (c:Country{iso3: $iso3}),
    (r:Region{m49: $m49})
MERGE (c)-[:IS_IN]->(r)''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3356f9210>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f0f50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b98f1dd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b0bd50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b08050>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b0a650>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b08d50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b5e990>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b5e0d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Country Aliases Property

Alternative names for countries. Source: [Kaggle](https://www.kaggle.com/datasets/wbdill/country-aliaseslist-of-alternative-country-names)/[Wikipedia](https://en.wikipedia.org/wiki/List_of_alternative_country_names)

In [198]:
df_alias = pd.read_csv('data/country_aliases.csv')

In [199]:
def split_alias(row):
    '''
    Splits a row if Alias contains multiple aliases seperated by " or "
    '''
    if ' or ' in row['Alias']:
        values = row['Alias'].split(' or ')
        return pd.DataFrame({'iso3': [row['iso3']] * len(values), 'Alias': values})
    return pd.DataFrame({'iso3': [row['iso3']], 'Alias': [row['Alias']]})

In [200]:
aliases = pd.concat([split_alias(row) for _, row in df_alias.iterrows()],
                  ignore_index=True)\
        .dropna()\
        .drop_duplicates()\
        .rename(columns={'Alias': 'alias'})

In [201]:
param_dicts = aliases.to_dict('records')
param_dicts[:5]

[{'iso3': 'AFG', 'alias': 'Afghanistan'},
 {'iso3': 'AFG', 'alias': 'Islamic Republic of Afghanistan'},
 {'iso3': 'AFG', 'alias': 'Da Afganistan Islami Jumhoryat'},
 {'iso3': 'AFG', 'alias': 'Jomhuriyyeh Eslamiyyeh Afganestan'},
 {'iso3': 'ALB', 'alias': 'Albania'}]

In [202]:
execute_query_with_params('''
MERGE (c:Country {iso3: $iso3})
SET c.aliases = 
    CASE
        WHEN c.aliases IS NULL THEN [$alias]
        WHEN NOT $alias IN c.aliases THEN c.aliases + $alias
        ELSE c.aliases
    END''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b73fb310>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x33570a350>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b9d590>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b9edd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b9cb50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b9c550>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b2bc90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b29750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7b2a8d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Country Stats

Yearly stats for each country are sourced from [Worldbank](data.worldbank.org). Corporate Tax Rates sourced from [Tax Foundation](https://taxfoundation.org/data/all/global/corporate-tax-rates-by-country-2023).

In [203]:
def get_worldbank(indicator: str) -> pd.DataFrame:
    '''
    Get indicator data using worldbank API
    '''
    with urllib.request.urlopen(f"https://api.worldbank.org/v2/country/all/indicator/{indicator}?format=json&per_page=20000") as url:
        data = json.load(url)[1]
    ind = data[0]['indicator']['value']
    iso3 = map(itemgetter('countryiso3code'), data)
    year = map(itemgetter('date'), data)
    value = map(itemgetter('value'), data)
    return pd.DataFrame({
        'iso3': iso3,
        'year': year,
        ind: value
    }).replace('', np.nan)\
      .dropna()\
      .set_index(['iso3', 'year'])

In [204]:
population = get_worldbank('SP.POP.TOTL')

In [205]:
gdp = get_worldbank('NY.GDP.MKTP.CD')

In [206]:
pv = get_worldbank('PV.EST')

In [207]:
ctr = pd.read_excel('data/corp_tax_rate.xlsx')\
        .melt(id_vars='iso_3',
              value_vars=range(1980, 2024),
              var_name='year',
              value_name='corporate_tax_rate')\
        .rename(columns={'iso_3': 'iso3'})\
        .astype({'year': str})\
        .set_index(['iso3', 'year'])

In [208]:
stats = pd.concat([population, gdp, pv, ctr], axis=1).sort_index()\
          .reset_index()\
          .rename(columns={
              'Population, total': 'population',
              'GDP (current US$)': 'gdp',
              'Political Stability and Absence of Violence/Terrorism: Estimate': 'pv',
              'corporate_tax_rate': 'corporate_tax_rate'
          })

We use 2022 stats for now

In [209]:
param_dicts = stats[stats['year'] == '2022'].to_dict('records')
param_dicts[:3]

[{'iso3': 'ABW',
  'year': '2022',
  'population': 106445.0,
  'gdp': 3544707788.05664,
  'pv': 1.47468435764313,
  'corporate_tax_rate': 25.0},
 {'iso3': 'AFE',
  'year': '2022',
  'population': 720859132.0,
  'gdp': 1183962133998.87,
  'pv': nan,
  'corporate_tax_rate': nan},
 {'iso3': 'AFG',
  'year': '2022',
  'population': 41128771.0,
  'gdp': 14502158192.0904,
  'pv': -2.5508017539978,
  'corporate_tax_rate': 20.0}]

In [210]:
execute_query_with_params('''
MATCH (c:Country {iso3: $iso3})
SET
    c.population = $population,
    c.gdp = $gdp,
    c.pv = $pv,
    c.corporate_tax_rate = $corporate_tax_rate''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4e6d90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4e7810>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4e8310>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4e8e90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4e9d90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4eac90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bb4ebad0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc3be50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc3b150>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Sector Node


The data used for Sector/Industry nodes comes from [bautheac/GICS](https://github.com/bautheac/GICS) packages the Global Industry Classification Standards (GICS) dataset for consumption in R.  The GICS hierarchy begins with 11 sectors and is followed by 24 industry groups, 68 industries, and 157 sub-industries. 

In [211]:
url = 'https://github.com/bautheac/GICS/raw/0c2b0e4c0ca56a0e520301fd978fc095ed4fc328/data/standards.rda'
response = requests.get(url)

rda_file_path = './data/standards.rda'
with open(rda_file_path, 'wb') as file:
    file.write(response.content)

# Load the .rda file using pyreadr
result = pyreadr.read_r(rda_file_path)

print(result.keys())  

df = result[list(result.keys())[0]]  

# Save the DataFrame as a CSV file and remove the rda file
df.to_csv('./data/standards.csv', index=False)

os.remove(rda_file_path)

print("Data has been saved as standards.csv")

odict_keys(['standards'])
Data has been saved as standards.csv


In [212]:
# data wrangling for industry/sector

def wrangling(csv_path):
    df = pd.read_csv(csv_path)
    
    df = df.dropna()

    df = df.drop_duplicates()
    
    df = df.rename(columns={
        'sector id': 'sector_id',
        'sector name': 'sector_name',
        'industry group id': 'industry_group_id',
        'industry group name': 'industry_group_name',
        'industry id': 'industry_id',
        'industry name': 'industry_name',
        'subindustry id': 'subindustry_id',
        'subindustry name': 'subindustry_name',
        'description': 'primary_activity'
    })

    
    df['sector_id'] = df['sector_id'].astype('Int64')  
    df['industry_group_id'] = df['industry_group_id'].astype('Int64')
    df['industry_id'] = df['industry_id'].astype('Int64')
    df['subindustry_id'] = df['subindustry_id'].astype('Int64')

    df.reset_index(drop=True, inplace=True)
    df.index += 1

    return df

df_standards = wrangling("./data/standards.csv")

In [213]:
sector = df_standards[['sector_id', 'sector_name']] \
        .drop_duplicates() \
        .rename(columns={
            'sector_id': 'gics',
            'sector_name': 'name'
        })

In [214]:
param_dicts = sector.to_dict('records')
param_dicts[:5]

[{'gics': 10, 'name': 'Energy'},
 {'gics': 15, 'name': 'Materials'},
 {'gics': 20, 'name': 'Industrials'},
 {'gics': 25, 'name': 'Consumer Discretionary'},
 {'gics': 30, 'name': 'Consumer Staples'}]

In [215]:
execute_query_with_params("MERGE (:Sector{gics: $gics, name: $name})", *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc4e950>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc4e0d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc4d4d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcc4cad0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bcd8be90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x335229190>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x33519c310>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x33519e590>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x33519e950>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Industry Node

In [216]:
industry = df_standards[['subindustry_id', 'subindustry_name', 'primary_activity']] \
           .drop_duplicates() \
           .rename(columns={
               'subindustry_id': 'gics',
               'subindustry_name': 'name',
               'primary_activity': 'description'
            })

In [217]:
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

In [218]:
industry_desc_embed = EMBEDDING_MODEL.encode(industry['description'].to_numpy())
industry['embedding'] = list(map(list, industry_desc_embed))

In [219]:
param_dicts = industry.to_dict('records')

In [220]:
execute_query_with_params("MERGE (:Industry{gics: $gics, name: $name, description: $description, embedding: $embedding})", *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3c2c90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3c2290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3c1790>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3c0d10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3c02d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3f7690>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3f6b50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3f5e10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd3f5050>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Industry PART_OF Sector Relationship

In [221]:
industry_sector = df_standards[['subindustry_id', 'sector_id']] \
                  .drop_duplicates() \
                  .rename(columns={
                      'subindustry_id': 'industry_gics',
                      'sector_id': 'sector_gics'
                  })

In [222]:
param_dicts = industry_sector.to_dict('records')
param_dicts[:5]

[{'industry_gics': 10101010, 'sector_gics': 10},
 {'industry_gics': 10101020, 'sector_gics': 10},
 {'industry_gics': 10102010, 'sector_gics': 10},
 {'industry_gics': 10102020, 'sector_gics': 10},
 {'industry_gics': 10102030, 'sector_gics': 10}]

In [223]:
execute_query_with_params('''
MATCH
    (i:Industry{gics: $industry_gics}),
    (s:Sector{gics: $sector_gics})
MERGE (i)-[:PART_OF]->(s)''', *param_dicts)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1ecc10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd164ed0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1640d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd10ef10>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd10de90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1fc310>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1fddd0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1ff850>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3bd1cb2d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

## Adding Company Data

### Import JSON files

In this part, import all the json files, `nasdaq_kg_schema.json`, `nasdaq_kg_schema_rank_1-10.json` and `nasdaq_kg_schema_rank_11-32.json`.

In [224]:
json_files = ['nasdaq_kg_schema.json', 'nasdaq_kg_schema_rank_1-10.json', 'nasdaq_kg_schema_rank_11-32.json']

def merge_json_files(json_list: list[str]) -> dict:

    merged_json = {
        "nodes": {},
        "relationships": {}
    }

    for file_path in json_list:
        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                
                if "nodes" in data:
                    for node_type, nodes in data["nodes"].items():
                        if node_type not in merged_json["nodes"]:
                            merged_json["nodes"][node_type] = []
                        merged_json["nodes"][node_type].extend(nodes)

               
                if "relationships" in data:
                    for relationship_type, relationships in data["relationships"].items():
                        if relationship_type not in merged_json["relationships"]:
                            merged_json["relationships"][relationship_type] = []
                        merged_json["relationships"][relationship_type].extend(relationships)

            except json.JSONDecodeError as e:
                print(f"Error decoding {file_path}: {e}")
                
            except FileNotFoundError:
                print("Error: The file was not found.")

    return merged_json

merged_json = merge_json_files(json_files)

### Data Validation

* This part is only for company nodes, where we want to ensure data integrity by checking the primary key ticker_code and ensuring other fields like company name adhere to specific formatting rules and constraints. 
* Validating ticker_code as a valid string consisting of 4 to 5 uppercase letters, is for accurate indexing in financial markets.
* The process identifies duplicates and keeps the more-info version of an entry over the other duplicates. 
* Standardizing company names into title case and eliminating special symbols maintains consistency in representation and usability.
* All data entry whose ticker_code and company_name does not meet these criteria will be removed to maintain a clean dataset for the reporting of company information.

In [225]:
def is_valid_ticker(ticker_code):
    """Helper function to check if the ticker code is valid (str, 4 to 5 letters, all upper case)."""
    return isinstance(ticker_code, str) and 4 <= len(ticker_code) <= 5 and ticker_code.isupper()

def remove_invalid_ticker_companies(data):
    """Remove companies whose ticker_code doesn't meet the 3, 4, or 5 letter criteria."""
    if isinstance(data, str):
        print("Warning: data is a string, attempting to load as JSON.")
        data = json.loads(data)  
    
    if "nodes" in data and "Company" in data["nodes"]:
        companies = data["nodes"]["Company"]
        filtered_companies = [company for company in companies if is_valid_ticker(company.get("ticker_code"))]
        data["nodes"]["Company"] = filtered_companies
    else:
        print("Warning: The expected structure is not found in the data.")

    return data

def is_more_comprehensive(entry1, entry2):
    """Helper function to determine which duplicate has more comprehensive details."""
    return sum(1 for v in entry1.values() if v) > sum(1 for v in entry2.values() if v)

def remove_duplicates(data):
    seen_tickers = {}
    for company in data["nodes"]["Company"]:   
        ticker = company.get("ticker_code", "")
        if ticker in seen_tickers:
            if is_more_comprehensive(company, seen_tickers[ticker]):
                seen_tickers[ticker] = company
        else:
            seen_tickers[ticker] = company 
    data["nodes"]["Company"] = list(seen_tickers.values())
    return data

def standarize_case(data):
    """function to standardize title case and no other special."""
    for company in data["nodes"]["Company"]:
        company["name"] = company["name"].title() 
        company["name"] = re.sub(r'[^a-zA-Z0-9\s&.-]', '',company["name"]) 
    return data

In [226]:
def company_validate(data): 
    data = remove_invalid_ticker_companies(data)
    # data = remove_duplicates(data)
    # data = standarize_case(data)
    print("Completed.")
    return data

validated_data = company_validate(merged_json)

Completed.


### Adding Company Nodes

In [227]:
companies = validated_data['nodes']['Company']
for company in companies:
    company['founded_year'] = company['founded_year'] or ""
companies[:5]

[{'name': 'Apple Inc.', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'AirPods', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': ''},
 {'name': 'Major League Soccer (MLS)',
  'ticker_code': 'MLFB',
  'founded_year': ''}]

In [228]:
execute_query_with_params('''
MERGE (c:Company {ticker: $ticker_code})
SET c.names = 
    CASE
        WHEN c.names IS NULL THEN [$name]
        WHEN NOT $name IN c.names THEN c.names + $name
        ELSE c.names
    END,
    c.founded_year = $founded_year''', *companies)

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b6eb3e90>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3359eb6d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7659e50>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b7659750>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b76589d0>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b765b210>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b765b950>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b766c290>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b766d110>, keys=[]),
 EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b

### Adding Product Nodes

In [153]:
# products = validated_data['nodes']['Product']
# products[:5]

In [154]:
# execute_query_with_params("MERGE (p:Product {name: $product_name})", *products)

### Company-Industry relationship

One of the key relationships defined in the data is "IS_INVOLVED_IN", which helps in modeling how companies are categorized based on their primary activities and the sectors they contribute to.


"IS_INVOLVED_IN": [
            {
                "company_name": "McKinsey &#38; Company",
                "industry_name": "management consulting"}]

In [229]:
company_industries = validated_data['relationships']['IS_INVOLVED_IN']
company_industries[:5]

[{'company_name': 'Apple', 'industry_name': 'accounting'},
 {'company_name': 'Apple', 'industry_name': 'technology'},
 {'company_name': 'Apple', 'industry_name': 'information technology'},
 {'company_name': 'Brightstar Corporation',
  'industry_name': 'wireless device services'},
 {'company_name': 'Applied Materials, Inc.',
  'industry_name': 'semiconductor equipment'}]

In [230]:
for company_industry in company_industries:
    industry_name = company_industry['industry_name']
    company_industry['embedding'] = EMBEDDING_MODEL.encode(industry_name).tolist()

In [231]:
query_results = execute_query_with_params("""
CALL db.index.fulltext.queryNodes('company_names_index', $company_name)
    YIELD node AS c, score AS company_score
CALL db.index.vector.queryNodes('industry_description_index', 10, $embedding)
    YIELD node AS i, score AS industry_score
WHERE company_score > 1
AND industry_score > 0.7
MERGE (c)-[r:IS_INVOLVED_IN]->(i)
SET r.company_score = company_score,
    r.industry_score = industry_score
RETURN c.ticker, i.name, company_score, industry_score""", *company_industries)

In [232]:
query_results

[EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3b6fe1210>, keys=['c.ticker', 'i.name', 'company_score', 'industry_score']),
 EagerResult(records=[<Record c.ticker='AAPL' i.name='Technology Distributors' company_score=2.149622917175293 industry_score=0.703384280204773>, <Record c.ticker='AAPL' i.name='Communications Equipment' company_score=2.149622917175293 industry_score=0.7008560299873352>], summary=<neo4j._work.summary.ResultSummary object at 0x3b6f66c10>, keys=['c.ticker', 'i.name', 'company_score', 'industry_score']),
 EagerResult(records=[<Record c.ticker='AAPL' i.name='Systems Software' company_score=2.149622917175293 industry_score=0.736291766166687>, <Record c.ticker='AAPL' i.name='Health Care Technology' company_score=2.149622917175293 industry_score=0.7337765693664551>, <Record c.ticker='AAPL' i.name='Data Processing & Outsourced Services' company_score=2.149622917175293 industry_score=0.7209891080856323>], summary=<neo4j._work.summary.Result