In [84]:
import dotenv
import os
from neo4j import GraphDatabase
import pandas as pd
import json 
from operator import itemgetter
import numpy as np
import re
import glob

### Neo4j AuraDB Connection

In [103]:
load_status = dotenv.load_dotenv("Neo4j-fccfe306-Created-2024-09-24.txt")
if load_status is False:
    raise RuntimeError('Environment variables not loaded.')

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")

Connection established.


### Import Json file 
In this part, import all the json files, `nasdaq_kg_schema.json`, `nasdaq_kg_schema_rank_1-10.json` and `nasdaq_kg_schema_rank_11-32.json`.

In [267]:
json_files = glob.glob('**/*.json', recursive=True)
print(json_files)

def merge_json_files(json_list):

    merged_json = {
        "nodes": {},
        "relationships": {}
    }

    for file_path in json_list:
        with open(file_path, 'r') as file:
            try:
                data = json.load(file)

                
                if "nodes" in data:
                    for node_type, nodes in data["nodes"].items():
                        if node_type not in merged_json["nodes"]:
                            merged_json["nodes"][node_type] = []
                        merged_json["nodes"][node_type].extend(nodes)

               
                if "relationships" in data:
                    for relationship_type, relationships in data["relationships"].items():
                        if relationship_type not in merged_json["relationships"]:
                            merged_json["relationships"][relationship_type] = []
                        merged_json["relationships"][relationship_type].extend(relationships)

            except json.JSONDecodeError as e:
                print(f"Error decoding {file_path}: {e}")
                
            except FileNotFoundError:
                print("Error: The file was not found.")

    # Write the merged data to a JSON file
    return merged_json

merged_json = merge_json_files(json_files)

['merged_output.json', 'nasdaq_kg_schema.json', 'nasdaq_kg_schema_rank_1-10.json', 'nasdaq_kg_schema_rank_11-32.json']


In [268]:
def extract_nodes_from_json(data):
    """Extracts nodes from the specified JSON file and returns them in a structured format."""
    extracted_nodes = {
        'Company': set(),
        'Product': set(),
        'Country': set(),
        'Industry': set()
    }

    for company in data['nodes'].get('Company', []):
        extracted_nodes['Company'].add(company['name'])
    
    for product in data['nodes'].get('Product', []):
        extracted_nodes['Product'].add(product['name'])
    
    for country in data['nodes'].get('Country', []):
        extracted_nodes['Country'].add(country['name'])
    
    for industry in data['nodes'].get('Industry', []):
        extracted_nodes['Industry'].add(industry['name'])

    extracted_nodes = {key: list(value) for key, value in extracted_nodes.items()}
    
    data = {"nodes": extracted_nodes}
    return data

nodes_json = extract_nodes_from_json(merged_json)
nodes_json = json.dumps(merged_json, indent=4) 
print(nodes_json)



{
    "nodes": {
        "Company": [
            {
                "name": "Apple Inc.",
                "ticker_code": "AAPL",
                "founded_year": null
            },
            {
                "name": "Apple",
                "ticker_code": "AAPL",
                "founded_year": null
            },
            {
                "name": "App Store",
                "ticker_code": "id328412701",
                "founded_year": null
            },
            {
                "name": "Apple Arcade",
                "ticker_code": "apple-launches-game-subscription-apple-arcade-181647109--finance.html",
                "founded_year": null
            },
            {
                "name": "AirPods",
                "ticker_code": "AAPL",
                "founded_year": null
            },
            {
                "name": "Apple",
                "ticker_code": "AAPL",
                "founded_year": null
            },
            {
                "name": "MLS S

### Data Wrangling 

* This part is only for company nodes, where we want to ensure data integrity by checking the primary key ticker_code and ensuring other fields like company name adhere to specific formatting rules and constraints. 
* Validating ticker_code as a valid string consisting of 4 to 5 uppercase letters, is for accurate indexing in financial markets.
* The process identifies duplicates and keeps the more-info version of an entry over the other duplicates. 
* Standardizing company names into title case and eliminating special symbols maintains consistency in representation and usability.
* All data entry whose ticker_code and company_name does not meet these criteria will be removed to maintain a clean dataset for the reporting of company information.

In [269]:
def is_valid_ticker(ticker_code):
    """Helper function to check if the ticker code is valid (str, 4 to 5 letters, all upper case)."""
    return isinstance(ticker_code, str) and 4 <= len(ticker_code) <= 5 and ticker_code.isupper()

def remove_invalid_ticker_companies(data):
    """Remove companies whose ticker_code doesn't meet the 3, 4, or 5 letter criteria."""
    if isinstance(data, str):
        print("Warning: data is a string, attempting to load as JSON.")
        data = json.loads(data)  
    
    if "nodes" in data and "Company" in data["nodes"]:
        companies = data["nodes"]["Company"]
        filtered_companies = [company for company in companies if is_valid_ticker(company.get("ticker_code"))]
        data["nodes"]["Company"] = filtered_companies
    else:
        print("Warning: The expected structure is not found in the data.")

    return data

def is_more_comprehensive(entry1, entry2):
    """Helper function to determine which duplicate has more comprehensive details."""
    return sum(1 for v in entry1.values() if v) > sum(1 for v in entry2.values() if v)

def remove_duplicates(data):
    seen_tickers = {}
    for company in data["nodes"]["Company"]:   
        ticker = company.get("ticker_code", "")
        if ticker in seen_tickers:
            if is_more_comprehensive(company, seen_tickers[ticker]):
                seen_tickers[ticker] = company
        else:
            seen_tickers[ticker] = company 
    data["nodes"]["Company"] = list(seen_tickers.values())
    return data

def standarize_case(data):
    """function to standardize title case and no other special."""
    for company in data["nodes"]["Company"]:
        company["name"] = company["name"].title() 
        company["name"] = re.sub(r'[^a-zA-Z0-9\s&.-]', '',company["name"]) 
    return data

# Example jsondata to test with
input_data = {
    "nodes": {
        "Company": [
            {"name": "Apple Inc.", "ticker_code": "AAPL", "founded_year": None},
            {"name": "APPLIED MATERIALS INC /DE", "ticker_code": "AMAT", "founded_year": None},
            {"name": "APP STORE", "ticker_code": "id328412701", "founded_year": None}, 
            {"name": "Apple Arcade", "ticker_code": "apple-launches-game-subscription-apple-arcade-181647109--finance.html", "founded_year": None},  # Invalid
            {"name": "AirPods", "ticker_code": ".&nbsp", "founded_year": None}, 
            {"name": "COMCAST CORP", "ticker_code": "CMCSA", "founded_year": 1986},
            {"name": "Key Financial","ticker_code": "sg.finance.yahoo.com","founded_year": None},
            {"name": "Apple Inc.", "ticker_code": "AAPL", "founded_year": 1976},
            {"name": "Executive Officers","ticker_code": "about","founded_year": None},
        ]
    }
}
# remove_invalid_ticker_companies(input_data)
# remove_duplicates(input_data)
# standarize_case(input_data)

In [270]:
def wrangling(data): 
    data = remove_invalid_ticker_companies(data)
    data = remove_duplicates(data)
    data = standarize_case(data)
    print("Completed.")
    return data

wrangling(merged_json)


Completed.


{'nodes': {'Company': [{'name': 'Apple Inc.',
    'ticker_code': 'AAPL',
    'founded_year': None},
   {'name': 'Major League Soccer Mls',
    'ticker_code': 'MLFB',
    'founded_year': None},
   {'name': 'Adobe Inc.', 'ticker_code': 'ADBE', 'founded_year': None},
   {'name': 'Maxim Integrated Products',
    'ticker_code': 'MXIM',
    'founded_year': None},
   {'name': 'Applied Materials Inc De',
    'ticker_code': 'AMAT',
    'founded_year': None},
   {'name': 'Vantage Systems', 'ticker_code': 'VTAGY', 'founded_year': None},
   {'name': 'Microsoft', 'ticker_code': 'MSFT', 'founded_year': None},
   {'name': 'Coolrunner', 'ticker_code': 'CLCO', 'founded_year': None},
   {'name': 'Amazon Com Inc', 'ticker_code': 'AMZN', 'founded_year': None},
   {'name': 'Amgen Inc', 'ticker_code': 'AMGN', 'founded_year': None},
   {'name': 'Alphabet Inc.', 'ticker_code': 'GOOG', 'founded_year': None},
   {'name': 'Audio Magic', 'ticker_code': 'MGIC', 'founded_year': None},
   {'name': 'Gemini', 'ticker_

Here we use B24S and modify the scraping script to create a mapping dictionary which maps the ISO-alpha2 Code to the corresponding M49 Code. After scraping the data, we fill in the missing m49 values for each region in the regions dictionary based on the name field.

In [229]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://unstats.un.org/unsd/methodology/m49/overview/"

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table')

rows = table.find_all('tr')

data = []

for row in rows[1:]: 
    cols = row.find_all('td')
    if len(cols) >= 13:  
        entry = {
            "Global Code": cols[0].text.strip(),
            "Global Name": cols[1].text.strip(),
            "Region Code": cols[2].text.strip(),
            "Region Name": cols[3].text.strip(),
            "Sub-region Code": cols[4].text.strip(),
            "Sub-region Name": cols[5].text.strip(),
            "Intermediate Region Code": cols[6].text.strip(),
            "Intermediate Region Name": cols[7].text.strip(),
            "Country or Area": cols[8].text.strip(),
            "M49 Code": cols[9].text.strip(),
            "ISO-alpha2 Code": cols[10].text.strip(),
            "ISO-alpha3 Code": cols[11].text.strip()
        }
        data.append(entry)

df = pd.DataFrame(data)

print(df["Region Name"].unique())




['Africa' 'Americas' '' 'Asia' 'Europe' 'Oceania']


In [255]:
def rename_m49_to_region_code(regions):
    for region in regions['nodes']['Region']:
        region_value = region.get('m49')
        region['region_code'] = region_value
        del region['m49']
    return regions

# Example jsondata to test with
regions = { 
    "nodes": {
        "Region": [
            {"name": "AF", "m49": None}, 
            {"name": "AN", "m49": None},
            {"name": "AS", "m49": None},
            {"name": "EU", "m49": None},
            {"name": "NA", "m49": None},
            {"name": "OC", "m49": None},
            {"name": "SA", "m49": None}
        ]
    }
}

regions = rename_m49_to_region_code(regions)

# Self create a dictionary mapping of region abbreviations to their full names
region_map = {
    "AF": "Africa",
    "AN": "Antarctica",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "Americas",
    "OC": "Oceania",
    "SA": "Americas"
}
def mapping(region_map,regions):
    region_code_mapping = {entry['Region Name']: entry['Region Code'] for entry in data if entry['Region Name'] and entry['Region Code']}

    for region in regions['nodes']['Region']:
        full_region_name = region_map.get(region['name'])

        region['region_code'] = region_code_mapping.get(full_region_name,"") 

    return regions

mapping(region_map,regions)


{'nodes': {'Region': [{'name': 'AF', 'region_code': '002'},
   {'name': 'AN', 'region_code': ''},
   {'name': 'AS', 'region_code': '142'},
   {'name': 'EU', 'region_code': '150'},
   {'name': 'NA', 'region_code': '019'},
   {'name': 'OC', 'region_code': '009'},
   {'name': 'SA', 'region_code': '019'}]}}

In [271]:
def wrangling(regions):
    """
    Cleans, wrangles, and removes duplicates from the Region data.
    - Rename record m49 to region_code
    - Add values to region code by mapping to data
    - Delete record if both name and region code is null or none
    - Ensures consistent formatting for string fields.
    - Removes duplicates based on 'name' and 'm49'.
    """

    data = rename_m49_to_region_code(regions)
    data = mapping(region_map,data)

    for region in data['nodes']['Region']:
        cleaned_name = region['name'].strip() if region['name'] else ""
        cleaned_region_code = region['region_code'] if region['region_code'] is not None else ""

        key = (cleaned_name, cleaned_region_code)
        data[key] = {"name": cleaned_name,"region_code": cleaned_region_code}

    return list(data.values())

wrangling(merged_json)

[{'Company': [{'name': 'Apple Inc.',
    'ticker_code': 'AAPL',
    'founded_year': None},
   {'name': 'Major League Soccer Mls',
    'ticker_code': 'MLFB',
    'founded_year': None},
   {'name': 'Adobe Inc.', 'ticker_code': 'ADBE', 'founded_year': None},
   {'name': 'Maxim Integrated Products',
    'ticker_code': 'MXIM',
    'founded_year': None},
   {'name': 'Applied Materials Inc De',
    'ticker_code': 'AMAT',
    'founded_year': None},
   {'name': 'Vantage Systems', 'ticker_code': 'VTAGY', 'founded_year': None},
   {'name': 'Microsoft', 'ticker_code': 'MSFT', 'founded_year': None},
   {'name': 'Coolrunner', 'ticker_code': 'CLCO', 'founded_year': None},
   {'name': 'Amazon Com Inc', 'ticker_code': 'AMZN', 'founded_year': None},
   {'name': 'Amgen Inc', 'ticker_code': 'AMGN', 'founded_year': None},
   {'name': 'Alphabet Inc.', 'ticker_code': 'GOOG', 'founded_year': None},
   {'name': 'Audio Magic', 'ticker_code': 'MGIC', 'founded_year': None},
   {'name': 'Gemini', 'ticker_code': 'G

### Populating Neo4j Nodes{} with Company, Country, Region, Industry, Product from JSON

* The unique constraint on ticker_code helps maintain data integrity by ensuring that each company is uniquely identifiable by its ticker code and allows for efficient index querying.
* A ticker code being null has no semantic meaning in the context of companies so we enforce not null constraint such that every company should have a valid ticker for identification purposes in the financial market.

In [354]:
def create_constraints(tx):
    queries = [
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Company) REQUIRE c.ticker_code IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Company) REQUIRE c.name IS NOT NULL",

        #"CREATE CONSTRAINT IF NOT EXISTS FOR (i:Industry) REQUIRE i.SIC_code IS UNIQUE", 
        "CREATE CONSTRAINT IF NOT EXISTS FOR (i:Industry) REQUIRE i.name IS UNIQUE",
        #"CREATE CONSTRAINT IF NOT EXISTS FOR (i:Industry) REQUIRE i.SIC_code IS NOT NULL",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (i:Industry) REQUIRE i.name IS UNIQUE",
        #"CREATE CONSTRAINT IF NOT EXISTS FOR (i:Industry) REQUIRE i.industry_group IS NOT NULL",
        
        "CREATE CONSTRAINT IF NOT EXISTS FOR (r:Region) REQUIRE r.region_code IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (r:Region) REQUIRE r.region_code IS NOT NULL",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (r:Region) REQUIRE r.name IS NOT NULL",
        
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Country) REQUIRE c.iso3 IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Country) REQUIRE c.iso3 IS NOT NULL",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Country) REQUIRE c.iso2 IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Country) REQUIRE c.name IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Country) REQUIRE c.name IS NOT NULL"
    ]

    for query in queries:
        tx.run(query)
        
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.execute_write(create_constraints)


While CREATE CONSTRAINT enforce uniqueness of properties, MERGE helps in avoiding duplicates during node creation and matching.

In [277]:
def add_company_nodes(tx, json_data):
    query = """
    MERGE (c:Company {name: $company_name, ticker: $ticker_code, founded_year: $founded_year})
    """
    
    companies = json_data['nodes']['Company'] 

    for company in companies:
        founded_year = company['founded_year'] if company['founded_year'] is not None else "" 
        tx.run(query, 
               company_name=company['name'], 
               ticker_code=company['ticker_code'], 
               founded_year=founded_year)


In [278]:
def add_country_nodes(tx, json_data):
    query = """
    MERGE (c:Country {iso3: $iso3,iso2: $iso2})  // Match by unique iso3 code
    ON CREATE SET 
      c.name = $country_name,
      c.source_city = $source_city,
      c.iso2 = $iso2,
      c.population = $population,
      c.gdp = $gdp,
      c.corporate_tax_rate = $corporate_tax_rate
    ON MATCH SET 
      c.population = $population,  // Optionally update on match
      c.gdp = $gdp,
      c.corporate_tax_rate = $corporate_tax_rate
    """
    
    countries = json_data['nodes']['Country']

    for country in countries:
        iso2 = country.get('iso2', "")
        iso3 = country.get('iso3', "")
        source_city = country.get('source_city', "")
        country_name = "" if country.get('name') == "Not Found" else country.get('name', "")

        
        tx.run(query, 
               source_city=source_city,
               country_name=country_name,
               iso2=iso2,
               iso3=iso3,
               population=country.get('population', 0),
               gdp=country.get('gdp', 0),
               corporate_tax_rate=country.get('corporate_tax_rate', 0)
              )


In [310]:
def add_region_nodes(tx, json_data):
    # Define the Cypher query using UNWIND to handle multiple regions
    query = """
    UNWIND $regions AS map
    MERGE (n:Region {region_code: map.region_code})
    ON CREATE SET n.name = map.name
    """

    regions = json_data['nodes']['Region']  

    tx.run(query, regions=regions)



In [350]:
def add_industry_nodes(tx, json_data):
    query = """
        MERGE (i:Industry {name: $industry_name})
        ON CREATE SET 
            i.SIC_code = $SIC_code,
            i.industry_group = $industry_group,
            i.subindustry_desc = $subindustry_desc,
            i.primary_activity = $primary_activity
        ON MATCH SET 
            i.industry_group = $industry_group,
            i.subindustry_desc = $subindustry_desc,
            i.primary_activity = $primary_activity
    """
    
    industries = json_data['nodes']['Industry']
    
    for industry in industries:
        properties = {
            "industry_name": industry["name"],
            "SIC_code": industry['SIC_code'] if industry['SIC_code'] is not None else "",
            "industry_group": industry['industry_group'] if industry['industry_group'] is not None else "",
            "subindustry_desc": industry['subindustry_desc'] if industry['subindustry_desc'] is not None else "",
            "primary_activity": industry['primary_activity'] if industry['primary_activity'] is not None else ""
        }
        
        try:
            tx.run(query, **properties)
        except Exception as e:
            print(f"Error adding industry {industry['name']}: {e}")


In [160]:
def add_product_nodes(tx, json_data):
    query = """
    MERGE (p:Product {name: $product_name})
    """
    
    products = json_data['nodes']['Product'] 

    for product in products:
        tx.run(query, 
               product_name=product['name'])


In [352]:
# Create the nodes using the Neo4j connection
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.execute_write(add_company_nodes, merged_json)  
        session.execute_write(add_country_nodes, merged_json) 
        session.execute_write(add_region_nodes, merged_json) 
        session.execute_write(add_industry_nodes, merged_json) 
        session.execute_write(add_product_nodes, merged_json) 

#### Sample Creating vector indexes with BERT (will modify to other methods)

In [358]:
#!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # A pre-trained BERT model

def get_text_embedding(text):
    return model.encode(text).tolist()

# Example usage
company_name_embedding = get_text_embedding("Apple Inc.")


In [359]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_data = [[129, 3841, 20]]  # Population, GDP, corporate tax rate
scaled_data = scaler.fit_transform(numerical_data)


In [400]:
from neo4j import GraphDatabase

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

driver = GraphDatabase.driver(URI, auth=AUTH)

def add_vectors(tx, node_label, name, vector, ticker_code=None):
    if node_label == "Company":
        query = """
        MERGE (n:{label} {{ticker_code: $ticker_code, name: $name}})
        SET n.vector = $vector
        """.format(label=node_label)
        tx.run(query, ticker_code=ticker_code, name=name, vector=vector)
    else:
        query = """
        MERGE (n:{label} {{name: $name}})
        SET n.vector = $vector
        """.format(label=node_label)
        tx.run(query, name=name, vector=vector)

# Example for company and country nodes
with driver.session() as session:
    company_name = "Apple Inc."
    company_vector = get_text_embedding(company_name)
    country_name = "Jamaica"
    country_vector = get_text_embedding(country_name)

    session.execute_write(add_vectors, "Company", company_name, company_vector, "AAPL")
    session.execute_write(add_vectors, "Country", country_name, country_vector)



In [393]:
def add_index_vector(tx):
    query = """
    CREATE INDEX IF NOT EXISTS FOR (c:Company)
    ON (c.vector);
    """

    tx.run(query)

with driver.session() as session:
    session.execute_write(add_index_vector)

driver.close()

### Populating Neo4j with Relationships from JSON
#### Company-Industry relationship
One of the key relationships defined in the data is "IS_INVOLVED_IN", which helps in modeling how companies are categorized based on their primary activities and the sectors they contribute to.


"IS_INVOLVED_IN": [
            {
                "company_name": "McKinsey &#38; Company",
                "industry_name": "management consulting"}]


In [397]:
def add_company_industry_relationship(tx, json_data):
    query = """
    MERGE (c:Company {name: $company_name})
    MERGE (i:Industry {name: $industry_name})
    MERGE (c)-[:IS_INVOLVED_IN]->(i)
    """
    involved_in = json_data['relationships']['IS_INVOLVED_IN']

    for relationship in involved_in:
        tx.run(query, company_name=relationship['company_name'], industry_name=relationship['industry_name'])


In [399]:
with driver.session() as session:
    session.execute_write(add_company_industry_relationship, merged_json)

driver.close()

  with driver.session() as session:


### Data Validation Testing

We also consider cross-referencing with other data sources or using validation rules to ensure data accuracy. (will elaborate more later)

In [138]:
def smoke_test(data):
    """Check if data is empty."""
    if not data:
        print("Smoke Test Failed: Data is empty.")
        return False
    print("Smoke Test Passed.")
    return True

def range_check_country(data):
    """Check for valid population and GDP values in countries."""
    for country in data['nodes'].get('Country', []):
        if country.get('population', 0) < 0 or country.get('gdp', 0) < 0:
            print(f"Range Check Failed: Invalid population or GDP in Country: {country}")
            return False
    print("Range Check Passed for Country.")
    return True

def anomaly_detection_relationship(data):
    """Identify extreme or outlier net sales in relationships."""
    for relationship in data['relationships'].get('OPERATES_IN_COUNTRY', []):
        if relationship.get('net sales') is not None and relationship['net sales'] < 0:
            print(f"Anomaly Detection Failed: Negative net sales in relationship: {relationship}")
            return False
    print("Anomaly Detection Passed for Relationships.")
    return True

def validate_relationship(data, relationship_type, required_fields):
    """Validate the relationships between companies and industries."""
    for relationship in data['relationships'].get(relationship_type, []):
        for field in required_fields:
            if relationship.get(field) is None:
                print(f"Relationship Validation Failed: Missing '{field}' in {relationship_type}.")
                return False
    print(f"Relationship Validation Passed for {relationship_type}.")
    return True

def run_tests(data):
    """Run all validation tests."""
    # Smoke Test
    if not smoke_test(data):
        return

    # Range Check for country fields
    range_check_country(data)

    # Anomaly Detection for net sales in relationships
    anomaly_detection_relationship(data)

    # Relationship Validation
    validate_relationship(data, 'PARTNERS_WITH', ['company_name_1', 'company_name_2'])
    validate_relationship(data, 'COMPETES_WITH', ['company_name_1', 'company_name_2'])
    validate_relationship(data, 'IS_INVOLVED_IN', ['company_name', 'industry_name'])

    print("All tests executed.")
run_tests(merged_json)


Smoke Test Passed.
Range Check Passed for Country.
Anomaly Detection Failed: Negative net sales in relationship: {'company_name': 'Apple', 'country_name': 'Ireland', 'net sales': -24931076, 'headcount': 1665}
Relationship Validation Passed for PARTNERS_WITH.
Relationship Validation Passed for COMPETES_WITH.
Relationship Validation Passed for IS_INVOLVED_IN.
All tests executed.


### Data Validation: Invalid Format Specifier

In [79]:
def validate_company_data(companies, fields_to_check):
    """ Regular expression to check if the name contains special characters """
    name_pattern = r"^[A-Za-z0-9\s.,&]+$"
    ticker_pattern = r"^[A-Z]{1,5}(\.[A-Z]{1,3})?$"
    invalid_companies = []
    invalid_count = 0 
    
    for company in companies:
        for field in fields_to_check:
            field_value = company.get(field)

            if field_value is None:
                continue

            if field == 'name' and not re.match(name_pattern, field_value):
                invalid_companies.append(company)
                print(f"Invalid name: '{field_value}' in company: {company}")
                invalid_count += 1  

            if field == 'ticker_code' and ('.html' in field_value or not re.match(ticker_pattern, field_value)):
                invalid_companies.append(company)
                print(f"Invalid ticker code: '{field_value}' in company: {company}")
                invalid_count += 1 

    return invalid_companies, invalid_count  


def validate_country_data(countries, fields_to_check):
    """Checking invalid format for iso2 and iso3"""
    iso2_pattern = r"^[A-Z]{2}$"
    iso3_pattern = r"^[A-Z]{3}$"
    invalid_countries = []
    invalid_count = 0  

    for country in countries:
        for field in fields_to_check:
            field_value = country.get(field)

            if field_value is None:
                continue

            if field == 'iso2' and not re.match(iso2_pattern, field_value):
                invalid_countries.append(country)
                print(f"'{field}' is invalid for country: {country}")
                invalid_count += 1  

            if field == 'iso3' and not re.match(iso3_pattern, field_value):
                invalid_countries.append(country)
                print(f"'{field}' is invalid for country: {country}")
                invalid_count += 1 

    return invalid_countries, invalid_count 
fields_to_check_company = list(merged_json["nodes"]["Company"][0].keys())
fields_to_check_country = list(merged_json["nodes"]["Country"][0].keys())

invalid_companies, invalid_company_count = validate_company_data(merged_json['nodes']['Company'], fields_to_check_company)
invalid_countries, invalid_country_count = validate_country_data(merged_json['nodes']['Country'], fields_to_check_country)

print(f"Total invalid companies: {invalid_company_count}")
print(f"Total invalid countries: {invalid_country_count}")



Total invalid companies: 0
Total invalid countries: 0


### Data Validation: Missing Mandatory Fields

In [81]:
def check_empty_fields(data, node_type):
    node_null_count = 0
    nodes = data['nodes'].get(node_type, [])
    fields_to_check = list(nodes[0].keys()) if nodes else []
    
    for node in nodes:
        for field in fields_to_check:
            if not node.get(field):
                node_null_count += 1
                print(f"'{field}' is empty for {node_type}: {node}") 
    return node_null_count 

individual_null_counts = {}
nodes_to_check = list(merged_json["nodes"].keys())

for node in nodes_to_check:
    null_count = check_empty_fields(merged_json, node)
    individual_null_counts[node] = null_count  

for entity, count in individual_null_counts.items():
    print(f"Total null fields for {entity}: {count}")

'founded_year' is empty for Company: {'name': 'Apple Inc.', 'ticker_code': 'AAPL', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Major League Soccer Mls', 'ticker_code': 'MLFB', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Adobe Inc.', 'ticker_code': 'ADBE', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Maxim Integrated Products', 'ticker_code': 'MXIM', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Applied Materials Inc De', 'ticker_code': 'AMAT', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Vantage Systems', 'ticker_code': 'VTAGY', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Microsoft', 'ticker_code': 'MSFT', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Coolrunner', 'ticker_code': 'CLCO', 'founded_year': None}
'founded_year' is empty for Company: {'name': 'Amazon Com Inc', 'ticker_code': 'AMZN', 'founded_year': None}
'fou

### Bonus part (Incomplete Data Augmentation)

In this first part, API call to load data
The data used in this analysis comes from bautheac/GICS packages the Global Industry Classification Standards (GICS) dataset for consumption in R. The GICS hierarchy begins with 11 sectors and is followed by 24 industry groups, 68 industries, and 157 sub-industries.

Also incorporated the data from `data/ECM_Datasets.csv` and `NASDAQ_10-K_URLs.csv`, in addition to the data extracted from D&B Hoovers database. By integrating new, relevant data from external databases or sources into the existing KG, we aim to enhance existing nodes with more detailed attributes, descriptions, or references.

(Not yet for incorporating into KG) The field names/properties to the nodes have NOT been properly standardized yet.

In [66]:
import requests
import pyreadr
import pandas as pd

url = 'https://github.com/bautheac/GICS/raw/0c2b0e4c0ca56a0e520301fd978fc095ed4fc328/data/standards.rda'
response = requests.get(url)

rda_file_path = './data/standards.rda'
with open(rda_file_path, 'wb') as file:
    file.write(response.content)

# Load the .rda file using pyreadr
result = pyreadr.read_r(rda_file_path)

print(result.keys())  

df = result[list(result.keys())[0]]  

# Save the DataFrame as a CSV file and remove the rda file
# df.to_csv('./data/standards.csv', index=False)

os.remove(rda_file_path)

#print("Data has been saved as standards.csv")

odict_keys(['standards'])


### Data Wrangling

In [72]:
# data wrangling for company
df1 = pd.read_csv('data/ECM_Datasets.csv',usecols = ['Company','Ticker','Year of 10-K '],sep=',')

df2 = pd.read_csv('data/NASDAQ_10-K_URLs.csv',usecols = ['Company Name','ticker','Latest Filing Year'],sep=',')


df1 = df1.rename(columns={
            'Company': 'company_name',
            'Ticker': 'ticker',
            'Year of 10-K ': 'year'  
        })
df2 = df2.rename(columns={
            'Company Name': 'company_name',
            'ticker': 'ticker',
            'Latest Filing Year': 'year'
        })
df1['company_name'] = df1['company_name'].str.title().str.strip()
df2['company_name'] = df2['company_name'].str.title().str.strip()

all_df = []
all_df.append(df1)
all_df.append(df2)

merged_df = pd.concat(all_df, ignore_index=True,axis=0)
merged_df["year"] = merged_df["year"].astype('Int64')

merged_df = merged_df.sort_values(by='year', ascending=False)
# duplicates with null value removed first
merged_df = merged_df.drop_duplicates(subset='ticker', keep='first')

# order by company names ASC
merged_df = merged_df.sort_values(by='company_name',ascending=True)

merged_df.reset_index(drop=True, inplace=True)
merged_df.index += 1

merged_df.head(10)



Unnamed: 0,company_name,ticker,year
1,Adobe Inc,ADBE,2024
2,Advanced Micro Devices Inc,AMD,2024
3,Alphabet Inc,GOOGL,2024
4,Alphabet Inc.,GOOG,2024
5,Amazon.Com Inc,AMZN,2024
6,Amgen Inc,AMGN,2024
7,Analog Devices Inc,ADI,2023
8,Apple Inc,AAPL,2023
9,Applied Materials Inc /De,AMAT,2023
10,Automatic Data Processing Inc,ADP,2024


In [86]:
# data wrangling for industry

def wrangling(csv_path):
    df = pd.read_csv(csv_path)
    
    df = df.dropna()

    df = df.drop_duplicates()
    
    df = df.rename(columns={
        'sector id': 'sector_id',
        'sector name': 'sector_name',
        'industry group id': 'industry_group_id',
        'industry group name': 'industry_group_name',
        'industry id': 'industry_id',
        'industry name': 'industry_name',
        'subindustry id': 'subindustry_id',
        'subindustry name': 'subindustry_name',
        'description': 'primary_activity'
    })

    
    df['sector_id'] = df['sector_id'].astype('Int64')  
    df['industry_group_id'] = df['industry_group_id'].astype('Int64')
    df['industry_id'] = df['industry_id'].astype('Int64')
    df['subindustry_id'] = df['subindustry_id'].astype('Int64')

    df.reset_index(drop=True, inplace=True)
    df.index += 1

    return df

industry = wrangling("./data/standards.csv")
industry.head()


Unnamed: 0,sector_id,sector_name,industry_group_id,industry_group_name,industry_id,industry_name,subindustry_id,subindustry_name,primary_activity
1,10,Energy,1010,Energy,101010,Energy Equipment & Services,10101010,Oil & Gas Drilling,Drilling contractors or owners of drilling rig...
2,10,Energy,1010,Energy,101010,Energy Equipment & Services,10101020,Oil & Gas Equipment & Services,"Manufacturers of equipment, including drilling..."
3,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102010,Integrated Oil & Gas,Integrated oil companies engaged in the explor...
4,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102020,Oil & Gas Exploration & Production,Companies engaged in the exploration and produ...
5,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102030,Oil & Gas Refining & Marketing,Companies engaged in the refining and marketin...


#### Additional data for Industry Node

Industry sectors represent large sections of the economy and include multiple companies.

In [27]:
def add_industry_sector_constraints(tx) -> None:
    tx.run("CREATE INDEX industry_id IF NOT EXISTS FOR (i:Industry) ON (i.industry_id)")
    tx.run("CREATE INDEX industry_name IF NOT EXISTS FOR (i:Industry) ON (i.industry_name)")
    tx.run("CREATE INDEX industry_group IF NOT EXISTS FOR (i:Industry) ON (i.industry_group)")
    tx.run("CREATE INDEX subindustry_name IF NOT EXISTS FOR (i:Industry) ON (i.subindustry_name)")
    tx.run("CREATE INDEX primary_activity IF NOT EXISTS FOR (i:Industry) ON (i.primary_activity)")
    

In [28]:
def add_industries(tx, industry: pd.DataFrame) -> None:
    queries = []
    for _, row in industry.iterrows():
        query ="""
        MERGE (:Industry {
            industry_id: $industry_id,
            industry_name: $industry_name,
            industry_group_name: $industry_group_name,
            subindustry_name: $subindustry_name,
            primary_activity: $primary_activity
        })
        """
        queries.append((query, row.to_dict()))
    for query, params in queries:
        tx.run(query, **params)

In [30]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.execute_write(add_industry_sector_constraints) 
        session.execute_write(add_industries, industry=industry)

#### Additional data for industry-company relationship

This is the partial data table that is extracted from the D&B Hoovers portal
which can be accessed here: https://app-hoovers-dnb-com.libproxy1.nus.edu.sg/

D&B Hoovers assigns industry classifications based on the Standard Industrial Classification (SIC) system or its internal classifications.

In [1]:
## only 20 records can be downloaded from, even though 40 of them have the records
import pandas as pd

df = pd.read_csv('data/company_industry.csv')

# List of columns to extract
columns_to_extract = [
    'Company Name', 
    'Ticker', 
    'Country/Region', 
    'Sales (USD)', 
    'D&B Hoovers Industry', 
    'ISIC Rev 4 Code', 
    'ISIC Rev 4 Description'
]
extracted_df = df[columns_to_extract]

extracted_df.reset_index(drop=True, inplace=True)
extracted_df.index += 1

print(extracted_df)


                      Company Name Ticker Country/Region   Sales (USD)  \
1                       Apple Inc.   AAPL  United States  3.832850e+11   
2            Microsoft Corporation   MSFT  United States  2.451220e+11   
3                 Amazon.com, Inc.   AMZN  United States  5.747850e+11   
4     Costco Wholesale Corporation   COST  United States  2.422900e+11   
5                      Tesla, Inc.   TSLA  United States  9.677300e+10   
6                    Alphabet Inc.  GOOGL  United States  3.073940e+11   
7                    PepsiCo, Inc.    PEP  United States  8.639200e+10   
8                T-Mobile US, Inc.   TMUS  United States  7.855800e+10   
9            QUALCOMM Incorporated   QCOM  United States  3.582000e+10   
10  Texas Instruments Incorporated    TXN  United States  1.751900e+10   
11                      Amgen Inc.   AMGN  United States  2.819000e+10   
12         Applied Materials, Inc.   AMAT  United States  2.651700e+10   
13    Honeywell International Inc.    

#### Adding Company IS INVOLVED IN Industry Relationship

In [2]:
def add_company_industry(tx, company_industry: pd.DataFrame) -> None:
    query = """
    MERGE (c:Company {name: $company_name, ticker: $ticker})
    MERGE (i:Industry {isic_rev4_code: $isic_rev4_code, isic_rev4_description: $isic_rev4_description, dnb_hoovers_industry: $dnb_hoovers_industry})
    MERGE (c)-[:INVOLVED_IN]->(i)
    """
    
    for _, row in company_industry.iterrows():
        tx.run(query, 
               company_name=row['Company Name'], 
               ticker=row['Ticker'], 
               isic_rev4_code=row['ISIC Rev 4 Code'], 
               isic_rev4_description=row['ISIC Rev 4 Description'], 
               dnb_hoovers_industry=row['D&B Hoovers Industry'])



In [85]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.execute_write(add_company_industry, company_industry=extracted_df)