In [1]:
import dotenv
import os
from neo4j import GraphDatabase
import pandas as pd
import urllib.request, json 
from operator import itemgetter
import numpy as np

#### API call to load data

The data used in this analysis comes from [bautheac/GICS](https://github.com/bautheac/GICS), which packages the Global Industry Classification Standard (GICS) dataset for consumption in R. The GICS hierarchy begins with 11 sectors, which are subdivided into 24 industry groups, 68 industries, and 157 sub-industries.


In [23]:
import requests
import pyreadr
import pandas as pd

# Pinned to a specific commit so the downloaded dataset is reproducible.
url = 'https://github.com/bautheac/GICS/raw/0c2b0e4c0ca56a0e520301fd978fc095ed4fc328/data/standards.rda'
rda_file_path = './data/standards.rda'

# Fail fast on a hung connection or an HTTP error; without the status check an
# error page would be written to disk and only fail later inside pyreadr.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The data directory may not exist on a fresh checkout.
os.makedirs(os.path.dirname(rda_file_path), exist_ok=True)
with open(rda_file_path, 'wb') as file:
    file.write(response.content)

# Load the .rda file using pyreadr; the temporary file is removed even if
# parsing fails, so a broken download is never left behind.
try:
    result = pyreadr.read_r(rda_file_path)
finally:
    os.remove(rda_file_path)

print(result.keys())

# The archive holds a single object; take the first (and only) data frame.
df = next(iter(result.values()))

# Save the DataFrame as a CSV file
df.to_csv('./data/standards.csv', index=False)

print("Data has been saved as standards.csv")


odict_keys(['standards'])
Data has been saved as standards.csv


### Neo4j AuraDB Connection

In [2]:
# Read the connection settings from the credentials file; stop early if it
# could not be loaded, because every later cell depends on URI and AUTH.
load_status = dotenv.load_dotenv("Neo4j-fccfe306-Created-2024-09-24.txt")
if not load_status:
    raise RuntimeError('Environment variables not loaded.')

URI = os.getenv("NEO4J_URI")
AUTH = tuple(os.getenv(key) for key in ("NEO4J_USERNAME", "NEO4J_PASSWORD"))

# Smoke test: open a short-lived driver and check that the server is reachable.
# A failure is reported but does not stop the notebook.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("Connection established.")
except Exception as e:
    print(f"Error connecting to Neo4j: {e}")



Connection established.


### Data Wrangling

In [72]:
# Company table: combine the two source files into one row per ticker.


def _clean_names(frame):
    """Return `frame` with title-cased, whitespace-stripped company names."""
    return frame.assign(company_name=frame['company_name'].str.title().str.strip())


# Both sources are brought to the same schema: company_name, ticker, year.
df1 = _clean_names(
    pd.read_csv(
        'data/ECM_Datasets.csv',
        usecols=['Company', 'Ticker', 'Year of 10-K '],
        sep=',',
    ).rename(columns={'Company': 'company_name', 'Ticker': 'ticker', 'Year of 10-K ': 'year'})
)
df2 = _clean_names(
    pd.read_csv(
        'data/NASDAQ_10-K_URLs.csv',
        usecols=['Company Name', 'ticker', 'Latest Filing Year'],
        sep=',',
    ).rename(columns={'Company Name': 'company_name', 'Latest Filing Year': 'year'})
)

all_df = [df1, df2]

merged_df = (
    pd.concat(all_df, ignore_index=True, axis=0)
    .astype({'year': 'Int64'})
    # Newest filing year first; rows with a missing year sort last, so
    # keep='first' retains the most recent dated row for each ticker.
    .sort_values(by='year', ascending=False)
    .drop_duplicates(subset='ticker', keep='first')
    # Final presentation order: company names ascending.
    .sort_values(by='company_name', ascending=True)
    .reset_index(drop=True)
)
# Number the rows from 1 instead of 0.
merged_df.index += 1

merged_df.head(10)



Unnamed: 0,company_name,ticker,year
1,Adobe Inc,ADBE,2024
2,Advanced Micro Devices Inc,AMD,2024
3,Alphabet Inc,GOOGL,2024
4,Alphabet Inc.,GOOG,2024
5,Amazon.Com Inc,AMZN,2024
6,Amgen Inc,AMGN,2024
7,Analog Devices Inc,ADI,2023
8,Apple Inc,AAPL,2023
9,Applied Materials Inc /De,AMAT,2023
10,Automatic Data Processing Inc,ADP,2024


In [3]:
# data wrangling for industry

def wrangling(csv_path):
    """Load the industry classification CSV and return a cleaned table.

    Rows containing any missing value and exact duplicate rows are removed,
    the space-separated column headers are renamed to snake_case
    (``description`` becomes ``primary_activity``), and the four id columns
    are cast to the nullable ``Int64`` dtype.

    Parameters
    ----------
    csv_path : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The cleaned table, indexed from 1.
    """
    snake_case_names = {
        'sector id': 'sector_id',
        'sector name': 'sector_name',
        'industry group id': 'industry_group_id',
        'industry group name': 'industry_group_name',
        'industry id': 'industry_id',
        'industry name': 'industry_name',
        'subindustry id': 'subindustry_id',
        'subindustry name': 'subindustry_name',
        'description': 'primary_activity'
    }
    id_columns = ('sector_id', 'industry_group_id', 'industry_id', 'subindustry_id')

    cleaned = (
        pd.read_csv(csv_path)
        .dropna()
        .drop_duplicates()
        .rename(columns=snake_case_names)
        .astype({column: 'Int64' for column in id_columns})
        .reset_index(drop=True)
    )
    # Number the rows from 1 instead of 0.
    cleaned.index = cleaned.index + 1

    return cleaned

# Clean the classification table stored in standards.csv and preview the first rows.
industry = wrangling("./data/standards.csv")
industry.head()


Unnamed: 0,sector_id,sector_name,industry_group_id,industry_group_name,industry_id,industry_name,subindustry_id,subindustry_name,primary_activity
1,10,Energy,1010,Energy,101010,Energy Equipment & Services,10101010,Oil & Gas Drilling,Drilling contractors or owners of drilling rig...
2,10,Energy,1010,Energy,101010,Energy Equipment & Services,10101020,Oil & Gas Equipment & Services,"Manufacturers of equipment, including drilling..."
3,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102010,Integrated Oil & Gas,Integrated oil companies engaged in the explor...
4,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102020,Oil & Gas Exploration & Production,Companies engaged in the exploration and produ...
5,10,Energy,1010,Energy,101020,"Oil, Gas & Consumable Fuels",10102030,Oil & Gas Refining & Marketing,Companies engaged in the refining and marketin...


In [4]:
# Check column dtypes and non-null counts of the cleaned industry table.
industry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 1 to 157
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   sector_id            157 non-null    Int64 
 1   sector_name          157 non-null    object
 2   industry_group_id    157 non-null    Int64 
 3   industry_group_name  157 non-null    object
 4   industry_id          157 non-null    Int64 
 5   industry_name        157 non-null    object
 6   subindustry_id       157 non-null    Int64 
 7   subindustry_name     157 non-null    object
 8   primary_activity     157 non-null    object
dtypes: Int64(4), object(5)
memory usage: 11.8+ KB


#### Adding Industry Node

Industry sectors represent large sections of the economy and include multiple companies.

In [27]:
def add_industry_sector_constraints(tx) -> None:
    """Create the lookup indexes on ``Industry`` nodes if they do not exist yet.

    One index is created per property, and each index is named after the
    property it covers. The group index targets ``industry_group_name``,
    which is the property actually written on ``Industry`` nodes; it
    previously targeted ``industry_group``, a property that is never set, so
    the index covered nothing.

    Note: an ``industry_group`` index left over from an earlier run is not
    touched here; it can be removed with ``DROP INDEX industry_group IF EXISTS``.

    Parameters
    ----------
    tx : transaction-like object exposing ``run(query)``.
    """
    indexed_properties = (
        "industry_id",
        "industry_name",
        "industry_group_name",
        "subindustry_name",
        "primary_activity",
    )
    for prop in indexed_properties:
        # IF NOT EXISTS keeps the cell safe to re-run.
        tx.run(f"CREATE INDEX {prop} IF NOT EXISTS FOR (i:Industry) ON (i.{prop})")
    

In [28]:
def add_industries(tx, industry: pd.DataFrame) -> None:
    """Merge one ``Industry`` node for every row of ``industry``.

    Each row is turned into a parameter dict (one entry per column) and
    passed to a single MERGE statement. The statement matches on all five
    listed properties together, so re-running it with the same rows does not
    create additional nodes.

    Parameters
    ----------
    tx : transaction-like object exposing ``run(query, **params)``.
    industry : table whose columns include the five properties named in the
        MERGE statement.
    """
    merge_industry = """
        MERGE (:Industry {
            industry_id: $industry_id,
            industry_name: $industry_name,
            industry_group_name: $industry_group_name,
            subindustry_name: $subindustry_name,
            primary_activity: $primary_activity
        })
        """
    # Collect every row's parameters first, then issue the statements in row order.
    row_parameters = [record.to_dict() for _, record in industry.iterrows()]
    for parameters in row_parameters:
        tx.run(merge_industry, **parameters)

In [30]:
# One driver and one session for the whole load: indexes first, then the nodes.
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    session.execute_write(add_industry_sector_constraints)
    session.execute_write(add_industries, industry=industry)

#### Web scraping for industry-company relationship

This is the partial data table that is extracted from the D&B Hoovers portal
which can be accessed here: https://app-hoovers-dnb-com.libproxy1.nus.edu.sg/

However, D&B Hoovers assigns industry classifications based on the Standard Industrial Classification (SIC) system or its own internal classifications, which may differ from the global GICS standard created by MSCI and Standard & Poor's (S&P).

In [1]:
## Only 20 records could be downloaded from the portal, even though 40 of the companies have records there
import pandas as pd

df = pd.read_csv('data/company_industry.csv')

# Columns kept from the export: company identity, size, and industry codes.
columns_to_extract = [
    'Company Name',
    'Ticker',
    'Country/Region',
    'Sales (USD)',
    'D&B Hoovers Industry',
    'ISIC Rev 4 Code',
    'ISIC Rev 4 Description',
]

# Select the columns and renumber the rows from 1 instead of 0.
extracted_df = df.loc[:, columns_to_extract].reset_index(drop=True)
extracted_df.index += 1

print(extracted_df)


                      Company Name Ticker Country/Region   Sales (USD)  \
1                       Apple Inc.   AAPL  United States  3.832850e+11   
2            Microsoft Corporation   MSFT  United States  2.451220e+11   
3                 Amazon.com, Inc.   AMZN  United States  5.747850e+11   
4     Costco Wholesale Corporation   COST  United States  2.422900e+11   
5                      Tesla, Inc.   TSLA  United States  9.677300e+10   
6                    Alphabet Inc.  GOOGL  United States  3.073940e+11   
7                    PepsiCo, Inc.    PEP  United States  8.639200e+10   
8                T-Mobile US, Inc.   TMUS  United States  7.855800e+10   
9            QUALCOMM Incorporated   QCOM  United States  3.582000e+10   
10  Texas Instruments Incorporated    TXN  United States  1.751900e+10   
11                      Amgen Inc.   AMGN  United States  2.819000e+10   
12         Applied Materials, Inc.   AMAT  United States  2.651700e+10   
13    Honeywell International Inc.    

#### Adding Company IS INVOLVED IN Industry Relationship

In [2]:
def add_company_industry(tx, company_industry: pd.DataFrame) -> None:
    """Link each company to its industry with an ``INVOLVED_IN`` relationship.

    For every row, the ``Company`` node (matched on name and ticker
    together), the ``Industry`` node (matched on ISIC code, ISIC description
    and D&B Hoovers industry together) and the relationship between them are
    merged, so re-running with the same rows adds nothing new.

    Parameters
    ----------
    tx : transaction-like object exposing ``run(query, **params)``.
    company_industry : table with the columns ``Company Name``, ``Ticker``,
        ``ISIC Rev 4 Code``, ``ISIC Rev 4 Description`` and
        ``D&B Hoovers Industry``.
    """
    # NOTE(review): Company is merged on name AND ticker; if Company nodes are
    # created elsewhere with a differently formatted name, this will add a
    # second node for the same ticker - confirm against the rest of the notebook.
    merge_statement = """
    MERGE (c:Company {name: $company_name, ticker: $ticker})
    MERGE (i:Industry {isic_rev4_code: $isic_rev4_code, isic_rev4_description: $isic_rev4_description, dnb_hoovers_industry: $dnb_hoovers_industry})
    MERGE (c)-[:INVOLVED_IN]->(i)
    """

    # Cypher parameter name -> source column of the table.
    column_for_parameter = {
        'company_name': 'Company Name',
        'ticker': 'Ticker',
        'isic_rev4_code': 'ISIC Rev 4 Code',
        'isic_rev4_description': 'ISIC Rev 4 Description',
        'dnb_hoovers_industry': 'D&B Hoovers Industry',
    }

    for _, record in company_industry.iterrows():
        parameters = {
            parameter: record[column]
            for parameter, column in column_for_parameter.items()
        }
        tx.run(merge_statement, **parameters)



In [85]:
# Write the company-to-industry relationships in a single managed write transaction.
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    session.execute_write(add_company_industry, company_industry=extracted_df)