# 2. **Knowledge Graph**

In [75]:
import pandas as pd
from py2neo import Graph, Node, Relationship
from tqdm import tqdm

In [76]:
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [77]:
stocks_complete = pd.read_csv("../data/stocks_completed.csv")
rolling_corr_matrices = pd.read_csv("../data/rolling_corr_matrices.csv")
sector_means = pd.read_csv("../data/sector_means.csv")
sector_volatility = pd.read_csv("../data/sector_volatility.csv")
sector_returns = pd.read_csv("../data/sector_returns.csv")
rolling_sector_corr = pd.read_csv("../data/rolling_sector_corr.csv")

In [78]:
graph.run("MATCH (n) DETACH DELETE n")
print("Cleared the Neo4j database.")

Cleared the Neo4j database.


In [79]:
stocks_complete.columns

Index(['Ticker', 'Name', 'Sector', 'marketCap', 'trailingPE', 'forwardPE',
       'priceToBook', 'trailingEps', 'forwardEps', 'bookValue', 'payoutRatio',
       'beta', 'fiveYearAvgDividendYield', '52WeekChange', 'averageVolume',
       'enterpriseToRevenue', 'profitMargins', 'Close', 'Volume', 'NormClose',
       'DailyLogReturn', 'ALR1M', 'ALR3M', 'Volatility', 'RSI', 'MACD',
       'HighLowRange'],
      dtype='object')

In [80]:
sectors = stocks_complete["Sector"].unique()
for sector in tqdm(sectors, desc="Adding Sector Nodes"):
    sector_node = Node("Sector", name=sector)
    graph.merge(sector_node, "Sector", "name")
    
sector_nodes_count = graph.run("MATCH (s:Sector) RETURN COUNT(s) AS count").evaluate()
print(f"Total Sector Nodes: {sector_nodes_count}")

sector_nodes = graph.run("MATCH (s:Sector) RETURN s.name LIMIT 5").data()
print("Sample Sector Nodes:", sector_nodes)

Adding Sector Nodes: 100%|██████████| 11/11 [00:00<00:00, 34.75it/s]

Total Sector Nodes: 11
Sample Sector Nodes: [{'s.name': 'Information Technology'}, {'s.name': 'Health Care'}, {'s.name': 'Financials'}, {'s.name': 'Real Estate'}, {'s.name': 'Consumer Discretionary'}]





In [81]:
for _, row in tqdm(stocks_complete.iterrows(), desc="Adding Stock Nodes"):
    stock_node = Node(
        "Stock",
        name=row["Ticker"],
        sector=row["Sector"],
        marketCap=row["marketCap"],
        trailingPE=row["trailingPE"],
        forwardPE=row["forwardPE"],
        priceToBook=row["priceToBook"],
        trailingEps=row["trailingEps"],
        forwardEps=row["forwardEps"],
        bookValue=row["bookValue"],
        payoutRatio=row["payoutRatio"],
        beta=row["beta"],
        fiveYearAvgDividendYield=row["fiveYearAvgDividendYield"],
        weekChange=row["52WeekChange"],
        averageVolume=row["averageVolume"],
        enterpriseToRevenue=row["enterpriseToRevenue"],
        profitMargins=row["profitMargins"]
    )
    sector_node = graph.nodes.match("Sector", name=row["Sector"]).first()
    graph.merge(stock_node, "Stock", "name")
    if sector_node:
        belongs_to = Relationship(stock_node, "BELONGS_TO", sector_node)
        graph.merge(belongs_to)

Adding Stock Nodes: 119500it [03:13, 617.01it/s]


In [82]:
rolling_corr_matrices.columns

Index(['TimeIndex', 'Ticker1', 'Ticker2', 'Correlation'], dtype='object')

In [83]:
rolling_corr_matrices = rolling_corr_matrices[rolling_corr_matrices["Correlation"] > 0.8]
stock_nodes = {node["name"]: node for node in graph.nodes.match("Stock")}
relationships = []
for _, row in tqdm(rolling_corr_matrices.iterrows(), desc="Preparing Correlation Relationships"):
    stock1 = stock_nodes.get(row["Ticker1"])
    stock2 = stock_nodes.get(row["Ticker2"])
    if stock1 and stock2:
        relationships.append((stock1, stock2, row["Correlation"]))

# Batch Insert Relationships
for stock1, stock2, correlation in tqdm(relationships, desc="Adding Correlation Relationships"):
    relates_to = Relationship(stock1, "CORRELATES_WITH", stock2, correlation=correlation)
    graph.merge(relates_to)

print("Graph database successfully updated.")


Preparing Correlation Relationships: 410752it [00:09, 45097.42it/s]
Adding Correlation Relationships: 100%|██████████| 410752/410752 [55:29<00:00, 123.37it/s]   

Graph database successfully updated.





In [84]:
for _, row in tqdm(rolling_sector_corr.iterrows(), desc="Adding Sector Correlation Relationships"):
    sector1 = graph.nodes.match("Sector", name=row["Sector1"]).first()
    sector2 = graph.nodes.match("Sector", name=row["Sector2"]).first()
    if sector1 and sector2:
        correlates_with = Relationship(sector1, "CORRELATES_WITH", sector2, correlation=row["Correlation"])
        graph.merge(correlates_with)

print("Graph creation complete!")

Adding Sector Correlation Relationships: 126060it [19:45, 106.37it/s]

Graph creation complete!



