In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase

# Raw Kaggle data set: 271,680 rows x 8 columns, covering 102 stocks
nasdaq100 = pd.read_csv(
    "/user/projects/project-3-techChanakya/data/NASDAQ_100_Data_From_2010.csv",
    sep="\t",
)

# Analysis window: May through September 2021 (both bounds inclusive)
start_date = '2021-05-01'
end_date = '2021-09-30'

# Boolean mask selecting the rows whose Date lies inside the window
in_window = (nasdaq100['Date'] >= start_date) & (nasdaq100['Date'] <= end_date)
sub_nasdaq100 = nasdaq100[in_window]
print(sub_nasdaq100.shape)

# Filtered subset: 9,384 rows x 8 columns, still 102 stocks
print(sub_nasdaq100['Name'].nunique())

In [2]:
# Load the subset from its own CSV file; this rebinds sub_nasdaq100 to the file's contents.
# NOTE(review): nothing visible in this notebook writes sub_nasdaq100.csv - confirm where it is produced.
sub_nasdaq100 = pd.read_csv("/user/projects/project-3-techChanakya/data/sub_nasdaq100.csv")

In [2]:
# Neo4j connection parameters.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited in the notebook; the fallbacks are the original values.
uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")

# Neo4j connection
driver = GraphDatabase.driver(uri=uri, auth=(user, password))

# Sessions are opened with driver.session() where queries are run. No database
# name is passed, so the default database is used; pass database="neo4j" to
# driver.session() only if a specific database must be targeted.

# my_neo4j_wipe_out_database() - since community edition can only have 1 database "neo4j", this function will wipe out all the nodes and relationships

In [None]:
def my_neo4j_wipe_out_database():
    """
    Wipe out the database by deleting all nodes and relationships.

    Uses the module-level ``driver``. A single ``DETACH DELETE`` statement
    removes every node together with its relationships, so the wipe is done
    by one statement instead of two independent ones.
    """

    with driver.session() as session:
        # consume() waits for the statement to complete so that a failure is
        # raised here rather than being left on an unread result.
        session.run("MATCH (n) DETACH DELETE n").consume()
        
def insert_stock_data(tx, stock, date, close, volume):
    """
    Insert (or update) one trading day of one stock in Neo4j.

    The Stock node is merged on its name. The StockTradingDay node is merged
    through the stock's TRADING_DAY relationship and keyed on the date only,
    so a trading day always belongs to exactly one stock and running the
    load again updates close/volume instead of creating a duplicate day.

    Parameters:
        tx: transaction object exposing ``run(query, **params)``.
        stock: stock name stored as ``Stock.name``.
        date: value passed to Cypher ``date()``.
        close: value passed to Cypher ``toFloat()``.
        volume: value passed to Cypher ``toInteger()``.
    """
    tx.run("""
        MERGE (s:Stock {name: $stock})
        MERGE (s)-[:TRADING_DAY]->(t:StockTradingDay {date: date($date)})
        SET t.close = toFloat($close), t.volume = toInteger($volume)
    """, stock=stock, date=date, close=close, volume=volume)

def link_trading_days(tx, stock):
    """
    Chain the StockTradingDay nodes of one stock in ascending date order.

    The days reached from the stock via TRADING_DAY are collected into an
    ordered list, and each consecutive pair is joined with a NEXT_DAY
    relationship using plain Cypher (collect + UNWIND over list indexes).
    """
    chain_days_query = """
        MATCH(s:Stock {name: $stock})-[:TRADING_DAY]->(day:StockTradingDay)
        WITH s, day
        ORDER by day.date ASC
        WITH s, collect(day) AS days
        UNWIND range(0, size(days)-2) AS i
        WITH days[i] AS current_day, days[i+1] AS next_day
        MERGE (current_day)-[:NEXT_DAY]->(next_day)
    """
    tx.run(chain_days_query, stock=stock)

# Prep the dataframe: order rows by stock name, then by date.
# The sorted frame is rebound to the same name instead of sorting with inplace=True.
sub_nasdaq100 = sub_nasdaq100.sort_values(by=['Name', 'Date'])

try:
    # Wipe out current Neo4j database
    my_neo4j_wipe_out_database()

    with driver.session() as session:
        # Insert stock and trading day nodes, one write transaction per row
        for _, row in sub_nasdaq100.iterrows():
            session.execute_write(
                insert_stock_data,
                row['Name'],
                row['Date'],
                row['Close'],
                row['Volume'],
            )

        # Link trading day nodes per stock
        stocks = sub_nasdaq100['Name'].unique()
        for stock in stocks:
            session.execute_write(link_trading_days, stock)
finally:
    # Close the driver even when the wipe or one of the writes above raises
    driver.close()

# Query to retrieve the linked list for a given stock

// Match the AAPL Stock node and each StockTradingDay it points to via TRADING_DAY
MATCH (s:Stock {name:'AAPL'})-[:TRADING_DAY]->(start:StockTradingDay)

// For each such day, bind the one-hop NEXT_DAY path to the StockTradingDay it links to
MATCH path = (start)-[:NEXT_DAY]->(end:StockTradingDay)

// Return the stock together with every one-hop path found
RETURN s, path