In [1]:
import os

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Read sub_nasdaq100.csv from the project data folder into a DataFrame.
sub_nasdaq100 = pd.read_csv(
    filepath_or_buffer="/user/projects/project-3-techChanakya/data/sub_nasdaq100.csv"
)

In [3]:
# Neo4j connection parameters.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the fallbacks are the values
# this notebook was originally written with.
uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")

# Neo4j connection
driver = GraphDatabase.driver(uri=uri, auth=(user, password))

# Queries are run through sessions opened from this driver. Naming the database
# explicitly is optional here because the default database is used:
#     session = driver.session(database="neo4j")

In [5]:
# Jaccard Similarity

# Step 1: Identify volume buckets to assign a volume label to stock trading days

def create_volume_categories(tx, medium_threshold=1_000_000, high_threshold=10_000_000,
                             high_max=100_000_000):
    """
    Create the VolumeCategory nodes and attach every StockTradingDay node to one of them,
    based on the volume traded on that day.

    Categories (with the default thresholds):
        HighVolume   : volume > 10,000,000             (min=10,000,001, max=100,000,000)
        MediumVolume : 1,000,000 < volume <= 10,000,000 (min=1,000,001,  max=10,000,000)
        LowVolume    : everything else                  (min=0,          max=1,000,000)

    The min/max values are stored as properties on the VolumeCategory node. They are
    descriptive only: HighVolume has no enforced upper bound, so a day with a volume
    above ``high_max`` is still linked to HighVolume.

    NOTE: a trading day whose ``volume`` is null fails both comparisons in Cypher and
    therefore falls through to LowVolume.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        medium_threshold: Volumes strictly above this value are at least MediumVolume.
        high_threshold: Volumes strictly above this value are HighVolume.
        high_max: Value stored as the ``max`` property of the HighVolume node.

    Raises:
        ValueError: If the thresholds are not ordered as
            0 <= medium_threshold < high_threshold < high_max.
    """
    if not 0 <= medium_threshold < high_threshold < high_max:
        raise ValueError(
            "thresholds must satisfy 0 <= medium_threshold < high_threshold < high_max"
        )

    # Thresholds are passed as Cypher parameters rather than formatted into the text.
    # MERGE keeps the statement idempotent: re-running it reuses existing nodes/relationships.
    tx.run(
        """
        MATCH (t:StockTradingDay)
        WITH t,
            CASE
                WHEN t.volume > $high_threshold
                    THEN ['HighVolume', $high_threshold + 1, $high_max]
                WHEN t.volume > $medium_threshold
                    THEN ['MediumVolume', $medium_threshold + 1, $high_threshold]
                ELSE ['LowVolume', 0, $medium_threshold]
            END AS volumeCategoryInfo
        MERGE (v:VolumeCategory {name: volumeCategoryInfo[0]})
        SET v.min = volumeCategoryInfo[1], v.max = volumeCategoryInfo[2]
        MERGE (t)-[:IN_VOLUME_CATEGORY]->(v)
        """,
        medium_threshold=medium_threshold,
        high_threshold=high_threshold,
        high_max=high_max,
    )

# Step 2: Link stocks to volume category
def link_stocks_to_volume_categories(tx):
    """
    Link each Stock node to every VolumeCategory that any of its trading days falls in.

    Follows Stock -[:TRADING_DAY]-> StockTradingDay -[:IN_VOLUME_CATEGORY]-> VolumeCategory
    and MERGEs a HAS_VOLUME_CATEGORY relationship from the stock to the category. A stock
    can therefore end up linked to more than one category.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    # DISTINCT collapses the per-trading-day rows so MERGE runs once per
    # (stock, category) pair instead of once per trading day; the resulting
    # relationships are the same.
    tx.run("""
        MATCH (s:Stock)-[:TRADING_DAY]->(:StockTradingDay)-[:IN_VOLUME_CATEGORY]->(v:VolumeCategory)
        WITH DISTINCT s, v
        MERGE (s)-[:HAS_VOLUME_CATEGORY]->(v)
    """)

# Step 3: Compute Jaccard similarity
def compute_jaccard_similarity_with_labels(tx, graph_name='stock-similarity'):
    """
    Rebuild the GDS graph projection and stream node similarity between stocks.

    Steps, all run on ``tx``:
        1. Drop the projection named ``graph_name`` if it already exists.
        2. Project Stock and VolumeCategory nodes with HAS_VOLUME_CATEGORY
           relationships (undirected) under ``graph_name``.
        3. Stream gds.nodeSimilarity over the projection and, for every pair,
           collect the VolumeCategory nodes both stocks are linked to.

    The projection is left in the graph catalog after the call.

    Args:
        tx: Transaction-like object whose ``run(query, **parameters)`` returns a
            result exposing ``consume()`` and ``data()``.
        graph_name: Name of the in-memory projection to (re)create.

    Returns:
        list[dict]: One dict per row with the keys ``stock1``, ``stock2``,
        ``similarity`` and ``shared_volume_categories`` (a list of
        ``{name, min, max}`` maps), ordered by similarity descending.
    """
    # Drop graph projection if it exists. failIfMissing=false makes the call a
    # no-op (empty result) when there is nothing to drop.
    # consume() drains the result so a failure surfaces here, not on a later query.
    tx.run("""
        CALL gds.graph.drop($graph_name, false) YIELD graphName
        RETURN graphName
    """, graph_name=graph_name).consume()

    # Create new projection
    tx.run("""
        CALL gds.graph.project(
            $graph_name,
            ['Stock', 'VolumeCategory'],
            {
                HAS_VOLUME_CATEGORY: {
                    orientation: 'UNDIRECTED'
                }
            }
        )
    """, graph_name=graph_name).consume()

    # Run node similarity. The MATCH keeps only pairs where both nodes have an
    # outgoing HAS_VOLUME_CATEGORY relationship to a common VolumeCategory.
    result = tx.run("""
        CALL gds.nodeSimilarity.stream($graph_name)
        YIELD node1, node2, similarity
        WITH gds.util.asNode(node1) AS stock1, gds.util.asNode(node2) AS stock2, similarity
        MATCH (stock1)-[:HAS_VOLUME_CATEGORY]->(v:VolumeCategory)<-[:HAS_VOLUME_CATEGORY]-(stock2)
        WITH stock1.name AS stock1, stock2.name AS stock2, similarity,
            collect(DISTINCT {
                name: v.name,
                min: v.min,
                max: v.max
            }) AS shared_volume_categories
        RETURN stock1, stock2, similarity, shared_volume_categories
        ORDER BY similarity DESC
    """, graph_name=graph_name)

    # Materialise inside the transaction; the result cannot be read after it closes.
    return result.data()

In [6]:
# Calculate Jaccard similarity

# try/finally guarantees the driver is closed even if one of the steps raises.
try:
    with driver.session() as session:
        # Create and link volume categories
        session.execute_write(create_volume_categories)
        session.execute_write(link_stocks_to_volume_categories)

        # Compute similarity and receive the table
        similarity_results = session.execute_read(compute_jaccard_similarity_with_labels)
finally:
    driver.close()

# Convert the returned data into a DataFrame. The columns are listed explicitly
# (same order as the query's RETURN clause) so an empty result still produces a
# CSV with a header row.
df_similarity = pd.DataFrame(
    similarity_results,
    columns=["stock1", "stock2", "similarity", "shared_volume_categories"],
)

# Output the data into a CSV file
df_similarity.to_csv("/user/projects/project-3-techChanakya/data/jaccard_similarity.csv", index=False)