In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

### Set Up Data

In [2]:
# Location of the raw price history. The file is parsed as tab-separated
# (sep="\t") even though its name ends in .csv.
nasdaq100_path = "/user/projects/project-3-techChanakya/data/NASDAQ_100_Data_From_2010.csv"
nasdaq100 = pd.read_csv(nasdaq100_path, sep="\t")

In [3]:
# Analysis window; both bounds are inclusive
start_date = '2021-05-01'
end_date = '2021-09-30'

# NOTE(review): assumes 'Date' holds ISO-formatted (YYYY-MM-DD) values so that
# comparing against these string bounds follows chronological order - confirm.
in_window = nasdaq100['Date'].between(start_date, end_date)
sub_nasdaq100 = nasdaq100[in_window]

#### Set Up Pearson Correlation Metrics

In [4]:
# Reshape to wide form: one row per 'Date', one column per 'Name',
# cell values taken from 'Close'
pivot_data = sub_nasdaq100.pivot(columns='Name', values='Close', index='Date')

# Standardize to prevent large ranges between stocks (outliers) from dominating
scaler = StandardScaler()
scaler.fit(pivot_data)
normalized_data = scaler.transform(pivot_data)

# np.corrcoef treats each row as one variable, so transpose to put one
# stock per row before computing the stock-by-stock correlation matrix
stocks_as_rows = normalized_data.T
correlation_matrix = np.corrcoef(stocks_as_rows)

In [5]:
# Label the correlation matrix with the pivot's column names on both axes
# and print it
tickers = list(pivot_data.columns)

correlation_df = pd.DataFrame(
    data=correlation_matrix,
    columns=tickers,
    index=tickers,
)

print(correlation_df)

          AAPL      ADBE       ADI       ADP      ADSK       AEP      ALGN  \
AAPL  1.000000  0.959071  0.483433  0.830890  0.719896  0.654451  0.875747   
ADBE  0.959071  1.000000  0.626680  0.824658  0.726294  0.553116  0.895322   
ADI   0.483433  0.626680  1.000000  0.615201  0.539853  0.109034  0.574187   
ADP   0.830890  0.824658  0.615201  1.000000  0.893719  0.681567  0.826015   
ADSK  0.719896  0.726294  0.539853  0.893719  1.000000  0.641644  0.745979   
...        ...       ...       ...       ...       ...       ...       ...   
WBA  -0.810144 -0.788888 -0.482522 -0.739569 -0.710359 -0.348523 -0.586548   
WDAY  0.588310  0.595777  0.172393  0.233950  0.145609  0.448618  0.650578   
XEL  -0.400249 -0.542443 -0.590300 -0.229653 -0.208771  0.344954 -0.262877   
XLNX  0.857907  0.911114  0.647094  0.780952  0.712473  0.623550  0.940269   
ZM    0.233404  0.351377  0.662045  0.355973  0.378842 -0.294925  0.120462   

          AMAT       AMD      AMGN  ...      TSLA       TXN    

### Neo4j Set Up

In [6]:
from neo4j import GraphDatabase

In [7]:
# Neo4j connection parameters.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into the notebook. When a variable is
# not set, the value previously hard-coded in this cell is used.
import os

uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")

# Neo4j connection (module-level driver used by the cells below)
driver = GraphDatabase.driver(uri=uri, auth=(user, password))

In [8]:
# Function to wipe out database by deleting all nodes and relationships
def my_neo4j_wipe_out_database():
    """
    Wipe out the Neo4j database by deleting all nodes and relationships.

    Opens a session on the module-level ``driver`` and runs a single
    ``DETACH DELETE`` over every node, which removes each node together with
    all relationships attached to it (incoming and outgoing). Prints a
    confirmation message once the statement has completed.

    Returns:
        None
    """
    with driver.session() as session:
        # consume() makes the statement run to completion here, so a failure
        # is raised before the success message below is printed.
        session.run("MATCH (n) DETACH DELETE n").consume()
    print("Neo4j database wiped clean.")

In [9]:
# Function to insert stock data into neo4j
def insert_stock_data(tx, stock, date, close, volume):
    """
    Insert one trading day of one stock into Neo4j.

    Ensures a ``Stock`` node named ``stock`` exists, ensures that this stock
    has a ``StockTradingDay`` node for ``date`` attached through a
    ``TRADING_DAY`` relationship, and stores ``close`` and ``volume`` on that
    day node.

    The day node is merged through the stock and keyed on the date only.
    Merging a free-standing ``StockTradingDay`` on (date, close, volume) would
    let two stocks that happen to share those values on the same date attach
    to one shared node, and would create a second day node if the same
    stock/date were inserted again with a different close or volume.

    Args:
        tx: Transaction-like object; only its ``run(query, **params)`` method
            is used.
        stock: Stock name stored in ``Stock.name``.
        date: Value handed to Cypher ``date()``.
            NOTE(review): presumably an ISO 'YYYY-MM-DD' string - confirm
            against the source data.
        close: Closing price; converted with Cypher ``toFloat()``.
        volume: Traded volume; converted with Cypher ``toInteger()``.

    Returns:
        None
    """
    tx.run("""
        MERGE (s:Stock {name: $stock})
        MERGE (s)-[:TRADING_DAY]->(t:StockTradingDay {date: date($date)})
        SET t.close = toFloat($close),
            t.volume = toInteger($volume)
    """, stock=stock, date=date, close=close, volume=volume)

In [10]:
# Function to store the correlation between two stocks in neo4j
def link_correlation_data(tx, stock1, stock2, correlation_value):
    """
    Create or update the CORRELATION relationship between two Stock nodes.

    Both nodes are looked up by name with MATCH. The relationship pattern is
    merged without a direction, and the relationship's ``value`` property is
    set to ``correlation_value``.

    Args:
        tx: Transaction-like object; only its ``run(query, **params)`` method
            is used.
        stock1: Name of the first Stock node.
        stock2: Name of the second Stock node.
        correlation_value: Value written to the relationship's ``value``
            property.

    Returns:
        None
    """
    cypher = """
        MATCH (s1:Stock {name: $stock1}), (s2:Stock {name: $stock2})
        MERGE (s1)-[r:CORRELATION]-(s2)
        SET r.value = $correlation_value
    """
    params = dict(
        stock1=stock1,
        stock2=stock2,
        correlation_value=correlation_value,
    )
    tx.run(cypher, **params)

In [11]:
# Function to chain a stock's trading days together in date order in neo4j
def link_trading_days(tx, stock):
    """
    Link consecutive StockTradingDay nodes of one stock with NEXT_DAY.

    The query collects the trading days attached to the stock via
    TRADING_DAY, ordered by ``date`` ascending, then walks the collected list
    by index and merges a NEXT_DAY relationship from each day to the one
    after it. When only one day is collected the index range is empty and
    nothing is linked.

    Args:
        tx: Transaction-like object; only its ``run(query, **params)`` method
            is used.
        stock: Name of the Stock node whose trading days are linked.

    Returns:
        None
    """
    cypher = """
        MATCH(s:Stock {name: $stock})-[:TRADING_DAY]->(day:StockTradingDay)
        WITH s, day
        ORDER by day.date ASC
        WITH s, collect(day) AS days
        UNWIND range(0, size(days)-2) AS i
        WITH days[i] AS current_day, days[i+1] AS next_day
        MERGE (current_day)-[:NEXT_DAY]->(next_day)
    """
    tx.run(cypher, stock=stock)

#### Implementing into Neo4j Driver

In [12]:
# Start from an empty graph. This is destructive: it deletes every node and
# relationship currently stored in the connected Neo4j database.
my_neo4j_wipe_out_database()

Neo4j database wiped clean.


In [13]:
# Creating Neo4j database with Linked Trading Days and Pearson Correlations on Stocks
with driver.session() as session:
    # Walk the four needed columns in lockstep; every row is written in its
    # own write transaction
    price_rows = zip(
        sub_nasdaq100['Name'],
        sub_nasdaq100['Date'],
        sub_nasdaq100['Close'],
        sub_nasdaq100['Volume'],
    )
    for stock, date, close, volume in price_rows:
        session.execute_write(insert_stock_data, stock, date, close, volume)

    # Once all days exist, chain each stock's trading days together
    stocks = sub_nasdaq100['Name'].unique()
    for stock in stocks:
        session.execute_write(link_trading_days, stock)

# Pearson Correlation Metrics
# (same expression as the earlier correlation cell, so this cell only needs
# normalized_data and pivot_data)
correlation_matrix = np.corrcoef(normalized_data.T)
stock_names = pivot_data.columns.tolist()

# Pair records collected for the CSV exports. Only the high-correlation pairs
# are also written to neo4j; the low-correlation pairs are kept in memory only.
pearson_low_corr = []
pearson_high_corr = []

# Cut-offs applied to the absolute value of the correlation
high_corr_cutoff = 0.8
low_corr_cutoff = 0.2

with driver.session() as session:
    n_stocks = len(stock_names)
    for i in range(n_stocks):
        stock1 = stock_names[i]
        # Starting j after i visits every unordered pair exactly once and
        # skips a stock paired with itself
        for j in range(i + 1, n_stocks):
            stock2 = stock_names[j]
            correlation_value = correlation_matrix[i, j]
            strength = abs(correlation_value)
            pair_record = {
                'Stock1': stock1,
                'Stock2': stock2,
                'Correlation': correlation_value
            }

            if strength >= high_corr_cutoff:
                pearson_high_corr.append(pair_record)

                # write to neo4j
                session.execute_write(link_correlation_data, stock1, stock2, correlation_value)

            elif strength <= low_corr_cutoff:
                pearson_low_corr.append(pair_record)

driver.close()

##### Save correlations with absolute value >= 0.8 as high correlations

In [14]:
# Explicit columns keep the Stock1/Stock2/Correlation header even when no
# pair met the high-correlation cut-off and the list is empty
df_pearson_high_corr = pd.DataFrame(
    pearson_high_corr,
    columns=['Stock1', 'Stock2', 'Correlation'],
)

In [15]:
# Write the high-correlation pairs to a CSV (path is relative to the current
# working directory), leaving out the DataFrame index
high_corr_csv_path = "pearson_high_correlation_0.8_or_more.csv"
df_pearson_high_corr.to_csv(high_corr_csv_path, index=False)

##### Save correlations with absolute value <= 0.2 as low correlations

In [16]:
# Explicit columns keep the Stock1/Stock2/Correlation header even when no
# pair met the low-correlation cut-off and the list is empty
df_pearson_low_corr = pd.DataFrame(
    pearson_low_corr,
    columns=['Stock1', 'Stock2', 'Correlation'],
)

In [17]:
# Write the low-correlation pairs to a CSV (path is relative to the current
# working directory), leaving out the DataFrame index
low_corr_csv_path = "pearson_low_correlation_0.2_or_less.csv"
df_pearson_low_corr.to_csv(low_corr_csv_path, index=False)

### Cypher Code to retrieve visuals and tables in Neo4j

##### Pearson Correlation graph between stocks and trading day clusters
MATCH (N) RETURN N

##### Pearson Correlation graph between stocks only
MATCH (s1:Stock)-[r:CORRELATION]->(s2:Stock)
RETURN s1, r, s2