## Graph Centrality in Stocks Using the PageRank Algorithm
---

*University of California, Berkeley*

In [1]:
# Import necessary packages
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the NASDAQ-100 price history; the file is tab-delimited despite its .csv extension
nasdaq100 = pd.read_csv(
    filepath_or_buffer="/user/projects/project-3-techChanakya/data/NASDAQ_100_Data_From_2010.csv",
    sep="\t",
)

In [3]:
# Analysis window (both bounds inclusive)
start_date = '2021-05-01'
end_date = '2021-09-30'

# Keep only the rows whose 'Date' falls inside the window.
# NOTE(review): 'Date' is compared against string bounds; this presumably relies on
# the column holding ISO-formatted (YYYY-MM-DD) strings - confirm against the data file.
in_window = (nasdaq100['Date'] >= start_date) & (nasdaq100['Date'] <= end_date)
sub_nasdaq100 = nasdaq100[in_window]

In [4]:
# Neo4j connection parameters
# NOTE(review): credentials are hard-coded in the notebook; consider loading them
# from the environment before sharing this file.
uri = "neo4j://neo4j:7687"
user = "neo4j"
password = "ucb_mids_w205"

# Shared driver used by every helper below
driver = GraphDatabase.driver(uri, auth=(user, password))

In [5]:
# Wipe database
def my_neo4j_wipe_out_database():
    """
    Wipe out the Neo4j database.

    Removes every node together with all of its relationships, using the
    module-level ``driver``, then prints a confirmation message.
    """
    with driver.session() as session:
        # DETACH DELETE removes each node and its relationships in one statement,
        # replacing the earlier two-pass "delete relationships, then delete nodes".
        session.run("MATCH (n) DETACH DELETE n")
    print("Neo4j database wiped clean.")

# Insert stock and trading day data 
def insert_stock_data(tx, stock, date, close, volume):
    """
    Upsert one stock, one trading-day node, and the link between them.

    tx     -- transaction object exposing ``run`` (supplied by ``execute_write``)
    stock  -- ticker stored as the ``name`` of the ``Stock`` node
    date   -- value converted with Cypher ``date()``
    close  -- value converted with Cypher ``toFloat()``
    volume -- value converted with Cypher ``toInteger()``

    NOTE(review): the ``StockTradingDay`` node is merged on the
    (date, close, volume) triple only, with no ticker in the key, so two stocks
    with identical values on the same day would share a single node.
    """
    cypher = """
        MERGE (s:Stock {name: $stock})
        MERGE (t:StockTradingDay {date: date($date), close: toFloat($close), volume: toInteger($volume)})
        MERGE (s)-[:TRADING_DAY]->(t)
    """
    params = {"stock": stock, "date": date, "close": close, "volume": volume}
    tx.run(cypher, **params)

# Insert correlation relationships between stocks into Neo4j
def insert_correlation_data(tx, stock1, stock2, correlation_value):
    """
    Link two existing ``Stock`` nodes with ``CORRELATION`` relationships.

    The relationship is written in both directions (stock1 -> stock2 and
    stock2 -> stock1), each carrying ``correlation_value`` in its ``value``
    property. Both stocks are looked up with MATCH, so nothing is written when
    either node is absent.
    """
    cypher = """
        MATCH (s1:Stock {name: $stock1}), (s2:Stock {name: $stock2})
        MERGE (s1)-[:CORRELATION {value: $correlation_value}]->(s2)
        MERGE (s2)-[:CORRELATION {value: $correlation_value}]->(s1)
    """
    params = {
        "stock1": stock1,
        "stock2": stock2,
        "correlation_value": correlation_value,
    }
    tx.run(cypher, **params)

# Calculate stock correlations based on closing prices
def calculate_stock_correlations(sub_nasdaq100):
    """
    Calculate the Pearson correlation matrix of the stocks' closing prices.

    Parameters
    ----------
    sub_nasdaq100 : DataFrame with 'Date', 'Name' and 'Close' columns, holding
        at most one row per (Date, Name) pair (``pivot`` raises otherwise).

    Returns
    -------
    (correlation_matrix, stock_names)
        ``correlation_matrix`` is a square NumPy array whose rows and columns
        follow the order of ``stock_names`` (the pivoted column labels).

    Notes
    -----
    Pearson correlation is unchanged by shifting or rescaling a series, so no
    standardisation step is needed before computing it.

    Correlations are computed on pairwise-complete observations: a missing
    close for one ticker only drops that date for pairs involving that ticker,
    instead of turning every correlation for the ticker into NaN. Pairs with
    very little overlap can therefore be based on few observations.
    """
    # One column of closing prices per ticker, one row per trading day
    closes_by_ticker = sub_nasdaq100.pivot(index='Date', columns='Name', values='Close')

    # DataFrame.corr skips NaNs pair by pair; convert back to a plain array so
    # callers can keep indexing with [i, j]
    correlation_matrix = closes_by_ticker.corr(method='pearson').to_numpy()

    return correlation_matrix, closes_by_ticker.columns.tolist()

# Insert correlation relationships for all stock pairs
def insert_all_correlations(correlation_matrix, stock_names, threshold=0.8):  # 0.8 default keeps only strong correlations
    """
    Write a CORRELATION link for every stock pair whose correlation is strong enough.

    Each unordered pair is visited once (upper triangle of the matrix). A pair
    is written when the absolute value of its correlation is at least
    ``threshold``, so strongly negative correlations are included as well.
    Every qualifying pair is written in its own write transaction.
    """
    tickers = list(stock_names)
    n_tickers = len(tickers)

    with driver.session() as session:
        for row in range(n_tickers):
            for col in range(row + 1, n_tickers):
                strength = correlation_matrix[row, col]
                # Keep the positive form of the test: a NaN correlation compares
                # False here and is therefore skipped.
                if abs(strength) >= threshold:
                    session.execute_write(
                        insert_correlation_data, tickers[row], tickers[col], strength
                    )

# Project the graph and run the PageRank algorithm
def run_pagerank():
    """
    Run the PageRank algorithm on the stock correlation graph to rank stocks by importance.

    Steps:
      1. Drop any 'nasdaq_graph' projection left in the GDS catalog by an
         earlier run, so that re-running does not fail on the projection step.
      2. Project the Stock nodes and CORRELATION relationships as 'nasdaq_graph'.
      3. Stream PageRank scores (20 iterations, damping factor 0.85), highest first.
      4. Print the full table and save it to 'pagerank_results_full.csv'.

    Returns
    -------
    DataFrame with columns 'ticker' and 'pagerank_score'. The columns are
    present even when the query returns no rows.

    Side effects: changes global pandas display options so the printed table is
    not truncated, and writes the CSV file to the current working directory.
    """
    with driver.session() as session:
        # failIfMissing=false makes this a no-op on the first run
        session.run("CALL gds.graph.drop('nasdaq_graph', false) YIELD graphName")

        # Project the graph into the GDS catalog (with correlation relationships)
        session.run("""
            CALL gds.graph.project(
                'nasdaq_graph', 
                'Stock', 
                'CORRELATION', 
                {relationshipProperties: ['value']}
            )
        """)

        # NOTE(review): 'value' is projected above, but no relationshipWeightProperty
        # is passed here, so the ranking appears to be unweighted - confirm this is intended.
        result = session.run("""
            CALL gds.pageRank.stream('nasdaq_graph', {maxIterations: 20, dampingFactor: 0.85})
            YIELD nodeId, score
            RETURN gds.util.asNode(nodeId).name AS ticker, score
            ORDER BY score DESC
        """)

        # Materialise the records while the session is still open
        results = [
            {'ticker': record['ticker'], 'pagerank_score': record['score']}
            for record in result
        ]

    # Explicit columns keep the frame (and the CSV header) well-formed when empty
    df = pd.DataFrame(results, columns=['ticker', 'pagerank_score'])

    # Avoid clipping, show the full data output in Jupyter Notebook
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)

    # Print the DataFrame in Jupyter Notebook
    print(df)

    # Export results to CSV
    df.to_csv("pagerank_results_full.csv", index=False)

    print("PageRank results data saved to 'pagerank_results_full.csv'.")

    return df

# Insert stock and trading day data into Neo4j
def insert_stock_and_trading_day_data(sub_nasdaq100):
    """
    Load every row of the scoped price table into Neo4j.

    For each row, the 'Name', 'Date', 'Close' and 'Volume' values are handed to
    ``insert_stock_data`` inside its own write transaction.
    """
    # Column order matches insert_stock_data's (stock, date, close, volume) parameters
    fields = ('Name', 'Date', 'Close', 'Volume')

    with driver.session() as session:
        for _, record in sub_nasdaq100.iterrows():
            session.execute_write(
                insert_stock_data, *(record[field] for field in fields)
            )

# Main execution function
def main(sub_nasdaq100):
    """
    Run the whole pipeline end to end on the scoped price table.

    Order matters: the database is emptied first, stock and trading-day nodes
    are loaded next, correlation links are added between the loaded stocks, and
    PageRank is run last on the resulting graph.
    """
    # 1. Start from an empty database
    my_neo4j_wipe_out_database()

    # 2. Nodes: stocks and their trading days
    insert_stock_and_trading_day_data(sub_nasdaq100)

    # 3. Edges: strong correlations between stocks
    insert_all_correlations(*calculate_stock_correlations(sub_nasdaq100))

    # 4. Rank stocks by importance
    run_pagerank()

# Run the pipeline; release the driver's connections even if a step fails
try:
    main(sub_nasdaq100)
finally:
    driver.close()

Neo4j database wiped clean.
    ticker  pagerank_score
0     ISRG        1.612057
1     COST        1.564447
2    GOOGL        1.533045
3     MSFT        1.523989
4     LULU        1.479440
5      AMD        1.449135
6     AAPL        1.439781
7     DXCM        1.426360
8     CPRT        1.423022
9     IDXX        1.417159
10    PAYX        1.414466
11    MRNA        1.410208
12    GOOG        1.408441
13     KHC        1.402646
14    ALGN        1.396271
15    ASML        1.395030
16    REGN        1.387047
17    SNPS        1.385511
18    CDNS        1.368419
19    CTAS        1.356030
20    ADBE        1.346309
21    INTU        1.344067
22    TEAM        1.341493
23    PCAR        1.308497
24    XLNX        1.297532
25     CDW        1.292861
26      FB        1.250918
27    AMAT        1.246566
28     TXN        1.246566
29    CHTR        1.242459
30    ANSS        1.230539
31    SPLK        1.225974
32    NVDA        1.215133
33    QCOM        1.188189
34     PEP        1.159825


#### How it Works
The PageRank algorithm estimates the importance of each stock relative to the other stocks, based on the correlations between them. Stocks with higher PageRank scores (e.g., ISRG, COST, GOOGL, MSFT, LULU) are deemed influential in our network structure. The opposite is true for the lower-ranked stocks, which are deemed less central to the network based on our correlation computations.

#### Interpreting Results
Depending on the computation powering the PageRank score, this type of algorithm can support many different business use cases. Since ours is built on correlations, financial advisors may take this information into account when making investment recommendations. Based on their clients' preferred investment approach, they may recommend the higher-scoring 'influencer' stocks as a starting point and move to the lower-scoring stocks for a more diversified portfolio long-term.