# Graph Construction and Analysis

## 1. Dependencies

In [4]:
import pandas as pd
import os
from graphdatascience import GraphDataScience 
from graphdatascience import ServerVersion

## 2. Connect to NEO4J database

In [5]:
uri = "bolt://localhost:7689"
username = "neo4j"          
password = "12345678"
gds = GraphDataScience(uri, auth=(username, password))

assert gds.server_version() >= ServerVersion(1,8,0)

## 3. Get the dataset

In [6]:
stocks=pd.read_csv("../data/stocks.csv")
stocks.head(n=5)

Unnamed: 0,Symbol,Name,Sector
0,AAPL,Apple Inc.,Information Technology
1,ABBV,AbbVie,Health Care
2,ABT,Abbott Laboratories,Health Care
3,ACN,Accenture,Information Technology
4,ADBE,Adobe Inc.,Information Technology


In [7]:
correlation=pd.read_csv("../data/correlation.csv")
correlation.head(n=5)

Unnamed: 0,stock_a,stock_b,correlation
0,ABBV,AAPL,0.217205
1,ABT,AAPL,0.372865
2,ACN,AAPL,0.513727
3,ADBE,AAPL,0.592649
4,AIG,AAPL,0.273366


In [8]:
correlation["correlation"].describe()

count    9900.000000
mean        0.353263
std         0.121930
min         0.072356
25%         0.266610
50%         0.340762
75%         0.427256
max         0.858524
Name: correlation, dtype: float64

In [9]:
correlation=correlation[correlation["correlation"]>0.6]
correlation["correlation"].describe()

count    318.000000
mean       0.683377
std        0.061589
min        0.600924
25%        0.628964
50%        0.673625
75%        0.718153
max        0.858524
Name: correlation, dtype: float64

## 4. Create the Graph

In [10]:
gds.run_cypher(
    """
    UNWIND $nodes AS node
    MERGE (s:Stock {symbol: node.Symbol})
    SET s.name = node.Name, s.sector=node.Sector
    MERGE (sector:Sector {name:node.Sector})
    MERGE (s)-[:BELONGS_TO] -> (sector)
    """,
    params={"nodes":stocks.to_dict("records")},
)

In [11]:
gds.run_cypher(
    """
    UNWIND $correlation AS rel
    MATCH (a:Stock {symbol: rel.stock_a})
    MATCH (b:Stock {symbol: rel.stock_b})
    MERGE (a)-[r:CORRELATED]->(b)
    SET r.correlation = toFloat(rel.correlation)
    """,
    params={"correlation": correlation.to_dict("records")}
)

In [12]:
G, result=gds.graph.project("stockGraph",["Stock"],{"CORRELATED":{"properties":["correlation"]}})
print(f"The projection took {result['projectMillis']} ms")

The projection took 145 ms


In [13]:
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")
print(f"Graph '{G.name()}' relationship count: {G.relationship_count()}")

Graph 'stockGraph' node count: 100
Graph 'stockGraph' node labels: ['Stock']
Graph 'stockGraph' relationship count: 318


## 5. Centrality Analysis

### 5.1. Eingenvector Centrality

In [14]:
eigenvector_centrality_result=gds.eigenvector.mutate(G,maxIterations=100,mutateProperty="eigenvectorCentrality")

In [15]:
G.node_properties()

Stock    [eigenvectorCentrality]
dtype: object

In [16]:
if eigenvector_centrality_result.didConverge:
    print(
        f"The number of iterations taken by Eigenvector Centrality to run is {eigenvector_centrality_result.ranIterations}."
    )
else:
    print("Algorithm did not converge")

The number of iterations taken by Eigenvector Centrality to run is 18.


In [17]:
eigenvector_centrality_result.centralityDistribution

{'Error': 'Unable to create histogram due to range of scores exceeding implementation limits.'}

In [18]:
gds.graph.nodeProperties.write(G,["eigenvectorCentrality"])

writeMillis                                                         40
graphName                                                   stockGraph
nodeProperties                                 [eigenvectorCentrality]
propertiesWritten                                                  100
configuration        {'jobId': '99aa3c45-15ab-4019-ae12-168652c1f7c...
Name: 0, dtype: object

In [19]:
def top_10_stocks(centrality_measure):
    query=f"""
    MATCH (s:Stock)
    RETURN s.symbol AS symbol, s.name AS name, s.sector AS sector, s.{centrality_measure} AS {centrality_measure}
    ORDER BY s.{centrality_measure} DESC
    LIMIT 10
    """
    result=gds.run_cypher(query)
    print(result)
    
top_10_stocks("eigenvectorCentrality")

  symbol                          name      sector  eigenvectorCentrality
0    JPM                JPMorgan Chase  Financials               0.283733
1    MET                       MetLife  Financials               0.282753
2     BK                    BNY Mellon  Financials               0.276688
3     MS                Morgan Stanley  Financials               0.276688
4    BAC               Bank of America  Financials               0.276688
5  BRK-B  Berkshire Hathaway (Class B)  Financials               0.276411
6    USB                  U.S. Bancorp  Financials               0.267151
7    WFC                   Wells Fargo  Financials               0.267151
8     GS                 Goldman Sachs  Financials               0.265567
9    AXP              American Express  Financials               0.263075


### 5.2. Betweeness Centrality

In [20]:
betweennes_centrality_result=gds.betweenness.mutate(G,mutateProperty="betweennessCentrality")

In [21]:
G.node_properties()

Stock    [betweennessCentrality, eigenvectorCentrality]
dtype: object

In [22]:
betweennes_centrality_result.centralityDistribution

{'min': 0.0,
 'max': 111.38134765624999,
 'p90': 12.547660827636719,
 'p999': 111.38134002685547,
 'p99': 91.00048065185547,
 'p50': 0.0,
 'p75': 0.0,
 'p95': 36.00023651123047,
 'mean': 5.600013809204102}

In [23]:
gds.graph.nodeProperties.write(G,["betweennessCentrality"])

writeMillis                                                         33
graphName                                                   stockGraph
nodeProperties                                 [betweennessCentrality]
propertiesWritten                                                  100
configuration        {'jobId': '0635fe79-2651-41aa-b941-049edf07067...
Name: 0, dtype: object

In [24]:
top_10_stocks("betweennessCentrality")

  symbol                          name                  sector  \
0    MET                       MetLife              Financials   
1   NVDA                        Nvidia  Information Technology   
2   MSFT                     Microsoft  Information Technology   
3    CAT              Caterpillar Inc.             Industrials   
4    HON                     Honeywell             Industrials   
5   QCOM                      Qualcomm  Information Technology   
6  BRK-B  Berkshire Hathaway (Class B)              Financials   
7   ADBE                    Adobe Inc.  Information Technology   
8   INTU                        Intuit  Information Technology   
9    EMR              Emerson Electric             Industrials   

   betweennessCentrality  
0             111.380952  
1              91.000000  
2              67.000000  
3              38.000000  
4              38.000000  
5              36.000000  
6              34.000000  
7              33.000000  
8              26.666667  
9  

### 5.3. Degree Centrality

In [25]:
degree_centrality_result=gds.degree.mutate(G,mutateProperty="degreeCentrality")

In [26]:
G.node_properties()

Stock    [betweennessCentrality, eigenvectorCentrality,...
dtype: object

In [27]:
degree_centrality_result.centralityDistribution

{'min': 0.0,
 'max': 17.000122070312496,
 'p90': 12.000053405761719,
 'p999': 17.00011444091797,
 'p99': 15.000053405761719,
 'p50': 1.0,
 'p75': 4.000022888183594,
 'p95': 14.000053405761719,
 'mean': 3.1800079345703125}

In [28]:
gds.graph.nodeProperties.write(G,["degreeCentrality"])

writeMillis                                                         24
graphName                                                   stockGraph
nodeProperties                                      [degreeCentrality]
propertiesWritten                                                  100
configuration        {'jobId': 'bcd55787-1969-4b9b-89da-78bedaeb641...
Name: 0, dtype: object

In [29]:
top_10_stocks("degreeCentrality")

  symbol                          name      sector  degreeCentrality
0    MET                       MetLife  Financials              17.0
1    JPM                JPMorgan Chase  Financials              15.0
2  BRK-B  Berkshire Hathaway (Class B)  Financials              15.0
3    BAC               Bank of America  Financials              14.0
4     BK                    BNY Mellon  Financials              14.0
5     MS                Morgan Stanley  Financials              14.0
6     GS                 Goldman Sachs  Financials              13.0
7    USB                  U.S. Bancorp  Financials              13.0
8    AXP              American Express  Financials              13.0
9    WFC                   Wells Fargo  Financials              13.0


## 6. Community Detection

In [30]:
assert gds.server_version() >= ServerVersion(2, 5, 0)

### 6.1. Weakly Connected Components

In [31]:
weakly_result=gds.wcc.mutate(G,mutateProperty="componentId")

In [32]:
G.node_properties()

Stock    [betweennessCentrality, eigenvectorCentrality,...
dtype: object

In [33]:
print(f"Components found: {weakly_result['componentCount']}")

Components found: 48


In [34]:
query = """
    CALL gds.graph.nodeProperties.stream('stockGraph', 'componentId')
    YIELD nodeId, propertyValue
    WITH gds.util.asNode(nodeId).symbol AS symbol, propertyValue AS componentId
    WITH componentId, collect(symbol) AS stocks
    WITH componentId, stocks, size(stocks) AS componentSize
    RETURN componentId, componentSize, stocks
    ORDER BY componentSize DESC
"""
components = gds.run_cypher(query)
components

Unnamed: 0,componentId,componentSize,stocks
0,5,21,"[AIG, AXP, BAC, BK, BLK, BRK-B, C, CAT, COF, D..."
1,0,15,"[AAPL, ACN, ADBE, AMD, AMZN, AVGO, CRM, GOOGL,..."
2,22,5,"[CL, KO, MDLZ, PEP, PG]"
3,25,3,"[COP, CVX, XOM]"
4,35,3,"[DUK, NEE, SO]"
5,39,3,"[GD, LMT, RTX]"
6,21,2,"[CHTR, CMCSA]"
7,32,2,"[DHR, TMO]"
8,37,2,"[F, GM]"
9,38,2,"[FDX, UPS]"


In [35]:
largest_component = components["componentId"][0]
print(f"The largest component has id {largest_component} with {components['componentSize'][0]} stocks.")

The largest component has id 5 with 21 stocks.


In [36]:
largest_component_graph, _ = gds.graph.filter(
    "largest_connected_component",
    G,
    f"n.componentId={largest_component}",
    "*"
)

In [37]:
largest_component_graph

Graph({'graphName': 'largest_connected_component', 'nodeCount': 21, 'relationshipCount': 210, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'jobId': 'a98a9e3b-3342-4524-8715-48dc1e0d3239', 'validateRelationships': False, 'nodeFilter': 'n.componentId=5', 'sudo': False, 'relationshipProjection': {'CORRELATED': {'aggregation': 'DEFAULT', 'orientation': 'NATURAL', 'indexInverse': False, 'properties': {'correlation': {'aggregation': 'DEFAULT', 'property': 'correlation', 'defaultValue': None}}, 'type': 'CORRELATED'}}, 'readConcurrency': 4, 'nodeProperties': {}, 'nodeProjection': {'Stock': {'label': 'Stock', 'properties': {}}}, 'logProgress': True, 'concurrency': 4, 'relationshipFilter': '*', 'parameters': {}}, 'schema': {'graphProperties': {}, 'nodes': {'Stock': {'betweennessCentrality': 'Float (DefaultValue(NaN), TRANSIENT)', 'eigenvectorCentrality': 'Float (DefaultValue(NaN), TRANSIENT)', 'degreeCentrality': 'Float (DefaultValue(NaN), TRANSIENT)', 'componentId': 'Int

### 6.2. Community Detection using Louvain

In [38]:
gds.louvain.mutate(
    largest_component_graph,
    mutateProperty="louvainCommunityId"
)

Louvain: 100%|██████████| 100.0/100 [00:00<00:00, 482.74%/s]


mutateMillis                                                             0
nodePropertiesWritten                                                   21
modularity                                                         0.12839
modularities                                         [0.12839002267573693]
ranLevels                                                                1
communityCount                                                           2
communityDistribution    {'min': 7, 'p5': 7, 'max': 14, 'p999': 14, 'p9...
postProcessingMillis                                                     3
preProcessingMillis                                                      0
computeMillis                                                          676
configuration            {'mutateProperty': 'louvainCommunityId', 'jobI...
Name: 0, dtype: object

In [39]:
gds.graph.nodeProperties.write(largest_component_graph, ["louvainCommunityId"])

writeMillis                                                         40
graphName                                  largest_connected_component
nodeProperties                                    [louvainCommunityId]
propertiesWritten                                                   21
configuration        {'jobId': '3c53e6b9-7add-4966-8885-4529adfa200...
Name: 0, dtype: object

In [40]:
gds.run_cypher(
    """
    MATCH (n) WHERE 'louvainCommunityId' IN keys(n)
    RETURN n.symbol AS symbol, n.louvainCommunityId AS communityId
    LIMIT 10
    """
)

Unnamed: 0,symbol,communityId
0,AIG,12
1,AXP,12
2,BAC,12
3,BK,12
4,BLK,12
5,BRK-B,12
6,C,12
7,CAT,9
8,COF,12
9,DE,9


In [41]:
query = """
    CALL gds.graph.nodeProperties.stream('largest_connected_component', 'louvainCommunityId')
    YIELD nodeId, propertyValue
    WITH gds.util.asNode(nodeId).symbol AS symbol, propertyValue AS communityId
    WITH communityId, collect(symbol) AS stocks
    WITH communityId, stocks, size(stocks) AS communitySize
    RETURN communityId, communitySize, stocks
    ORDER BY communitySize DESC
"""
communities = gds.run_cypher(query)
communities

Unnamed: 0,communityId,communitySize,stocks
0,12,14,"[AIG, AXP, BAC, BK, BLK, BRK-B, C, COF, GS, JP..."
1,9,7,"[CAT, DE, DOW, EMR, HON, MET, MMM]"


## 7. Saved the graph

In [42]:
gds.run_cypher("""
    MATCH (s:Stock)
    SET s.eigenvectorCentrality = s.eigenvectorCentrality
""")

gds.run_cypher("""
    MATCH (s:Stock)
    SET s.betweennessCentrality = s.betweennessCentrality
""")

gds.run_cypher("""
    MATCH (s:Stock)
    SET s.degreeCentrality = s.degreeCentrality
""")

gds.run_cypher("""
    MATCH (s:Stock)
    SET s.componentId = s.componentId
""")

gds.run_cypher("""
    MATCH (s:Stock)
    SET s.louvainCommunityId = s.louvainCommunityId
""")