In [1]:
import os
from getpass import getpass

import pandas as pd
from py2neo import Graph

# Connection settings are read from the environment so that no credential is
# stored in the notebook (or leaked through version control / shared copies).
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# Fall back to an interactive prompt when NEO4J_PASSWORD is not set.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Single shared connection used by every query cell below.
graph = Graph(NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
# graph = Graph()

In [2]:
import matplotlib 
import matplotlib.pyplot as plt

### Part 1, EDA

#### Let's drill down into the Nedbank Behaviour db. How many nodes do we have for each label?

https://colab.research.google.com/github/neo4j-contrib/training-v2/blob/master/Courses/DataScience/notebooks/02_EDA.ipynb#scrollTo=0r69d4ek5huR

In [3]:
# https://neo4j.com/graphacademy/online-training/data-science/part-2/
# Tally how many nodes carry each label that the database reports.
result = {"label": [], "count": []}
all_labels = graph.run("CALL db.labels()").to_series()
for label in all_labels:
    # The label is back-tick quoted so names with special characters stay valid Cypher.
    query = f"MATCH (:`{label}`) RETURN count(*) as count"
    count_frame = graph.run(query).to_data_frame()
    count = count_frame.iloc[0]['count']
    result["label"] += [label]
    result["count"] += [count]
nodes_df = pd.DataFrame(data=result)
nodes_df.sort_values("count")

Unnamed: 0,label,count
1,Merchant,578685
0,Client,2216269


#### Visualize counts:

In [None]:
# Bar chart of node counts per label; the y axis is logarithmic.
node_ax = nodes_df.plot(kind='bar', x='label', y='count', legend=None, title="Node Cardinalities")
node_ax.set_yscale("log")
plt.setp(node_ax.get_xticklabels(), rotation=45)
node_ax.figure.tight_layout()
plt.show()

#### Here are the types of relationships and their counts in the db:

In [None]:
# Count the relationships of every type that the database reports.
rel_types = graph.run("CALL db.relationshipTypes()").to_series()
result = {"relType": [], "count": []}
for relationship_type in rel_types:
    query = f"MATCH ()-[:`{relationship_type}`]->() RETURN count(*) as count"
    type_count_frame = graph.run(query).to_data_frame()
    count = type_count_frame.iloc[0]['count']
    result["relType"] += [relationship_type]
    result["count"] += [count]
rels_df = pd.DataFrame(data=result)
rels_df.sort_values("count")

#### Visualize relationship cardinalities:

In [None]:
# Bar chart of relationship counts per relationship type.
rel_ax = rels_df.plot(kind='bar', x='relType', y='count', legend=None, title="Relationship Cardinalities")
plt.setp(rel_ax.get_xticklabels(), rotation=45)
rel_ax.figure.tight_layout()
plt.show()

#### Now let's explore the Merchant data. 

Now let's explore the transaction data in more detail. We need to zoom in on one merchant. The following query finds Dischem, and Dischem Dainfern Square in particular (merchant1), the unique Nedbank clients that visited this merchant, any other merchant (merchant2) each of these clients may have visited, and the number of TRANSACTED_AT relationships recorded at each merchant (merchant1_transactions and merchant2_transactions):

In [None]:
# For every client of DIS-CHEM DAINFERN, list each *other* merchant that client
# also transacted at. The two size(...) columns count the TRANSACTED_AT
# relationships attached to merchant1 and merchant2 respectively.
# ORDER BY rand() shuffles the rows; there is no LIMIT, so the whole result set
# is pulled into the DataFrame.
exploratory_query = """
MATCH (merchant1:Merchant {franchisename:'DIS-CHEM DAINFERN'})<-[:TRANSACTED_AT]-(client:Client)-[:TRANSACTED_AT]->(merchant2:Merchant)
WHERE merchant1<>merchant2
RETURN merchant1.franchisename AS merchant1, client.dedupestatic AS dedupestatic,  merchant2.franchisename AS merchant2, 
       size((merchant1)-[:TRANSACTED_AT]-()) AS merchant1_transactions, 
       size((merchant2)-[:TRANSACTED_AT]-()) AS merchant2_transactions
ORDER BY rand()
"""

graph.run(exploratory_query).to_data_frame()

In [None]:
# Same traversal as a client-anonymous version: the connecting node is not
# named or returned, so only the merchant1/merchant2 pair and their
# TRANSACTED_AT relationship counts come back (one row per connecting path).
exploratory_query = """
MATCH (merchant1:Merchant {franchisename:'DIS-CHEM DAINFERN'})<-[:TRANSACTED_AT]-()-[:TRANSACTED_AT]->(merchant2:Merchant)
WHERE merchant1<>merchant2
RETURN merchant1.franchisename AS merchant1, merchant2.franchisename AS merchant2, 
       size((merchant1)-[:TRANSACTED_AT]-()) AS merchant1_transactions, 
       size((merchant2)-[:TRANSACTED_AT]-()) AS merchant2_transactions
ORDER BY rand()
"""

graph.run(exploratory_query).to_data_frame()

In [None]:
# Transaction counts per franchise for every Merchant whose companyname is 'DISCHEM'.
# NOTE(review): the name `citation_df` looks like a leftover from a citation-graph
# tutorial; the frame holds merchant transaction counts.
query = """
MATCH (m:Merchant {companyname:'DISCHEM'})-[transaction:TRANSACTED_AT]-(client:Client)
RETURN m.franchisename AS Merchant, count(transaction) AS transactions
"""

citation_df = graph.run(query).to_data_frame()
# Summary statistics including the 25/50/75/90/99th percentiles.
citation_df.describe([.25, .5, .75, .9, .99])

In [None]:
# Display the per-franchise transaction counts held in citation_df.
citation_df

#### Find popular first-degree merchants ('friends' linked via customers):

How many unique clients transacted at this merchant?

In [None]:
# Number of distinct clients with a TRANSACTED_AT relationship to DIS-CHEM DAINFERN.
# count(DISTINCT client) is required for the `uniqueClients` alias to be true:
# a plain count(client) counts one row per matched relationship, so a client
# linked to the merchant more than once would be counted repeatedly.
popular_merchants_query = """
MATCH (merchant1:Merchant {franchisename:'DIS-CHEM DAINFERN'})<-[:TRANSACTED_AT]-(client)
RETURN count(DISTINCT client) AS uniqueClients
"""

graph.run(popular_merchants_query).to_data_frame()

Here we look at the number of customer transactions at first degree merchants:

In [None]:
# For each other merchant reached through a DIS-CHEM DAINFERN client, count the
# TRANSACTED_AT relationships leading to it, most frequent first.
# The grouping key is merchant2.franchisename, so merchants sharing a franchise
# name are aggregated into one row.
popular_merchants_query = """
MATCH (merchant1:Merchant {franchisename:'DIS-CHEM DAINFERN'})<-[:TRANSACTED_AT]-()-[merchantTransactions:TRANSACTED_AT]->(merchant2:Merchant)
WHERE merchant1<>merchant2
RETURN DISTINCT(merchant2.franchisename) AS first_degree_merchant, 
       count(merchantTransactions) AS first_degree_merchant_transactions
ORDER BY first_degree_merchant_transactions DESC
"""

graph.run(popular_merchants_query).to_data_frame()

In [None]:
# Run popular_merchants_query again and keep the result for the histogram below.
popular_fd=graph.run(popular_merchants_query).to_data_frame()

In [None]:
# Normalised histogram (1250 bins) of transaction counts at first-degree
# merchants, drawn on a logarithmic x axis.
first_degree_counts = popular_fd['first_degree_merchant_transactions'].dropna()
fig1, ax1 = plt.subplots()
ax1.hist(first_degree_counts, bins=1250, density=True, facecolor='g', alpha=0.75)
ax1.set_xscale("log")
fig1.tight_layout()
plt.show()

#### There is almost a 50% chance of a Dischem DSQ shopper also shopping at PNP DSQ, i.e. roughly one in two Dischem shoppers made their way to PNP.

Choose the most popular first-degree merchant (relative to DIS-CHEM DAINFERN). Check the spelling!

In [None]:
# Merchant used as the starting point for the next queries.
# NOTE(review): the spelling presumably mirrors the franchisename stored in the
# database ('DEINFERN SQUAR'); do not "correct" it without checking the data.
merchantName='PNP CRP DEINFERN SQUAR'

How many unique clients transacted at this merchant:

In [None]:
# Number of distinct clients with a TRANSACTED_AT relationship to the merchant
# named by `merchantName`.
# count(DISTINCT client) is required for the `uniqueClients` alias to be true:
# a plain count(client) counts one row per matched relationship.
popular_merchants_query = """
MATCH (merchant1:Merchant {franchisename:$merchantName})<-[:TRANSACTED_AT]-(client)
RETURN count(DISTINCT client) AS uniqueClients
"""
graph.run(popular_merchants_query, {"merchantName": merchantName}).to_data_frame()

Now.  Take 'PNP CRP DEINFERN SQUAR' as merchant zero and extract its first degree merchant friends:

In [None]:
# First-degree merchants of `merchantName`: every other merchant reached through
# one of its clients, with the number of TRANSACTED_AT relationships leading
# there, most frequent first.
popular_merchants_query = """
MATCH (merchant1:Merchant {franchisename:$merchantName})<-[:TRANSACTED_AT]-()-[merchantTransactions:TRANSACTED_AT]->(merchant2:Merchant)
WHERE merchant1<>merchant2
RETURN DISTINCT(merchant2.franchisename) AS merchant2, 
       count(merchantTransactions) AS merchant2_transactions
ORDER BY merchant2_transactions DESC
"""

graph.run(popular_merchants_query, {"merchantName": merchantName}).to_data_frame()

There is hence a less than 30% (116/383) chance of a client visiting both PNP and Dischem at this mall, i.e. of all the PNP shoppers, fewer than 1/3 made their way to Dischem. But the probability of PNP-WW is also lower (112/383) than that of DISCHEM-WW (111/252).

#### Carry on with DSQ merchant, PNP CRP DEINFERN SQUAR.

#### Merchant of Merchant transactions

In [None]:
# Top 20 clients of `merchantName`, ranked by their total number of
# TRANSACTED_AT relationships (momTransactions). merchantTransactions counts the
# relationships between that client and this merchant only.
merchantName='PNP CRP DEINFERN SQUAR'
merchantName_clients = """
MATCH (merchant:Merchant {franchisename: $merchantName})<-[:TRANSACTED_AT]-(client:Client)
RETURN client.dedupestatic AS client, size((client)-[:TRANSACTED_AT]-(merchant)) AS merchantTransactions,
size((client)-[:TRANSACTED_AT]-()) AS momTransactions
ORDER BY momTransactions DESC
LIMIT 20
"""

graph.run(merchantName_clients,  {"merchantName": merchantName}).to_data_frame()

### Find the co-merchants, the merchant-of-merchants

In [None]:
# Ten merchants most often reached through clients of `merchantName`
# ("merchants of merchants"), counted per connecting path.
# NOTE(review): unlike the earlier first-degree queries there is no
# `merchant1<>merchant2` filter here — confirm whether the starting merchant
# should be excluded explicitly.
merchantName='PNP CRP DEINFERN SQUAR'
collaborations_query = """
MATCH (:Merchant {franchisename: $merchantName})<-[:TRANSACTED_AT]-(client)-[:TRANSACTED_AT]->(momMerchant:Merchant)
RETURN momMerchant.franchisename AS momMerchant, count(*) AS coMerchantsCount
ORDER BY coMerchantsCount DESC
LIMIT 10
"""

graph.run(collaborations_query,  {"merchantName": merchantName}).to_data_frame()

### cocMerchants:

## Recommendations

### Page Ranking

Copy of 03_Recommendations_Part1.ipynb

https://www.cs.princeton.edu/~chazelle/courses/BIB/pagerank.htm

PageRank is an algorithm that measures the transitive influence or connectivity of nodes. It can be computed by either iteratively distributing one node’s rank (originally based on degree) over its neighbors or by randomly traversing the graph and counting the frequency of hitting each node during these walks.

The following PageRank code is run over the whole graph to find out the most influential Merchant in terms of transactions:

#### Find popular merchants:

In [None]:
# Ten merchants with the most incoming TRANSACTED_AT relationships across the
# whole graph.
popular_merchants_query = """
MATCH (merchant:Merchant)
RETURN merchant.franchisename, size((merchant)<-[:TRANSACTED_AT]-()) AS transactions
ORDER BY transactions DESC
LIMIT 10
"""

graph.run(popular_merchants_query).to_data_frame()

#### Pick one merchant, franchisename: 'DIS-CHEM DAINFERN'

In [None]:
# Franchise passed as the $franchisename parameter to the queries that follow.
franchisename='DIS-CHEM DAINFERN'

In [None]:
# Look up the companyname stored on the Merchant node(s) with this franchisename.
query = """
MATCH (m:Merchant {franchisename:$franchisename})
RETURN m.companyname
"""
graph.run(query, {"franchisename": franchisename}).to_data_frame()

In [None]:
# Create the 'merchants' full-text index on Merchant.franchisename, but only if
# it does not exist yet. Calling createNodeIndex for an index that is already
# present raises an error, which would break a re-run of the notebook.
existing_index_names = {
    record["name"]
    for record in graph.run("CALL db.indexes() YIELD name RETURN name").data()
}
query = """
    CALL db.index.fulltext.createNodeIndex('merchants', ['Merchant'], ['franchisename'])
"""
if 'merchants' not in existing_index_names:
    graph.run(query).data()

In [None]:
# List the full-text indexes currently defined in the database.
query = """
CALL db.indexes()
YIELD name, uniqueness, type
WHERE type = "FULLTEXT"
RETURN *
"""
graph.run(query).to_data_frame()

In [None]:
# Full-text search for "columbine" in the 'merchants' index. For each hit the
# pattern comprehension collects the dedupestatic of every client connected to
# that merchant by TRANSACTED_AT.
query = """
CALL db.index.fulltext.queryNodes("merchants", "columbine")
YIELD node, score
RETURN node.franchisename, score, [(client)-[:TRANSACTED_AT]-(node) | client.dedupestatic] AS clients
LIMIT 10
"""
graph.run(query).to_data_frame()

In [None]:
# Rank clients by how strongly the merchants they transacted at match the
# full-text search "columbine": the search score is summed per client and the
# matching franchise names are collected alongside.
# The index name is 'merchants' (the index created above), and the RETURN uses
# the variables bound in this query (`client`, `node`) with Merchant/Client
# properties that exist in this graph.
query = """
CALL db.index.fulltext.queryNodes("merchants", "columbine")
YIELD node, score
MATCH (node)-[:TRANSACTED_AT]-(client)
RETURN client.dedupestatic AS client, sum(score) AS totalScore, collect(node.franchisename) AS merchants
ORDER By totalScore DESC
LIMIT 20
"""

graph.run(query).to_data_frame()

In [None]:
# NOTE(review): `result` was last assigned as the relationship-count dict, so
# iterating it yields its keys — this looks like a leftover; confirm intent.
nodes = [n for n in result]

In [None]:
# Display the list built in the previous cell.
nodes

In [None]:
# Fetch every Merchant node whose companyname is 'DISCHEM' as a list of dicts.
my_node = graph.run("""MATCH (merchant:Merchant {companyname:'DISCHEM'})
RETURN merchant """).data()

In [None]:
# Display the records returned by the previous query.
my_node

In [None]:
# One record per TRANSACTED_AT relationship between DIS-CHEM DAINFERN and a Client.
my_node = graph.run("""MATCH (merchant:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client) 
RETURN merchant.franchisename, c.dedupestatic """).data()

In [None]:
# Number of records returned by the previous query.
len(my_node)

Retrieve the customers this merchant had and how many transactions they had:

In [None]:
# Top 20 clients of `franchisename`, ranked by their total number of outgoing
# TRANSACTED_AT relationships (this merchant included).
# NOTE(review): the variable name `author_articles_query` looks like a leftover
# from a citation-graph tutorial; the query is about clients and merchants.
author_articles_query = """
MATCH (:Merchant {franchisename: $franchisename})<-[:TRANSACTED_AT]-(client)
RETURN client.dedupestatic AS client, size((client)-[:TRANSACTED_AT]->()) AS other_transactions
ORDER BY other_transactions DESC
LIMIT 20
"""
graph.run(author_articles_query,  {"franchisename": franchisename}).to_data_frame()

Retrieve the customers this merchant had and how many other transactions they had:

In [None]:
# Top 20 clients of `franchisename`, ranked by their transactions at *other*
# merchants (merchant1<>merchant2 excludes the starting merchant).
# NOTE(review): count() is applied to a pattern expression here; presumably it
# counts one per matched row — confirm this yields the intended number.
author_articles_query = """
MATCH (merchant1:Merchant {franchisename: $franchisename})<-[:TRANSACTED_AT]-(client)-[:TRANSACTED_AT]->(merchant2)
WHERE merchant1<>merchant2
RETURN client.dedupestatic AS client, count((client)-[:TRANSACTED_AT]->(merchant2)) AS other_transactions
ORDER BY other_transactions DESC
LIMIT 20
"""
graph.run(author_articles_query,  {"franchisename": franchisename}).to_data_frame()

In [None]:
# Ten nodes most often reached through clients of `franchisename`, counted per
# connecting path. The far end (`comerchant`) carries no label constraint.
collaborations_query = """
MATCH (:Merchant {franchisename: $franchisename})<-[:TRANSACTED_AT]-(client)-[:TRANSACTED_AT]->(comerchant)
RETURN comerchant.franchisename AS franchisename, count(*) AS cotransactions
ORDER BY cotransactions DESC
LIMIT 10
"""

graph.run(collaborations_query,  {"franchisename": franchisename}).to_data_frame()

In [None]:
# Every (DIS-CHEM DAINFERN, client, other merchant) path, where "other" means a
# merchant with a different franchisename. Returns the raw rows, not aggregates.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client)-
[:TRANSACTED_AT]-(other:Merchant)
WHERE other.franchisename<>dischem.franchisename 
RETURN dischem.franchisename, c.dedupestatic, other.franchisename, other.companyname""").data()

In [None]:
# Display the records returned by the previous query.
my_node

In [None]:
# Per DIS-CHEM DAINFERN client: how many distinct other merchants (different
# franchisename) that client is connected to.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client)-
[:TRANSACTED_AT]-(other:Merchant)
WHERE other.franchisename<>dischem.franchisename 
RETURN DISTINCT(c.dedupestatic), count(DISTINCT other)""").data()
my_node

In [None]:
# Per other franchise name: number of TRANSACTED_AT relationships made there by
# clients of DIS-CHEM DAINFERN.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client)-
[othertransaction:TRANSACTED_AT]-(other:Merchant)
WHERE other.franchisename<>dischem.franchisename 
RETURN DISTINCT(other.franchisename) AS other_franchisename, count(othertransaction) AS number_transactions""").data()
my_node

In [None]:
# Same aggregation as the previous cell, written with an intermediate WITH that
# performs the grouping before the final RETURN.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client)-
[othertransaction:TRANSACTED_AT]-(other:Merchant)
WHERE other.franchisename<>dischem.franchisename 
WITH other.franchisename AS other_franchisename, count(othertransaction) AS number_transactions 
RETURN DISTINCT(other_franchisename), number_transactions""").data()
my_node

In [None]:
# Common-neighbours score between each DIS-CHEM DAINFERN client (c0) and the
# single client whose dedupestatic equals the string '2.11279273006e+11' (c1).
# NOTE(review): the id is matched as a string literal — confirm dedupestatic is
# stored as a string in this form rather than as a number.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c0:Client)
MATCH (c1:Client {dedupestatic:'2.11279273006e+11'})  
WHERE c0.dedupestatic <> c1.dedupestatic  
RETURN gds.alpha.linkprediction.commonNeighbors(c0, c1)""").data()
my_node

In [None]:
# Pairs of DIS-CHEM DAINFERN clients whose common-neighbours score exceeds 5.
# c0 and c1 range over the same set of clients, so each pair presumably shows
# up in both orders (c0,c1) and (c1,c0) — TODO confirm / deduplicate if needed.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c0:Client) 
MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c1:Client) 
WHERE c0.dedupestatic <> c1.dedupestatic AND gds.alpha.linkprediction.commonNeighbors(c0, c1)>5 
RETURN c0.dedupestatic, c1.dedupestatic, gds.alpha.linkprediction.commonNeighbors(c0, c1) as commons""").data()
my_node

In [None]:
# Attempt to compare DIS-CHEM DAINFERN clients against clients outside the
# collected `clients` list.
# NOTE(review): the WITH clause carries forward only `clients`, so the later
# references to `c0` (WHERE and RETURN) name a variable that is no longer in
# scope — this query looks like it cannot run as written.
# NOTE(review): c1 is matched from the same merchant's clients, so
# `c1 NOT in clients` would reject every row even once `c0` is restored;
# confirm the intended population for c1 before fixing.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c0:Client)
WITH collect(distinct c0) as clients 
MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c1:Client) 
WHERE c1 NOT in clients AND gds.alpha.linkprediction.commonNeighbors(c0, c1)>5 
RETURN c0.dedupestatic, c1.dedupestatic""").data()
my_node

In [None]:
 MATCH (p1:Person {name: 'Michael'})
 MATCH (p2:Person {name: 'Karin'})
 RETURN gds.alpha.linkprediction.commonNeighbors(p1, p2) AS score

In [None]:
# Distinct ordered pairs of different clients that both transacted at
# DIS-CHEM DAINFERN.
# DISTINCT is a modifier of the whole RETURN clause and may appear only once,
# directly after RETURN; it de-duplicates the (c0, c1) rows as a unit.
my_node = graph.run("""MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c0:Client)
MATCH (dischem:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c1:Client)
WHERE c0.dedupestatic <> c1.dedupestatic
RETURN DISTINCT c0.dedupestatic, c1.dedupestatic""").data()
my_node

#### Peek into the file:

In [None]:
# One record per TRANSACTED_AT relationship between DIS-CHEM DAINFERN and a
# Client (same query as an earlier cell).
my_node = graph.run("""MATCH (merchant:Merchant {franchisename:'DIS-CHEM DAINFERN'})-[:TRANSACTED_AT]-(c:Client) 
RETURN merchant.franchisename, c.dedupestatic """).data()

In [None]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel; presumably `my_node` from the cell above
# was meant — confirm.
data

In [None]:
# beats.csv — preview the first 10 rows from the Neo4j import directory.
# NOTE(review): absolute server path; only works on a machine with this layout.
filename='/var/lib/neo4j/import/beats.csv'
beats = pd.read_csv(filename,nrows=10)
beats

In [None]:
# crimes_header.csv — preview the first 10 rows from the Neo4j import directory.
filename='/var/lib/neo4j/import/crimes_header.csv'
crimes_header_csv = pd.read_csv(filename,nrows=10)
crimes_header_csv

In [None]:
# crimes.csv — preview the first 10 rows from the Neo4j import directory.
filename='/var/lib/neo4j/import/crimes.csv'
crimes_csv = pd.read_csv(filename,nrows=10)
crimes_csv

In [17]:
# Load up to 3,000,000 rows of the March 2020 client swipes extract.
# `filename` is rebound here and is the path later cells see.
import csv
filename='/var/lib/neo4j/clientswipes_202003.csv'
clientswipes_202003_csv = pd.read_csv(filename,nrows=3000000)

In [22]:
# Inspect the raw TransactionDate column (shown below with dtype object, i.e.
# the ISO timestamps are still plain strings).
clientswipes_202003_csv['TransactionDate']

0          2020-03-01T00:00:00.000Z
1          2020-03-01T00:00:00.000Z
2          2020-03-01T00:00:00.000Z
3          2020-03-01T00:00:00.000Z
4          2020-03-01T00:00:00.000Z
                     ...           
2999995    2020-03-19T00:00:00.000Z
2999996    2020-03-19T00:00:00.000Z
2999997    2020-03-19T00:00:00.000Z
2999998    2020-03-19T00:00:00.000Z
2999999    2020-03-19T00:00:00.000Z
Name: TransactionDate, Length: 3000000, dtype: object

In [None]:
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            print(f'\t{row[0]} works in the {row[1]} department, and was born in {row[2]}.')
            line_count += 1
    print(f'Processed {line_count} lines.')

In [None]:
# Distinct Dedupegroup values (client ids) found in `df`.
# NOTE(review): `df` is not defined in the visible cells; presumably it is the
# swipes frame loaded from clientswipes_202003.csv — confirm and bind it explicitly.
dedupegroup_list=list(set(df.Dedupegroup))

In [None]:
# Create one Client node per distinct dedupe-group id. MERGE only creates the
# node when no Client with that dedupestatic exists, so the cell can be re-run.
# The Cypher parameter uses the `$name` form, as in the query cells above; the
# legacy `{name}` placeholder is not accepted by Neo4j 4.x.
# enumerate supplies the progress index directly instead of an O(n) list.index
# lookup on every iteration.
client_ids = list(set(dedupegroup_list))
for position, dedupestatic in enumerate(client_ids):
    print('Doing dedupestatic: ', dedupestatic, ' at index: ', position, ' of ', len(client_ids))
    client_param_dict={"dedupestatic":dedupestatic}
    graph.run("MERGE (client:Client {dedupestatic:$dedupestatic})", client_param_dict)

In [None]:
# Load every swipe in `df` into the graph, one client (Dedupegroup) at a time.
# For each swipe the Client, Franchise and Company nodes are MERGEd, then the
# client->franchise `transacted` relationship (amount/date are appended to list
# properties) and the franchise->company `owned` relationship.
#
# All Cypher parameters use the `$name` form, as in the query cells above; the
# legacy `{name}` placeholder is not accepted by Neo4j 4.x.

MERGE_CLIENT_QUERY = "MERGE (client:Client {dedupestatic:$dedupestatic})"

MERGE_FRANCHISE_QUERY = """
MERGE (franchise:Franchise {franchisename:$franchisename, companyname:$companyname})
"""

# Descriptive company attributes are written only when the node is first created.
MERGE_COMPANY_QUERY = """
MERGE (company:Company {companyname: $companyname, companyindex: $companyindex})
ON CREATE SET company.channel=$channel,
    company.class_id=$class_id,
    company.discretionary=$discretionary,
    company.division_id=$division_id,
    company.group_id=$group_id,
    company.subclass_id=$subclass_id
"""

# First swipe for a client/franchise pair starts the lists; later swipes append.
MERGE_TRANSACTION_QUERY = """
MATCH (client:Client {dedupestatic:$dedupestatic}),(franchise:Franchise { franchisename: $franchisename })
MERGE (client)-[t:transacted]->(franchise)
ON CREATE SET t.amount=[$amount], t.date=[$date]
ON MATCH SET t.amount=t.amount+$amount, t.date=t.date+$date
"""

MERGE_OWNER_QUERY = """
MATCH (franchise:Franchise {franchisename:$franchisename}),(company:Company { companyname: $companyname })
MERGE (franchise)-[t:owned]->(company)
"""

# Group the frame once instead of re-filtering all of `df` for every client.
swipes_by_client = df.groupby("Dedupegroup", sort=False)

# `gf` keeps its name because later scratch cells read the last group from it.
for group_position, (dedupe, gf) in enumerate(swipes_by_client):
    print('Doing dedupestatic: ', dedupe, ' at index: ', group_position, ' of ', swipes_by_client.ngroups)
    # The last two columns are unpacked as `amount` and `date`, the names that
    # transaction_param_dict reads below.
    for (dedupestatic, companyname, franchisename, companyindex, class_id,
         discretionary, division_id, group_id, subclass_id, channel,
         amount, date) in zip(gf.Dedupegroup, gf.companyname, gf.franchisename,
                              gf.companyindex, gf.class_id, gf.discretionary,
                              gf.division_id, gf.group_id, gf.subclass_id,
                              gf.channel, gf.TransactionAmount, gf.TransactionDate):

        client_param_dict={"dedupestatic":dedupestatic}
        company_param_dict={"companyname":companyname,
                  "franchisename":franchisename,
                  "companyindex":companyindex,
                  "class_id":class_id,
                  "discretionary":discretionary,
                  "division_id":division_id,
                  "group_id":group_id,
                  "subclass_id":subclass_id,
                  "channel":channel}
        franchise_param_dict={
              "franchisename":franchisename,"companyname":companyname,}
        transaction_param_dict={"dedupestatic":dedupestatic,
              "franchisename":franchisename,
              "amount":amount,
              "date":date}
        owner_param_dict={"companyname":companyname,
              "franchisename":franchisename}

        graph.run(MERGE_CLIENT_QUERY, client_param_dict)
        graph.run(MERGE_FRANCHISE_QUERY, franchise_param_dict)
        graph.run(MERGE_COMPANY_QUERY, company_param_dict)
        graph.run(MERGE_TRANSACTION_QUERY, transaction_param_dict)
        graph.run(MERGE_OWNER_QUERY, owner_param_dict)

In [None]:
# Scratch values for stepping through the load one statement at a time:
# the first client id plus the first row's fields from `gf`.
# NOTE(review): `gf` is whatever group the loader loop processed last, which is
# not necessarily the group of dedupegroup_list[0] — confirm the two match.
dedupestatic=dedupegroup_list[0]
companyname=list(gf.companyname)[0]
franchisename=list(gf.franchisename)[0]
companyindex=list(gf.companyindex)[0]
class_id=list(gf.class_id)[0]
discretionary=list(gf.discretionary)[0]
division_id=list(gf.division_id)[0]
group_id=list(gf.group_id)[0]
subclass_id=list(gf.subclass_id)[0]
channel=list(gf.channel)[0]
amount=list(gf.TransactionAmount)[0]
date=list(gf.TransactionDate)[0]
# Commented-out duplicates of the assignments above.
# companyindex=list(gf.companyindex)[0]
# class_id=list(gf.class_id)[0]
# discretionary=list(gf.discretionary)[0]
# division_id=list(gf.division_id)[0]
# group_id=list(gf.group_id)[0]
# subclass_id=list(gf.subclass_id)[0]
# channel=list(gf.channel)[0]

In [None]:
# Parameters for the Client MERGE statement.
client_param_dict = dict(dedupestatic=dedupestatic)

In [None]:
# Parameters for the Company MERGE statement (identity plus descriptive fields).
company_param_dict = dict(
    companyname=companyname,
    franchisename=franchisename,
    companyindex=companyindex,
    class_id=class_id,
    discretionary=discretionary,
    division_id=division_id,
    group_id=group_id,
    subclass_id=subclass_id,
    channel=channel,
)

In [None]:
# Parameters for the Franchise MERGE statement.
franchise_param_dict = dict(franchisename=franchisename, companyname=companyname)

In [None]:
# Parameters for the client->franchise `transacted` MERGE statement
# (companyindex is not sent).
transaction_param_dict = dict(
    dedupestatic=dedupestatic,
    franchisename=franchisename,
    amount=amount,
    date=date,
)

In [None]:
# Parameters for the franchise->company `owned` MERGE statement.
owner_param_dict = dict(companyname=companyname, franchisename=franchisename)

In [None]:
# MERGE the Client node for the scratch `dedupestatic` value.
# The parameter uses the `$name` form, as in the query cells above; the legacy
# `{name}` placeholder is not accepted by Neo4j 4.x.
graph.run("MERGE (client:Client {dedupestatic:$dedupestatic})", client_param_dict)

In [None]:
# MERGE the Franchise node identified by (franchisename, companyname).
# Parameters use the `$name` form, as in the query cells above; the legacy
# `{name}` placeholder is not accepted by Neo4j 4.x.
graph.run("""
MERGE (franchise:Franchise
{
franchisename:$franchisename,
companyname:$companyname})
""", franchise_param_dict)

In [None]:
# MERGE the Company node identified by (companyname, companyindex); the
# descriptive attributes are written only when the node is first created.
# Parameters use the `$name` form, as in the query cells above; the legacy
# `{name}` placeholder is not accepted by Neo4j 4.x. company.channel is set
# once (a second, identical assignment was redundant).
graph.run("""
MERGE (company:Company
{
companyname: $companyname,
companyindex: $companyindex}) ON CREATE SET company.channel=$channel,
company.class_id=$class_id,
company.discretionary=$discretionary,
company.division_id=$division_id,
company.group_id=$group_id,
company.subclass_id=$subclass_id
""", company_param_dict)

In [None]:
# MERGE the client->franchise `transacted` relationship. The first swipe for a
# pair starts the amount/date lists; later swipes append to them, so re-running
# this cell appends the same swipe again.
# Parameters use the `$name` form, as in the query cells above; the legacy
# `{name}` placeholder is not accepted by Neo4j 4.x.
graph.run("""MATCH (client:Client {dedupestatic:$dedupestatic}),(franchise:Franchise { franchisename: $franchisename })
MERGE (client)-[t:transacted]->(franchise)
ON CREATE SET t.amount=[$amount], t.date=[$date]
ON MATCH SET t.amount=t.amount+$amount, t.date=t.date+$date""", transaction_param_dict)


In [None]:
# MERGE the franchise->company `owned` relationship between existing nodes.
# Parameters use the `$name` form, as in the query cells above; the legacy
# `{name}` placeholder is not accepted by Neo4j 4.x.
graph.run("""MATCH (franchise:Franchise {franchisename:$franchisename}),(company:Company { companyname: $companyname })
MERGE (franchise)-[t:owned]->(company)""", owner_param_dict)