In [14]:
url = "bolt://localhost:7687"
user = "neo4j"
password = "0000" 

In [15]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver(url, auth=(user, password))
neo4j = driver.session()

In [16]:
import pandas as pd
result = neo4j.run('MATCH (c1:Customer)-[:PERFORMS]->(t1:Transaction)-[:WITH]->(m1:Merchant) Where t1.fraud="1" RETURN c1.id')
df = pd.DataFrame(result.data())
print(df)

            c1.id
0      C626305923
1      C617723960
2     C1350963410
3     C1667161948
4      C218446443
...           ...
7195  C1350963410
7196   C218446443
7197  C1918088317
7198  C1506293914
7199  C2118328869

[7200 rows x 1 columns]


In [7]:
result = neo4j.run("""
MATCH (c1:Customer)-[:PERFORMS]->(t1:Transaction)-[:WITH]->(m1:Merchant)
WITH c1, m1
MERGE (p1:Placeholder {id: m1.id})
""")
print(result.data())

[]


In [8]:
result = neo4j.run("""
MATCH (c1:Customer)-[:PERFORMS]->(t1:Transaction)-[:WITH]->(m1:Merchant)
WITH c1, m1, count(*) as cnt
MERGE (p2:Placeholder {id:c1.id})
""")
print(result.data())

[]


In [9]:
result = neo4j.run("""
MATCH (c1:Customer)-[:PERFORMS]->(t1:Transaction)-[:WITH]->(m1:Merchant)
WITH c1, m1, count(*) as cnt
MATCH (p1:Placeholder {id:m1.id})
WITH c1, m1, p1, cnt
MATCH (p2:Placeholder {id: c1.id})
WITH c1, m1, p1, p2, cnt
CREATE (p2)-[:PAYS {cnt: cnt}]->(p1)
""")
print(result.data())

[]


In [10]:
result = neo4j.run("""
MATCH (c1:Customer)-[:PERFORMS]->(t1:Transaction)-[:WITH]->(m1:Merchant)
WITH c1, m1, count(*) as cnt
MATCH (p1:Placeholder {id:c1.id})
WITH c1, m1, p1, cnt
MATCH (p2:Placeholder {id: m1.id})
WITH c1, m1, p1, p2, cnt
CREATE (p1)-[:PAYS {cnt: cnt}]->(p2)
""")
print(result.data())

[]


In [35]:
# Run the Cypher query
result = neo4j.run("""
CALL gds.graph.create.cypher.estimate(
    'MATCH (p) WHERE p:Placeholder RETURN id(p) as id',
    'MATCH (p)-[i:PAYS]->(p1:Placeholder) RETURN id(p) AS source, id(p1) AS target')
""")

# Print the results
row = result.single()
print("Estimated:", row['nodeCount'], "nodes,", row['relationshipCount'], "relationships,", row['requiredMemory']," memory required.")

Estimated: 4162 nodes, 94263 relationships, 1417 KiB  memory required.


In [37]:
import pprint 

# This query drops the projected graph if it already exists, else it returns 'None'.
result = neo4j.run("""
CALL gds.graph.exists($name) YIELD exists
WHERE exists
CALL gds.graph.drop($name) YIELD graphName
RETURN graphName + " was dropped." as message
""", name = 'pageRank')

# Print the results
pprint.pprint(result.data())

[{'message': 'pageRank was dropped.'}]


In [38]:

result = neo4j.run("""
CALL gds.graph.create.cypher(
    'pageRank',
    'MATCH (p) WHERE p:Placeholder RETURN id(p) as id',
    'MATCH (p)-[h:PAYS]->(p1:Placeholder) RETURN id(p) AS source, h.cnt as weight, id(p1) AS target')
""")

# Print the results
row = result.single()
print(row['graphName'],"-", row['nodeCount'], "nodes,", row['relationshipCount'], "relationships,", row['createMillis']," ms to create the projection.")


pageRank - 4162 nodes, 94264 relationships, 225  ms to create the projection.


In [39]:
result = neo4j.run("""
CALL gds.pageRank.stream.estimate('pageRank',  { relationshipWeightProperty: 'weight' })
""")

print(result.single()['requiredMemory'], ' memory required to run the algorithm.')

131 KiB  memory required to run the algorithm.


In [40]:
result = neo4j.run("""
CALL gds.pageRank.stream('pageRank', { relationshipWeightProperty: 'weight'}) 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).id as id, score 
ORDER BY score DESC
""")

df = pd.DataFrame(result.data())
print(df)

               id       score
0     M1823072687  241.650932
1      M348934600  183.568446
2       M85975013   21.399732
3      M480139044    8.407023
4      M151143676    6.262718
...           ...         ...
4157   C607062277    0.150000
4158  C1133451243    0.150000
4159   C446179646    0.150000
4160   C114981535    0.150000
4161  C1069635670    0.150000

[4162 rows x 2 columns]


In [41]:
result = neo4j.run("CALL gds.pageRank.write('pageRank', { writeProperty: 'pageRank', relationshipWeightProperty: 'weight' })")

pprint.pprint(result.data())

[{'centralityDistribution': {'max': 241.65136623382568,
                             'mean': 0.27596811123160087,
                             'min': 0.14999961853027344,
                             'p1': 0.14999961853027344,
                             'p10': 0.14999961853027344,
                             'p100': 241.65136623382568,
                             'p25': 0.14999961853027344,
                             'p5': 0.14999961853027344,
                             'p50': 0.14999961853027344,
                             'p75': 0.14999961853027344,
                             'p90': 0.14999961853027344,
                             'p95': 0.14999961853027344,
                             'p99': 0.3773164749145508,
                             'stdDev': 4.718309196988444},
  'computeMillis': 314,
  'configuration': {'cacheWeights': False,
                    'concurrency': 4,
                    'dampingFactor': 0.85,
                    'maxIterations': 20,
              

In [43]:
result = neo4j.run("""MATCH (p:Placeholder)
RETURN p.id AS id, p.pageRank as pagerank
""")

df = pd.DataFrame(result.data())
print(df)

               id    pagerank
0      M348934600  183.568446
1     M1823072687  241.650932
2       M50039827    3.020985
3     M1888755466    1.753567
4     M1053599405    5.738430
...           ...         ...
4157  C2060410910    0.150000
4158  C1657671280    0.150000
4159  C1743702978    0.150000
4160   C849065220    0.150000
4161  C1562081159    0.150000

[4162 rows x 2 columns]


In [46]:
import numpy as np
import pandas as pd
from py2neo import Graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [47]:
def load_pagerank(record):
    return records[record.split("'")[1]]['pagerank']

In [48]:
bank_df = pd.read_csv("bs140513_032310.csv")

In [50]:
labels = bank_df['fraud']

In [60]:
result = neo4j.run("""
MATCH (p:Placeholder)
RETURN p.id AS id, p.pageRank as pagerank
""")

df= pd.DataFrame(result.data())
print(df)

               id    pagerank
0      M348934600  183.568446
1     M1823072687  241.650932
2       M50039827    3.020985
3     M1888755466    1.753567
4     M1053599405    5.738430
...           ...         ...
4157  C2060410910    0.150000
4158  C1657671280    0.150000
4159  C1743702978    0.150000
4160   C849065220    0.150000
4161  C1562081159    0.150000

[4162 rows x 2 columns]


In [63]:
graph = Graph(password="0000")

# Query to fetch the network features from Neo4j
query = """
MATCH (p:Placeholder)
RETURN p.id AS id, p.pageRank as pagerank
"""

data = graph.run(query)
print(data)

 id          |           pagerank 
-------------|--------------------
 M348934600  | 183.56844635009764 
 M1823072687 | 241.65093231201172 
 M50039827   | 3.0209848880767822 



In [64]:
records = {}

for record in data:
    records[record['id']] = { 'pagerank': record['pagerank']}

# Merging the graph features with the banksim dataset

bank_df['merchant_pagerank'] = bank_df['merchant'].apply(load_pagerank)
bank_df['customer_pagerank'] = bank_df['customer'].apply(load_pagerank)


In [65]:
print(bank_df)

        step       customer  age gender zipcodeOri       merchant zipMerchant  \
0          0  'C1093826151'  '4'    'M'    '28007'   'M348934600'     '28007'   
1          0   'C352968107'  '2'    'M'    '28007'   'M348934600'     '28007'   
2          0  'C2054744914'  '4'    'F'    '28007'  'M1823072687'     '28007'   
3          0  'C1760612790'  '3'    'M'    '28007'   'M348934600'     '28007'   
4          0   'C757503768'  '5'    'M'    '28007'   'M348934600'     '28007'   
...      ...            ...  ...    ...        ...            ...         ...   
594638   179  'C1753498738'  '3'    'F'    '28007'  'M1823072687'     '28007'   
594639   179   'C650108285'  '4'    'F'    '28007'  'M1823072687'     '28007'   
594640   179   'C123623130'  '2'    'F'    '28007'   'M349281107'     '28007'   
594641   179  'C1499363341'  '5'    'M'    '28007'  'M1823072687'     '28007'   
594642   179   'C616528518'  '4'    'F'    '28007'  'M1823072687'     '28007'   

                   category

In [67]:
# Dropping the unnecessary columns including the age and gender attributes
feature_df = bank_df.drop(['step', 'age', 'gender', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'], axis=1)

In [68]:
# One hot encoding the categorical variables
feature_df = pd.get_dummies(feature_df, columns=['category', 'merchant'])

In [70]:
# Standardizing the features
standard_scaler = StandardScaler()
scaled_df = pd.DataFrame(standard_scaler.fit_transform(feature_df), columns = feature_df.columns)

scaled_df = scaled_df.values
labels = labels.values

In [71]:
k_fold = StratifiedKFold(n_splits=5, shuffle=False)

random_forest = RandomForestClassifier(max_depth=20, n_estimators=150)

In [73]:
print("\n\nBuilding Random Forest classifier with k=5 folds")
for train_index, test_index in k_fold.split(scaled_df, labels):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    clf = random_forest.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    print(classification_report(y_test, predictions))



Building Random Forest classifier with k=5 folds
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.77      0.75      0.76      1440

    accuracy                           0.99    118929
   macro avg       0.88      0.87      0.88    118929
weighted avg       0.99      0.99      0.99    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.85      0.75      0.79      1440

    accuracy                           1.00    118929
   macro avg       0.92      0.87      0.90    118929
weighted avg       1.00      1.00      1.00    118929

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117489
           1       0.88      0.74      0.81      1440

    accuracy                           1.00    118929
   macro avg       0.94      0.87      0.90    118929
weighted avg       1.00

In [111]:
# save the model to disk
import pickle
filename = 'finalized_model.pkl'
pickle.dump(random_forest, open(filename, 'wb'))

In [105]:
import sklearn.metrics as metrics
print(np.sqrt(metrics.mean_squared_error(y_test,predictions)))

0.06412278431087365
