In [1]:
#!pip install graphdatascience scikit-multilearn scikit-learn

In [2]:
import os
import warnings

import numpy as np
import pandas as pd

from graphdatascience import GraphDataScience

from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.problem_transform import LabelPowerset

from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Show up to 100 characters per DataFrame cell before truncating.
pd.set_option('display.max_colwidth', 100)


In [3]:
# Connection settings can be overridden through environment variables so that
# real credentials never have to be committed with the notebook. The fallback
# values are the original local-development defaults.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")

gds = GraphDataScience(host, auth=(user, password))

In [4]:
# Count Article nodes, grouped by whether they have at least one IN_LIST
# relationship (in either direction).
gds.run_cypher("""
MATCH (a:Article)
RETURN exists {(a)-[:IN_LIST]-()}, count(*) AS count
ORDER BY count DESC
""")

Unnamed: 0,exists {(a)-[:IN_LIST]-()},count
0,True,54931


In [5]:
# Project Article and List nodes plus their IN_LIST relationships as the graph
# "articles", loading the openaiEmbedding node property along with them.
G, metadata = gds.graph.project(
    "articles", ["Article", "List"], "IN_LIST", nodeProperties=["openaiEmbedding"]
)


Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [6]:
# Add SIMILAR relationships (with a `score` property) to the projected graph,
# using a topK of 2000.
gds.nodeSimilarity.mutate(
    G,
    topK=2000,
    mutateProperty="score",
    mutateRelationshipType="SIMILAR",
)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

preProcessingMillis                                                                                                         0
computeMillis                                                                                                           75350
mutateMillis                                                                                                             1860
postProcessingMillis                                                                                                       -1
nodesCompared                                                                                                           54931
relationshipsWritten                                                                                                 12574857
similarityDistribution    {'p1': 0.05263155698776245, 'max': 1.0000075697898865, 'p5': 0.08333343267440796, 'p90': 1.00000...
configuration             {'topK': 2000, 'similarityMetric': 'JACCARD', 'bottomK': 10, 'bottomN': 0, 'mutateRelationsh

In [7]:
# Compute 256-dimensional FastRP embeddings over Article nodes and SIMILAR
# relationships, and write them to the `fastrp` node property.
gds.fastRP.write(
    G, writeProperty='fastrp', embeddingDimension=256, relationshipTypes=['SIMILAR'], nodeLabels=['Article']
)

FastRP:   0%|          | 0/100 [00:00<?, ?%/s]

nodeCount                                                                                                              54931
nodePropertiesWritten                                                                                                  54931
preProcessingMillis                                                                                                        0
computeMillis                                                                                                           4158
writeMillis                                                                                                             3345
configuration            {'writeConcurrency': 4, 'nodeSelfInfluence': 0, 'propertyRatio': 0.0, 'concurrency': 4, 'jobId':...
Name: 0, dtype: object

In [8]:
# Stream weakly-connected-component ids for G; no node-label or
# relationship-type filter is passed, so the call is not restricted to Articles.
wcc = gds.wcc.stream(G)

In [9]:
# Count the nodes in each component and order components from largest to
# smallest. The final reset_index() keeps the pre-sort position as an `index` column.
wcc_grouped = (
    wcc.groupby('componentId')
    .size()
    .to_frame('componentSize')
    .reset_index()
    .sort_values('componentSize', ascending=False)
    .reset_index()
)
wcc_grouped

Unnamed: 0,index,componentId,componentSize
0,0,0,54800
1,7,149,368
2,4,124,193
3,23,374,134
4,16,288,117
...,...,...,...
599,493,25648,2
600,475,23246,2
601,490,25231,2
602,372,14429,2


In [10]:
# wcc_grouped is sorted by component size (descending), so its first row is the
# largest component.
largest_component = wcc_grouped['componentId'].iloc[0]
# Take the first member positionally: a boolean-filtered Series keeps its
# original row labels, so a label lookup of 0 is only valid when row 0 happens
# to belong to this component.
start_node = wcc.loc[wcc['componentId'] == largest_component, 'nodeId'].iloc[0]

In [11]:
# Sample a 20% training graph ("trainGraph") from G with random walk with
# restarts, starting at `start_node` and limited to Article nodes and SIMILAR
# relationships.
trainG, metadata = gds.alpha.graph.sample.rwr(
    'trainGraph',
    G,
    samplingRatio=0.20,
    startNodes=[int(start_node)],
    nodeLabels=['Article'],
    relationshipTypes=['SIMILAR'],
)

Random walk with restarts sampling:   0%|          | 0/100 [00:00<?, ?%/s]

In [12]:
%%time
# Train a GraphSAGE model named 'articleModel' on the sampled graph, using the
# openaiEmbedding node property as input features.
gds.beta.graphSage.train(
    trainG,
    modelName='articleModel',
    embeddingDimension=512,
    sampleSizes=[30, 30],
    searchDepth=20,
    epochs=20,
    learningRate=0.001,
    activationFunction='RELU',
    aggregator='MEAN',
    featureProperties=['openaiEmbedding'],
    batchSize=10,
)

CPU times: user 48.9 ms, sys: 144 ms, total: 193 ms
Wall time: 14min 42s


(GraphSageModel({'modelInfo': {0: {'modelName': 'articleModel', 'modelType': 'graphSage', 'metrics': {'ranIterationsPerEpoch': [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], 'iterationLossesPerEpoch': [[26.130593648069045, 25.276426092010478, 23.359088236977517, 21.772414373789637, 21.795054374826215, 19.65304353559649, 20.282220518387753, 19.162152604907142, 18.539002542795778, 17.72454613224475], [17.70111133496202, 20.242463267551305, 20.180167267050486, 20.443562048732986, 19.198587496620757, 17.839383445773166, 16.8257715377103, 17.31153454977032, 16.440906446381994, 16.994070894402128], [16.714556755413618, 18.675032342650134, 19.9226293849263, 18.436295006281348, 17.9300472767778, 17.71937942880774, 17.74026269675531, 16.064716275946225, 17.162037581107302, 16.220478575941595], [16.564025867613367, 17.987609060409497, 18.15908980653432, 18.477519673081538, 18.532859096543948, 18.084450720177323, 16.554426610206, 17.16501739934058, 16.0796631483

In [13]:
# Apply 'articleModel' to the Article nodes of the full graph (SIMILAR
# relationships only) and write the embeddings to the `graphSAGE` node property.
gds.beta.graphSage.write(
    G,
    modelName='articleModel',
    nodeLabels=['Article'],
    writeProperty='graphSAGE',
    relationshipTypes=['SIMILAR'],
)

GraphSage:   0%|          | 0/100 [00:00<?, ?%/s]

nodeCount                                                                                                              54931
nodePropertiesWritten                                                                                                  54931
preProcessingMillis                                                                                                        1
computeMillis                                                                                                         869540
writeMillis                                                                                                             4204
configuration            {'jobId': '31ae8c5b-3ca6-4644-b8af-b19b1eb44e35', 'modelName': 'articleModel', 'writeConcurrency...
Name: 0, dtype: object

In [14]:
# Release both in-memory projections now that the embeddings are written.
# Dropping the sampled graph as well frees its memory and lets the sampling
# cell be re-run without colliding with an existing 'trainGraph'.
# NOTE(review): the trained 'articleModel' is not removed here; drop it too if
# the notebook must be fully re-runnable — confirm the model-drop call for the
# installed client version.
trainG.drop()
G.drop()

graphName                                                                                                           articles
database                                                                                                               neo4j
memoryUsage                                                                                                                 
sizeInBytes                                                                                                               -1
nodeCount                                                                                                              59302
relationshipCount                                                                                                   12662148
configuration                                                                                                             {}
density                                                                                                             0.003601


In [15]:
# Add the :Target label to every Tag node that has more than 100 incoming
# relationships; :Target tags are the labels matched when building `data`.
gds.run_cypher("""
MATCH (t:Tag)
WHERE count{(t)<--()} > 100
SET t:Target
""")

In [16]:
# Fetch every article with at least one :Target tag: its URL, the collected tag
# names, and the three candidate feature vectors (fastrp, openaiEmbedding,
# graphSAGE) returned as the columns fastrp / openai / graphSAGE.
data = gds.run_cypher("""
MATCH (a:Article)-[:HAS_TAG]->(tag:Target)
RETURN a.url AS article, a.fastrp AS fastrp, collect(tag.name) AS tags, a.openaiEmbedding AS openai, a.graphSAGE AS graphSAGE
""")

In [17]:
# Binarize the tag lists with MultiLabelBinarizer and store each article's
# indicator row in data['target']. `mlb` is kept because later cells use
# mlb.classes_ and mlb.inverse_transform.
mlb = MultiLabelBinarizer()
tags_mlb = mlb.fit_transform(data['tags'])
data['target'] = list(tags_mlb)
data.head()

Unnamed: 0,article,fastrp,tags,openai,graphSAGE,target
0,https://medium.com/better-programming/this-python-library-can-animate-your-charts-a7c0a98b3463,"[0.08663628995418549, -0.06408265978097916, -0.06370709836483002, -0.015515219420194626, 0.23375...","[machine-learning, data-science, technology, artificial-intelligence, programming]","[-0.003536871401593089, 0.00747159868478775, 0.01575239561498165, -0.01385077927261591, 0.005306...","[-0.024850887401985787, -0.01390526928863974, -0.011590916944380048, -0.005315524494654697, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,https://medium.com/datadriveninvestor/list-of-chatgpt-prompts-for-nlp-practitioners-and-nlp-prod...,"[-0.007080343551933765, -0.00889026839286089, -0.18236935138702393, -0.14672335982322693, 0.3220...","[machine-learning, data-science, deep-learning, python, naturallanguageprocessing]","[-0.01077286060899496, -0.0017891748575493693, 0.0182870514690876, -0.013639666140079498, 0.0061...","[-0.0029908329879554793, -0.0012797971176591125, -0.00154999756568174, -0.0018629333323836132, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,https://medium.com/towards-artificial-intelligence/deepminds-clever-idea-to-master-asymmetric-ga...,"[-0.21350812911987305, 0.013562796637415886, 0.0765717625617981, -0.12203571945428848, 0.3002116...","[machine-learning, data-science, deep-learning, artificial-intelligence]","[-0.032979629933834076, -0.012398408725857735, 0.022878754884004593, -0.007013346068561077, 0.01...","[-0.011517131737703568, -0.00599713490088639, -0.00411311608871896, -0.006146829890950987, -0.00...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,https://medium.com/towards-data-science/must-read-papers-on-gans-b665bbae3317,"[0.25916963815689087, -0.01417395006865263, 0.09633032977581024, -0.13773560523986816, -0.014066...","[machine-learning, data-science, deep-learning, artificial-intelligence]","[-0.03457323834300041, 0.005550392437726259, 0.028737546876072884, -0.011742711067199707, 0.0239...","[-0.003663774303208057, -0.0017448916705572657, -0.0017310002027370829, -0.0018859679502443221, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,https://medium.com/towards-data-science/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66,"[0.15469515323638916, 0.06696459650993347, -0.08211394399404526, 0.04332730919122696, 0.17559620...","[machine-learning, deep-learning, pytorch]","[-0.02942381054162979, -0.0033131770323961973, 0.00732263782992959, -0.0019605879206210375, 0.01...","[-0.00885533563088848, -0.003836722124383314, -0.003924066660443985, -0.004346797047862697, 0.05...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [18]:
def get_macro_precision(classes, y_true, y_pred):
    """Return the unweighted mean of the per-class precision scores.

    Args:
        classes: Sequence of class labels. Only its length is used; column ``i``
            of ``y_true`` / ``y_pred`` is scored for each ``i`` in
            ``range(len(classes))``.
        y_true: 2-D indicator array, indexed as ``y_true[:, i]``.
        y_pred: Predicted indicator matrix. Either an object exposing
            ``toarray()`` (e.g. a sparse matrix) or anything ``np.asarray``
            accepts.

    Returns:
        The sum of ``precision_score`` over all class columns divided by
        ``len(classes)``.
    """
    # Densify once up front instead of once per class.
    y_pred_dense = y_pred.toarray() if hasattr(y_pred, "toarray") else np.asarray(y_pred)
    totalPrecision = 0
    for i in range(len(classes)):
        totalPrecision += precision_score(y_true[:, i], y_pred_dense[:, i])
    return totalPrecision / len(classes)

def get_weighted_precision(classes, y_true, y_pred):
    """Return the support-weighted mean of the per-class precision scores.

    Each class's precision is weighted by its support, i.e. the number of rows
    in ``y_true`` where that class equals 1.

    Args:
        classes: Sequence of class labels. Only its length is used; column ``i``
            of ``y_true`` / ``y_pred`` is scored for each ``i`` in
            ``range(len(classes))``.
        y_true: 2-D indicator array, indexed as ``y_true[:, i]``.
        y_pred: Predicted indicator matrix. Either an object exposing
            ``toarray()`` (e.g. a sparse matrix) or anything ``np.asarray``
            accepts.

    Returns:
        ``sum(precision_i * support_i) / sum(support_i)`` over all classes.
    """
    # Densify once up front instead of once per class.
    y_pred_dense = y_pred.toarray() if hasattr(y_pred, "toarray") else np.asarray(y_pred)
    totalPrecision = 0
    totalSupport = 0
    for i in range(len(classes)):
        p = precision_score(y_true[:, i], y_pred_dense[:, i])
        # Support is taken from the y_true argument so the function does not
        # depend on any variable outside its own parameters.
        support = (y_true[:, i] == 1).sum()
        totalSupport += support
        totalPrecision += p * support
    return totalPrecision / totalSupport

In [19]:
def train_and_evaluate(df, input_columns):
    """Train one LabelPowerset(LogisticRegression) model per feature column and keep the best.

    A single iterative train/test split (20% test) is made over all rows, so
    every feature column is trained and scored on the same partition. For each
    column the test accuracy, macro precision and weighted precision are
    printed. Class labels are read from the module-level ``mlb``.

    Args:
        df: DataFrame containing every column named in ``input_columns`` and a
            ``'target'`` column of label indicator vectors. Each feature cell is
            presumably a fixed-length numeric vector — verify against the
            query that builds the frame.
        input_columns: Names of the feature columns to compare.

    Returns:
        ``(best_classifier, best_input)``: the fitted classifier with the
        highest weighted precision and the name of the column it was trained
        on. If no model scores a weighted precision above 0 (or
        ``input_columns`` is empty) the result is ``(None, "")``.
    """
    max_weighted_precision = 0
    # Initialised so the return statement is valid even when no model wins.
    best_classifier = None
    best_input = ""

    # Split once, using the frame passed in rather than a notebook global.
    X = df[input_columns].values
    y = np.array(df['target'].to_list())
    x_train_all, y_train, x_test_all, y_test = iterative_train_test_split(X, y, test_size=0.2)

    # Train and score a model for each input option.
    for i, input_column in enumerate(input_columns):
        print(f"Training a model based on {input_column} column")
        # Each row holds one vector per input column; pick the i-th and stack.
        x_train = np.array([x[i] for x in x_train_all])
        x_test = np.array([x[i] for x in x_test_all])

        # train
        classifier = LabelPowerset(LogisticRegression())
        classifier.fit(x_train, y_train)

        # predict
        predictions = classifier.predict(x_test)
        print('Test accuracy is {}'.format(accuracy_score(y_test, predictions)))
        print("Macro Precision: {:.2f}".format(get_macro_precision(mlb.classes_, y_test, predictions)))
        weighted_precision = get_weighted_precision(mlb.classes_, y_test, predictions)
        print("Weighted Precision: {:.2f}".format(weighted_precision))

        if weighted_precision > max_weighted_precision:
            max_weighted_precision = weighted_precision
            best_classifier = classifier
            best_input = input_column

    return best_classifier, best_input

In [None]:
%%time
# train_and_evaluate returns (best_classifier, best_input); unpack both so that
# `classifier` is the fitted model itself rather than a tuple, and keep the
# name of the winning feature column in `best_input`.
classifier, best_input = train_and_evaluate(data, ['openai','fastrp','graphSAGE'])

Training a model based on openai column


In [None]:
example = gds.run_cypher("""
MATCH (a:Article)
WHERE NOT EXISTS {(a)-[:HAS_TAG]->()}
RETURN a.title AS title, a.fastrp AS embedding
LIMIT 15
""")

In [None]:
# Stack the example embeddings into a 2-D array, predict their label rows, and
# decode each predicted row back into a list of tag names.
example_features = np.array(example['embedding'].to_list())
tags_predicted = classifier.predict(example_features)
example['tags'] = [list(mlb.inverse_transform(row)[0]) for row in tags_predicted]
example[['title', 'tags']]