In [1]:
import random

import numpy as np
import pandas as pd

from nltk.corpus import words

In [2]:
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

In [3]:
# Toy graph: seven named nodes, three numeric node attributes, and edges
# running from sources[i] to targets[i] with a numeric "distance" value.
nodes = [
    "Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"
]
age = [25, 9, 70, 42, 26, 35, 36]
height = [180, 122, 173, 194, 172, 156, 177]
weight = [75, 43, 68, 82, 70, 59, 81]
sources = [
    "Alice", "Alice", "Bob", "Bob", "Bob", "Eric", "Anna", "Anna", "Matt"
]
targets = [
    "Bob", "Eric", "Eric", "John", "Anna", "Anna", "Laura", "John", "John"
]
weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
edges = list(zip(sources, targets))
frame = PandasPGFrame(nodes=nodes, edges=edges)

# Add properties

# Register each numeric node property in the same order as before
# (age, height, weight); "@id" lists the nodes the values belong to.
for prop_name, prop_values in (
        ("age", age), ("height", height), ("weight", weight)):
    frame.add_node_properties(
        {
            "@id": nodes,
            prop_name: prop_values
        }, prop_type="numeric")

# Edge property table: one row per (source, target) pair.
edge_weight = pd.DataFrame({
    "@source_id": sources,
    "@target_id": targets,
    "distance": weights
})
frame.add_edge_properties(edge_weight, prop_type="numeric")

In [4]:
props = ["age", "height", "weight"]

In [5]:
# Work on an explicit copy: a column is added to `df` further down, and that
# write should never reach (or warn about) the frame's private `_nodes` table.
df = frame._nodes[props].copy()

In [6]:
df

Unnamed: 0_level_0,age,height,weight
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,25,180,75
Bob,9,122,43
Eric,70,173,68
John,42,194,82
Anna,26,172,70
Laura,35,156,59
Matt,36,177,81


In [7]:
df.to_numpy().tolist()

[[25, 180, 75],
 [9, 122, 43],
 [70, 173, 68],
 [42, 194, 82],
 [26, 172, 70],
 [35, 156, 59],
 [36, 177, 81]]

In [8]:
df["_generated_features"] = df.to_numpy().tolist()

In [9]:
df

Unnamed: 0_level_0,age,height,weight,_generated_features
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alice,25,180,75,"[25, 180, 75]"
Bob,9,122,43,"[9, 122, 43]"
Eric,70,173,68,"[70, 173, 68]"
John,42,194,82,"[42, 194, 82]"
Anna,26,172,70,"[26, 172, 70]"
Laura,35,156,59,"[35, 156, 59]"
Matt,36,177,81,"[36, 177, 81]"


In [10]:
def generate_targets(nodes, s, density=0.2):
    """Randomly choose edges leaving node ``s``.

    Each ``t`` in ``nodes`` with ``s < t`` is visited in iteration order and
    the pair is kept with probability ``density`` (one ``np.random.choice``
    draw per candidate).

    Returns a list of ``[s, t]`` pairs, possibly empty.
    """
    keep_probabilities = [1 - density, density]
    return [
        [s, t]
        for t in nodes
        # Short-circuit keeps the random draw limited to candidates with s < t.
        if s < t and np.random.choice([0, 1], p=keep_probabilities)
    ]


def random_pgframe(n_nodes, density):
    """Create a ``PandasPGFrame`` with random edges.

    Nodes are the integers ``0 .. n_nodes - 1``. For every node,
    ``generate_targets`` is asked for its outgoing edges with the given
    ``density``; the collected pairs become the frame's edges.

    Parameters
    ----------
    n_nodes : int
        Number of nodes to create.
    density : float
        Forwarded to ``generate_targets`` as the edge probability.

    Returns
    -------
    PandasPGFrame
        Frame built from the nodes and the sampled edge index.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass. The previous
    # `sum(lists, [])` copied the accumulated list on every addition.
    edges = [
        edge
        for source in nodes
        for edge in generate_targets(nodes, source, density)
    ]
    edges = pd.DataFrame(
        edges, columns=["@source_id", "@target_id"])
    # The frame is given the (source, target) index rather than the table.
    edges_df = edges.set_index(["@source_id", "@target_id"])
    frame = PandasPGFrame(nodes=nodes, edges=edges_df.index)
    return frame

In [11]:
# Number of nodes in the random graph (node ids are 0 .. N - 1).
N = 70
# Passed to `random_pgframe`; used as the probability of keeping each candidate edge.
density = 0.13

# Generate a random graph

In [12]:
graph_frame = random_pgframe(N, density)

In [13]:
graph_frame._nodes.reset_index().to_dict("records")

[{'@id': 0},
 {'@id': 1},
 {'@id': 2},
 {'@id': 3},
 {'@id': 4},
 {'@id': 5},
 {'@id': 6},
 {'@id': 7},
 {'@id': 8},
 {'@id': 9},
 {'@id': 10},
 {'@id': 11},
 {'@id': 12},
 {'@id': 13},
 {'@id': 14},
 {'@id': 15},
 {'@id': 16},
 {'@id': 17},
 {'@id': 18},
 {'@id': 19},
 {'@id': 20},
 {'@id': 21},
 {'@id': 22},
 {'@id': 23},
 {'@id': 24},
 {'@id': 25},
 {'@id': 26},
 {'@id': 27},
 {'@id': 28},
 {'@id': 29},
 {'@id': 30},
 {'@id': 31},
 {'@id': 32},
 {'@id': 33},
 {'@id': 34},
 {'@id': 35},
 {'@id': 36},
 {'@id': 37},
 {'@id': 38},
 {'@id': 39},
 {'@id': 40},
 {'@id': 41},
 {'@id': 42},
 {'@id': 43},
 {'@id': 44},
 {'@id': 45},
 {'@id': 46},
 {'@id': 47},
 {'@id': 48},
 {'@id': 49},
 {'@id': 50},
 {'@id': 51},
 {'@id': 52},
 {'@id': 53},
 {'@id': 54},
 {'@id': 55},
 {'@id': 56},
 {'@id': 57},
 {'@id': 58},
 {'@id': 59},
 {'@id': 60},
 {'@id': 61},
 {'@id': 62},
 {'@id': 63},
 {'@id': 64},
 {'@id': 65},
 {'@id': 66},
 {'@id': 67},
 {'@id': 68},
 {'@id': 69}]

In [14]:
graph_frame._edges

@source_id,@target_id
0,10
0,23
0,28
0,38
0,42
...,...
60,66
60,68
61,66
62,67


# Add node and edge types

In [15]:
types = ["Apple", "Orange", "Carrot"]

In [16]:
# Draw one type per node id; the three entries of `types` are picked with
# probabilities 0.5, 0.4 and 0.1 respectively.
node_types = dict(
    (node_id, np.random.choice(types, p=[0.5, 0.4, 0.1]))
    for node_id in range(N)
)

In [17]:
graph_frame.add_node_types(node_types)

In [18]:
graph_frame._nodes

Unnamed: 0_level_0,@type
@id,Unnamed: 1_level_1
0,Apple
1,Apple
2,Orange
3,Orange
4,Apple
...,...
65,Apple
66,Apple
67,Orange
68,Apple


In [19]:
types = ["isFriend", "isEnemy"]

In [20]:
# Draw one type per edge; the two entries of `types` are picked with
# probabilities 0.8 and 0.2 respectively.
edge_types = dict(
    (edge, np.random.choice(types, p=[0.8, 0.2]))
    for edge in graph_frame.edges()
)

In [21]:
graph_frame.add_edge_types(edge_types)

In [22]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type
@source_id,@target_id,Unnamed: 2_level_1
0,10,isFriend
0,23,isFriend
0,28,isFriend
0,38,isEnemy
0,42,isFriend
...,...,...
60,66,isFriend
60,68,isFriend
61,66,isFriend
62,67,isFriend


# Add node and edge properties

Three kinds of properties are added below: numerical, categorical, and text.

## Add node properties

In [23]:
# Numeric node property: one draw from a normal distribution
# (loc=35, scale=5) for every node in the graph.
weight = pd.DataFrame(
    data=[
        (node_id, np.random.normal(loc=35, scale=5))
        for node_id in graph_frame.nodes()
    ],
    columns=["@id", "weight"],
)

In [24]:
graph_frame.add_node_properties(weight, prop_type="numeric")

In [25]:
colors = ["red", "green", "blue"]

In [26]:
# Categorical node property: a uniformly chosen colour for every node.
# NOTE(review): this rebinds `colors` from the list of choices to the resulting
# DataFrame, so re-running this cell alone will most likely fail until the
# list is defined again.
colors = pd.DataFrame(
    data=[
        (node_id, np.random.choice(colors))
        for node_id in graph_frame.nodes()
    ],
    columns=["@id", "color"],
)

In [27]:
graph_frame.add_node_properties(colors, prop_type="category")

In [28]:
# Text node property: 20 random words per node, joined with spaces.
# Fetch the word list once instead of calling `words.words()` for every node.
node_vocabulary = words.words()
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(node_vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

In [29]:
graph_frame.add_node_properties(desc, prop_type="text")

In [30]:
graph_frame._nodes

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Apple,29.159788,green,monogeneity oversaturation muzzlewood sheepher...
1,Apple,30.786967,blue,chorizontic heaterman uniramous qualifiable ri...
2,Orange,28.538049,green,noint bimasty harmoniacal sowbacked faithbreak...
3,Orange,47.050694,green,prefixed wanderable Gaboon outwater rhinocelia...
4,Apple,34.240315,green,root Sac retiary outpromise plazolite parakine...
...,...,...,...,...
65,Apple,37.834447,green,avanious pyrenoid undrainable alcoholometry vo...
66,Apple,30.806040,green,frill receivables Jewess spoonyism wetherteg H...
67,Orange,36.127293,red,desponder dentist nonrespectable chort antimin...
68,Apple,25.324251,blue,fasces tacket aristocraticness orthoxylene Vej...


## Add edge properties

In [31]:
# Numeric edge property: a random integer from `np.random.randint(0, 20)`
# for every (source, target) pair.
years = pd.DataFrame(
    data=[
        (source, target, np.random.randint(0, 20))
        for source, target in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "n_years"],
)

In [32]:
graph_frame.add_edge_properties(years, prop_type="numeric")

In [33]:
# Categorical edge property: a uniformly chosen line shape for every edge.
# The choices get their own name so the resulting table does not shadow them.
shape_choices = ["dashed", "dotted", "solid"]
shapes = pd.DataFrame(
    data=[
        (source, target, np.random.choice(shape_choices))
        for source, target in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"],
)

In [34]:
graph_frame.add_edge_properties(shapes, prop_type="category")

In [35]:
# Text edge property: 20 random words per edge, joined with spaces.
# Fetch the word list once instead of calling `words.words()` for every edge.
edge_vocabulary = words.words()
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(edge_vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

In [36]:
graph_frame.add_edge_properties(desc, prop_type="text")

In [37]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,n_years,shapes,desc
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,10,isFriend,13,solid,ramental sennite scarping emprise canonics che...
0,23,isFriend,15,dashed,cerebralism florence overelaboration xanthatio...
0,28,isFriend,16,dashed,crystallographer eolithic clock timesaver orbi...
0,38,isEnemy,19,dashed,unequated outscream hacky saddeningly quinquag...
0,42,isFriend,13,solid,diabolepsy macropodine offlet suspensoid unhar...
...,...,...,...,...,...
60,66,isFriend,13,dashed,grayish Bennet peridermal insoluble predecepti...
60,68,isFriend,12,dotted,Moldavian amargoso pyruvil unsanguineness unth...
61,66,isFriend,3,dotted,purview trick jingo admirable kerchunk Kadaga ...
62,67,isFriend,0,solid,hematostibiite cordicole Scotchman predefault ...


In [38]:
graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

# Property encoding

In [39]:
## Run this if nltk raises an error because the 'words' corpus is missing.
## The corpus is required by the `words.words()` calls that generate the "desc" properties.
# import nltk
# nltk.download('words')

In [52]:
# Encoder for the node and edge properties added above, created with
# heterogeneous=False and the "tfidf" text encoding option.
hom_encoder = ScikitLearnPGEncoder(
    node_properties=["weight", "color", "desc"],
    edge_properties=["n_years", "shapes", "desc"],
    edge_features=True,
    heterogeneous=False,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
)

In [53]:
transformed_frame = hom_encoder.fit_transform(graph_frame)

In [54]:
transformed_frame._nodes.sample(5)

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
3,"[2.500322350517759, 0.0, 1.0, 0.0, 0.0, 0.0, 0..."
68,"[-1.9383550713359565, 1.0, 0.0, 0.0, 0.0, 1.0,..."
37,"[0.5705943870669569, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
54,"[0.15240571930824656, 0.0, 1.0, 0.0, 0.0, 0.0,..."
7,"[0.1819241257342659, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [55]:
hom_encoder._node_encoders

{'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

In [56]:
transformed_frame._edges.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,features
@source_id,@target_id,Unnamed: 2_level_1
30,57,"[-0.855034373506005, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
16,34,"[-1.0290583827324948, 0.0, 0.0, 1.0, 0.0, 0.0,..."
31,65,"[0.5371577003059147, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
18,69,"[1.5813017556648543, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
29,63,"[-1.7251544196384547, 1.0, 0.0, 0.0, 0.0, 0.0,..."


In [57]:
hom_encoder._edge_encoders

{'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

## Convert PGFrames to JSON-LD

In [58]:
jsonld_repr = graph_frame.to_jsonld(edges_key="outEdges")

In [59]:
jsonld_repr

[{'@id': '0',
  '@type': ['Apple'],
  'weight': 29.159788374579446,
  'color': 'green',
  'desc': 'monogeneity oversaturation muzzlewood sheepherder topline Leptotrichia eyre refugeeship Phrynosoma julid dermatozoon bot prebelief nonlepidopterous Taunton dacker ungrantable roadstead ayont escutellate',
  'outEdges': [{'n_years': 13,
    'shapes': 'solid',
    'desc': 'ramental sennite scarping emprise canonics chemoreceptor nondomesticated vindictively dispensatorily Tricholoma improvidence monogenistic affirmable cornel hypnoanalysis Spirochaetales thoroughwax milkweed degorge countercause',
    'isFriend': {'@id': '10'}},
   {'n_years': 15,
    'shapes': 'dashed',
    'desc': 'cerebralism florence overelaboration xanthation Platycarpus archostenosis aim voiding trochlear diffuse zoocyst podial supper mudir evocative plumbaginaceous undefaceable Winfred nonunderstanding Phyllophaga',
    'isFriend': {'@id': '23'}},
   {'n_years': 16,
    'shapes': 'dashed',
    'desc': 'crystallograph

In [60]:
new_frame = PandasPGFrame()
new_frame.from_jsonld(jsonld_repr, types_from_relations=False)

In [61]:
new_frame._nodes.sample(5)

Unnamed: 0_level_0,@type,color,desc,weight
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,Orange,red,desponder dentist nonrespectable chort antimin...,36.127293
31,Apple,red,adroit universality subofficial heckimal allur...,34.634759
21,Apple,green,unimitating valiship drabbler subsultus acclim...,37.47807
42,Apple,green,aosmic chevronwise ambassadress colima covin i...,35.888129
14,Orange,blue,actinic theatrician uncynically subordination ...,33.620625


In [62]:
new_frame._edges.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,desc,n_years,shapes
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19,59,isFriend,sweetwort defunct Mantzu myocardiograph dually...,4.0,dashed
39,49,isFriend,unhoaxed extemporalness Upupidae interglacial ...,0.0,dashed
16,33,isFriend,tiswin meltingness Lemna prosodist expectorato...,15.0,dashed
5,41,isFriend,chelicere assessor arterioplasty heteroeciousn...,15.0,dotted
2,14,isFriend,attemperator picture myelencephalous conduplic...,5.0,solid
