In [1]:
import random

import numpy as np
import pandas as pd

from nltk.corpus import words

In [2]:
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

In [3]:
# Small example graph: seven named nodes with three numeric attributes each,
# connected by nine edges that carry a numeric "distance" value.
nodes = [
    "Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"
]
age = [25, 9, 70, 42, 26, 35, 36]
height = [180, 122, 173, 194, 172, 156, 177]
weight = [75, 43, 68, 82, 70, 59, 81]
sources = [
    "Alice", "Alice", "Bob", "Bob", "Bob", "Eric", "Anna", "Anna", "Matt"
]
targets = [
    "Bob", "Eric", "Eric", "John", "Anna", "Anna", "Laura", "John", "John"
]
weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
# Edges are (source, target) pairs built position-wise from the two lists.
edges = list(zip(sources, targets))
frame = PandasPGFrame(nodes=nodes, edges=edges)

# Add properties

# Each numeric node attribute is registered through its own call, keyed by
# the node ids given under "@id".
frame.add_node_properties(
    {
        "@id": nodes,
        "age": age
    }, prop_type="numeric")
frame.add_node_properties(
    {
        "@id": nodes,
        "height": height
    }, prop_type="numeric")
frame.add_node_properties(
    {
        "@id": nodes,
        "weight": weight
    }, prop_type="numeric")

# Edge values are stored under the property name "distance"; rows are matched
# to edges through the "@source_id" / "@target_id" columns.
edge_weight = pd.DataFrame({
    "@source_id": sources,
    "@target_id": targets,
    "distance": weights
})
frame.add_edge_properties(edge_weight, prop_type="numeric")

In [4]:
props = ["age", "height", "weight"]

In [5]:
# Work on an independent copy: a later cell adds a column to `df`, and
# assigning into a column subset of `frame._nodes` can trigger pandas'
# SettingWithCopyWarning.
df = frame._nodes[props].copy()

In [6]:
df

Unnamed: 0_level_0,age,height,weight
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,25,180,75
Bob,9,122,43
Eric,70,173,68
John,42,194,82
Anna,26,172,70
Laura,35,156,59
Matt,36,177,81


In [7]:
df.to_numpy().tolist()

[[25, 180, 75],
 [9, 122, 43],
 [70, 173, 68],
 [42, 194, 82],
 [26, 172, 70],
 [35, 156, 59],
 [36, 177, 81]]

In [8]:
# Build the feature vectors from the original property columns only, so that
# re-running this cell does not fold the generated column back into itself.
df["_generated_features"] = df[props].to_numpy().tolist()

In [9]:
df

Unnamed: 0_level_0,age,height,weight,_generated_features
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alice,25,180,75,"[25, 180, 75]"
Bob,9,122,43,"[9, 122, 43]"
Eric,70,173,68,"[70, 173, 68]"
John,42,194,82,"[42, 194, 82]"
Anna,26,172,70,"[26, 172, 70]"
Laura,35,156,59,"[35, 156, 59]"
Matt,36,177,81,"[36, 177, 81]"


In [10]:
def generate_targets(nodes, s, density=0.2):
    """Randomly pick outgoing edges from ``s`` to larger nodes.

    Every ``t`` in ``nodes`` with ``s < t`` is a candidate target; each
    candidate is kept independently with probability ``density``.

    Parameters
    ----------
    nodes : iterable
        Candidate target nodes (must be comparable with ``s``).
    s : object
        Source node.
    density : float, optional
        Probability of creating an edge to each candidate.

    Returns
    -------
    list
        ``[s, t]`` pairs, in the order the targets appear in ``nodes``.
    """
    outcome_probs = [1 - density, density]
    picked = []
    for candidate in nodes:
        # Only nodes strictly greater than the source are considered.
        if not (s < candidate):
            continue
        # One draw per candidate: 1 keeps the edge, 0 drops it.
        if np.random.choice([0, 1], p=outcome_probs):
            picked.append([s, candidate])
    return picked


def random_pgframe(n_nodes, density):
    """Create a PandasPGFrame with integer nodes and randomly drawn edges.

    Nodes are the integers ``0 .. n_nodes - 1``. The edges of every source
    node are sampled with ``generate_targets``, so each pair ``(s, t)`` with
    ``s < t`` becomes an edge with probability ``density``.

    Parameters
    ----------
    n_nodes : int
        Number of nodes to create.
    density : float
        Probability passed to ``generate_targets`` for each candidate edge.

    Returns
    -------
    PandasPGFrame
        Frame built from the node list and the sampled edge index.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass. `sum(lists, [])`
    # re-copies the accumulated list on every addition, which is quadratic in
    # the number of edges; the draw order (source by source) is unchanged.
    edges = [
        edge
        for source in nodes
        for edge in generate_targets(nodes, source, density)
    ]
    edges = pd.DataFrame(
        edges, columns=["@source_id", "@target_id"])
    # Only the (source, target) MultiIndex is handed to the frame.
    edges_df = edges.set_index(["@source_id", "@target_id"])
    frame = PandasPGFrame(nodes=nodes, edges=edges_df.index)
    return frame

In [11]:
N = 70
density = 0.13

# Seed both random sources used by the following cells (numpy for the graph
# structure, types and numeric/categorical properties; `random` for the
# sampled text descriptions) so that Restart & Run All reproduces the same
# graph on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Generate a random graph

In [12]:
graph_frame = random_pgframe(N, density)

In [13]:
graph_frame._nodes.reset_index().to_dict("records")

[{'@id': 0},
 {'@id': 1},
 {'@id': 2},
 {'@id': 3},
 {'@id': 4},
 {'@id': 5},
 {'@id': 6},
 {'@id': 7},
 {'@id': 8},
 {'@id': 9},
 {'@id': 10},
 {'@id': 11},
 {'@id': 12},
 {'@id': 13},
 {'@id': 14},
 {'@id': 15},
 {'@id': 16},
 {'@id': 17},
 {'@id': 18},
 {'@id': 19},
 {'@id': 20},
 {'@id': 21},
 {'@id': 22},
 {'@id': 23},
 {'@id': 24},
 {'@id': 25},
 {'@id': 26},
 {'@id': 27},
 {'@id': 28},
 {'@id': 29},
 {'@id': 30},
 {'@id': 31},
 {'@id': 32},
 {'@id': 33},
 {'@id': 34},
 {'@id': 35},
 {'@id': 36},
 {'@id': 37},
 {'@id': 38},
 {'@id': 39},
 {'@id': 40},
 {'@id': 41},
 {'@id': 42},
 {'@id': 43},
 {'@id': 44},
 {'@id': 45},
 {'@id': 46},
 {'@id': 47},
 {'@id': 48},
 {'@id': 49},
 {'@id': 50},
 {'@id': 51},
 {'@id': 52},
 {'@id': 53},
 {'@id': 54},
 {'@id': 55},
 {'@id': 56},
 {'@id': 57},
 {'@id': 58},
 {'@id': 59},
 {'@id': 60},
 {'@id': 61},
 {'@id': 62},
 {'@id': 63},
 {'@id': 64},
 {'@id': 65},
 {'@id': 66},
 {'@id': 67},
 {'@id': 68},
 {'@id': 69}]

In [14]:
graph_frame._edges

@source_id,@target_id
0,7
0,8
0,33
0,38
0,42
...,...
63,66
63,67
65,66
65,68


# Add node and edge types

In [15]:
types = ["Apple", "Orange", "Carrot"]

In [16]:
# Draw a type for every node actually present in the frame, as the other
# property cells do, instead of assuming the node ids are exactly range(N).
# The three probabilities pair up position-wise with the entries of `types`.
node_types = {
    n: np.random.choice(types, p=[0.5, 0.4, 0.1])
    for n in graph_frame.nodes()
}

In [17]:
graph_frame.add_node_types(node_types)

In [18]:
graph_frame._nodes

Unnamed: 0_level_0,@type
@id,Unnamed: 1_level_1
0,Apple
1,Orange
2,Orange
3,Orange
4,Orange
...,...
65,Carrot
66,Orange
67,Carrot
68,Apple


In [19]:
# NOTE: this rebinds `types`, which earlier held the three node type names.
# Re-running the node-type cell after this one would therefore sample from
# these two edge type names with a three-element probability vector.
types = ["isFriend", "isEnemy"]

In [20]:
# Draw a type for each edge: the first entry of `types` with probability
# 0.8, the second with probability 0.2.
edge_types = dict(
    (e, np.random.choice(types, p=[0.8, 0.2]))
    for e in graph_frame.edges()
)

In [21]:
graph_frame.add_edge_types(edge_types)

In [22]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type
@source_id,@target_id,Unnamed: 2_level_1
0,7,isFriend
0,8,isFriend
0,33,isFriend
0,38,isFriend
0,42,isFriend
...,...,...
63,66,isFriend
63,67,isFriend
65,66,isFriend
65,68,isFriend


# Add node and edge properties

We add three kinds of properties to the graph: numerical, categorical and text.

## Add node properties

In [23]:
# One (node id, value) record per node; values are drawn from a normal
# distribution with mean 35 and standard deviation 5.
weight_records = [
    (n, np.random.normal(loc=35, scale=5)) for n in graph_frame.nodes()
]
weight = pd.DataFrame(weight_records, columns=["@id", "weight"])

In [24]:
graph_frame.add_node_properties(weight, prop_type="numeric")

In [25]:
colors = ["red", "green", "blue"]

In [26]:
# One (node id, color) record per node, each color picked uniformly from the
# `colors` list defined in the previous cell.
# NOTE: the assignment below rebinds `colors` from that list to a DataFrame,
# so the previous cell must be re-run before this one is executed again.
color_records = [
    (n, np.random.choice(colors)) for n in graph_frame.nodes()
]
colors = pd.DataFrame(color_records, columns=["@id", "color"])

In [27]:
graph_frame.add_node_properties(colors, prop_type="category")

In [28]:
# Load the corpus once: `words.words()` used to be re-evaluated for every
# node although the vocabulary does not change between iterations.
vocabulary = words.words()
# Each node gets a description made of 20 distinct randomly sampled words.
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

In [29]:
graph_frame.add_node_properties(desc, prop_type="text")

In [30]:
graph_frame._nodes

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Apple,36.769760,blue,Banyoro Ctenoplana overweening Peperomia charm...
1,Orange,33.154008,green,odontocele usefully quinotoxine gonfanon prier...
2,Orange,27.894133,blue,Vermetidae outhiss deglaze allopathist orthoph...
3,Orange,36.937222,blue,anticoagulin unsacramentally hayband hemimetab...
4,Orange,30.807936,green,petre paleographically melocoton degradational...
...,...,...,...,...
65,Carrot,35.690091,blue,perstringe subversionary puniness unclassablen...
66,Orange,36.811264,red,disilane sciapod vasofactive yowt evertebral o...
67,Carrot,34.780642,red,unreliableness Myrrhis stola tablinum curioman...
68,Apple,40.900634,green,fossilated overdure abietene melange sudden ov...


## Add edge properties

In [31]:
# One (source, target, n_years) record per edge; n_years is a random integer
# drawn with np.random.randint(0, 20), i.e. from 0 up to (not including) 20.
year_records = [
    (s, t, np.random.randint(0, 20)) for s, t in graph_frame.edges()
]
years = pd.DataFrame(
    year_records, columns=["@source_id", "@target_id", "n_years"])

In [32]:
graph_frame.add_edge_properties(years, prop_type="numeric")

In [33]:
# Keep the list of allowed values under its own name so that `shapes` only
# ever refers to the resulting DataFrame (it is passed to
# `add_edge_properties` in the next cell).
shape_options = ["dashed", "dotted", "solid"]
# One (source, target, shape) record per edge, shape picked uniformly.
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shape_options))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)

In [34]:
graph_frame.add_edge_properties(shapes, prop_type="category")

In [35]:
# Load the corpus once: `words.words()` used to be re-evaluated for every
# edge although the vocabulary does not change between iterations.
vocabulary = words.words()
# Each edge gets a description made of 20 distinct randomly sampled words.
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

In [36]:
graph_frame.add_edge_properties(desc, prop_type="text")

In [37]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,n_years,shapes,desc
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7,isFriend,12,dashed,impetrator trickstering impersonative banago b...
0,8,isFriend,17,dotted,homonymy vermix ross skelloch tarboggin nonpas...
0,33,isFriend,14,dashed,brachyceric faburden concupiscent Machetes sub...
0,38,isFriend,2,solid,Arctomys conduce worse bayal volutation throne...
0,42,isFriend,4,solid,yahoo blepharoblennorrhea upsheath diplotegia ...
...,...,...,...,...,...
63,66,isFriend,7,solid,elasticize Myaria nondetailed rediscussion zab...
63,67,isFriend,11,dashed,loxodrome bogwood ferryboat reductively unrele...
65,66,isFriend,0,dotted,colure butterbur pygmoid Polycarpon lipper sor...
65,68,isFriend,15,dotted,gagroot adenocarcinomatous tellurethyl phenyla...


In [38]:
graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

# Property encoding

In [39]:
## Run this if an nltk error occurs: the 'words' corpus must be downloaded before running the cells that call `words.words()`
# import nltk
# nltk.download('words')

In [40]:
# Encoder configured with heterogeneous=False over the three node properties
# and the three edge properties added above; text properties use the "tfidf"
# text-encoding option.
hom_encoder = ScikitLearnPGEncoder(
    node_properties=["weight", "color", "desc"],
    edge_properties=["n_years", "shapes", "desc"],
    heterogeneous=False,
    edge_features=True,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
)

In [41]:
transformed_frame = hom_encoder.fit_transform(graph_frame)

In [42]:
transformed_frame._nodes.sample(5)

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
40,"[-1.4178926691253575, 0.0, 1.0, 0.0, 0.0, 0.0,..."
31,"[-0.9167879385137265, 0.0, 1.0, 0.0, 0.0, 0.0,..."
0,"[0.41915372787745564, 1.0, 0.0, 0.0, 0.0, 0.0,..."
49,"[2.337829459413175, 0.0, 1.0, 0.0, 0.0, 0.0, 0..."
52,"[1.0661600994826879, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [43]:
hom_encoder._node_encoders

{'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

In [44]:
transformed_frame._edges.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,features
@source_id,@target_id,Unnamed: 2_level_1
34,67,"[1.522224573335882, 0.0, 1.0, 0.0, 0.0, 0.0, 0..."
65,66,"[-1.63415285078705, 0.0, 1.0, 0.0, 0.0, 0.0, 0..."
45,66,"[-0.4066727414059098, 0.0, 0.0, 1.0, 0.0, 0.0,..."
29,37,"[-1.4587985494468871, 0.0, 1.0, 0.0, 0.0, 0.0,..."
10,57,"[0.47009876529490463, 1.0, 0.0, 0.0, 0.0, 0.0,..."


In [45]:
hom_encoder._edge_encoders

{'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

## Convert PGFrames into JSON-LD and back

In [46]:
jsonld_repr = graph_frame.to_jsonld(edges_key="outEdges")

In [47]:
jsonld_repr

[{'@id': '0',
  '@type': ['Apple'],
  'weight': 36.76976024883883,
  'color': 'blue',
  'desc': 'Banyoro Ctenoplana overweening Peperomia charmwise crozzle thou intertransversalis kilting bicamerist antiroyalist falconer Amphrysian outplace resentingly megmho betutor unbountiful unhingement postcephalic',
  'outEdges': [{'n_years': 12,
    'shapes': 'dashed',
    'desc': 'impetrator trickstering impersonative banago bowker creatable stony disenchain jimbang outrhyme buzane tamandua pallometric caressively perceiver twilled normalist sunsquall patterer fictation',
    'isFriend': {'@id': '7'}},
   {'n_years': 17,
    'shapes': 'dotted',
    'desc': 'homonymy vermix ross skelloch tarboggin nonpasserine Muter aqueousness unfringed maliceproof bluffy adyton thieveless upsteam equatorially homogen catholicist molleton purchase tamein',
    'isFriend': {'@id': '8'}},
   {'n_years': 14,
    'shapes': 'dashed',
    'desc': 'brachyceric faburden concupiscent Machetes subaggregate coercement kor

In [48]:
# Start from an empty frame and populate it from the JSON-LD records held in
# `jsonld_repr`, so the loaded nodes and edges can be inspected below.
new_frame = PandasPGFrame()
new_frame.from_jsonld(jsonld_repr, types_from_relations=False)

In [49]:
new_frame._nodes.sample(5)

Unnamed: 0_level_0,@type,color,desc,weight
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25,Apple,red,unhidated skirwort overfroth Portor sempiterni...,39.658851
40,Orange,green,Arriet anaphrodisiac gamobium whatna dedition ...,27.866321
26,Orange,green,exostosed homochromatic Chlorioninae ichthyomo...,35.617184
69,Apple,green,cockneyland intensative Majorist chahar cinnam...,38.676122
42,Orange,green,congregator Acinetina Dolichosauria barometric...,29.848444


In [50]:
new_frame._edges.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,desc,n_years,shapes
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16,29,isFriend,superenrollment algaecide bookishly unadultera...,4.0,dashed
52,57,isFriend,semidiagrammatic neuromastic Franciscan corodi...,1.0,dashed
6,28,isFriend,unshrubbed Deimos nonion beray Glis backspring...,15.0,dotted
21,22,isFriend,tartago manque hemiekton universalness verbali...,17.0,dashed
2,66,isFriend,funt semibalked pursley onshore waiterage Pter...,17.0,dotted
