# Introduction to PGFrames and semantic encoding


In [1]:
import random

import numpy as np
import pandas as pd

from nltk.corpus import words

In [2]:
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

__NB:__ If an nltk error occurs, run the following code (the 'words' corpus needs to be downloaded for semantic encoding of text properties):

```
import nltk
nltk.download('words')
```

## Example 1: small property graph

Initialize a `PandasPGFrame` from a list of nodes and a list of edges.

In [3]:
# Node identifiers of the toy graph.
nodes = ["Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"]

# Edges written out as explicit (source, target) pairs.
edges = [
    ("Alice", "Bob"),
    ("Alice", "Eric"),
    ("Bob", "Eric"),
    ("Bob", "John"),
    ("Bob", "Anna"),
    ("Eric", "Anna"),
    ("Anna", "Laura"),
    ("Anna", "John"),
    ("Matt", "John"),
]

# Parallel endpoint lists, in the same order as `edges`; they are reused
# further down to key the edge property table.
sources = [source for source, _ in edges]
targets = [target for _, target in edges]

frame = PandasPGFrame(nodes=nodes, edges=edges)

Get nodes and edges as lists.

In [4]:
frame.nodes()

['Alice', 'Bob', 'Eric', 'John', 'Anna', 'Laura', 'Matt']

In [5]:
frame.edges()

[('Alice', 'Bob'),
 ('Alice', 'Eric'),
 ('Bob', 'Eric'),
 ('Bob', 'John'),
 ('Bob', 'Anna'),
 ('Eric', 'Anna'),
 ('Anna', 'Laura'),
 ('Anna', 'John'),
 ('Matt', 'John')]

Add properties to nodes and edges. Here, all the properties have type `numeric`. The other available types are `category` and `text`.

In [6]:
# Numeric node attributes, one value per entry of `nodes` (same order).
age = [25, 9, 70, 42, 26, 35, 36]
height = [180, 122, 173, 194, 172, 156, 177]
weight = [75, 43, 68, 82, 70, 59, 81]

# Each attribute is registered with its own call; the "@id" entry tells the
# frame which node every value belongs to.
for prop_name, prop_values in (
        ("age", age), ("height", height), ("weight", weight)):
    frame.add_node_properties(
        {"@id": nodes, prop_name: prop_values},
        prop_type="numeric")

# One distance per edge, aligned with `sources` / `targets`.
weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
edge_weight = pd.DataFrame(
    list(zip(sources, targets, weights)),
    columns=["@source_id", "@target_id", "distance"])
frame.add_edge_properties(edge_weight, prop_type="numeric")

Get nodes and edges as dataframes.

In [7]:
frame.nodes(raw_frame=True).sample(5)

Unnamed: 0_level_0,age,height,weight
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
John,42,194,82
Anna,26,172,70
Alice,25,180,75
Matt,36,177,81
Laura,35,156,59


In [8]:
frame.edges(raw_frame=True).sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,distance
@source_id,@target_id,Unnamed: 2_level_1
Anna,Laura,1.0
Alice,Bob,1.0
Bob,Anna,1.5
Eric,Anna,21.0
Anna,John,2.5


## Example 2: Random graph with a given density

In this example we generate a small random graph with a specified density, i.e. the ratio of realized edges to all possible edges between distinct pairs of nodes.

### Create a PandasPGFrame

In [9]:
N = 70  # number of nodes
density = 0.1  # density value

# The graph structure, types and properties below are drawn from NumPy's
# global generator and from Python's `random` module. Seeding both makes a
# fresh "Restart & Run All" produce the same graph every time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [10]:
# Helper functions for graph generation

def generate_targets(nodes, s, density=0.2):
    """Randomly select the edges going out of node ``s``.

    Every node ``t`` in ``nodes`` satisfying ``s < t`` is a candidate target;
    each candidate is kept independently with probability ``density``.

    Parameters
    ----------
    nodes : iterable
        Candidate target nodes, comparable with ``s`` via ``<``.
    s : object
        Source node.
    density : float, optional
        Probability of keeping each candidate edge (default 0.2).

    Returns
    -------
    list of list
        ``[s, t]`` pairs for the kept edges, in the iteration order of
        ``nodes``.
    """
    keep_probabilities = [1 - density, density]
    return [
        [s, t]
        for t in nodes
        # `s < t` is checked first, so `s` is never paired with itself or
        # with a smaller node, and no random draw is spent on such pairs.
        if s < t and np.random.choice([0, 1], p=keep_probabilities)
    ]


def random_pgframe(n_nodes, density):
    """Build a ``PandasPGFrame`` holding a random graph.

    Nodes are the integers ``0 .. n_nodes - 1``. For every node, outgoing
    edges are drawn with ``generate_targets``, so each pair ``(s, t)`` with
    ``s < t`` is realized with probability ``density``.

    Parameters
    ----------
    n_nodes : int
        Number of nodes to create.
    density : float
        Probability of realizing each candidate edge.

    Returns
    -------
    PandasPGFrame
        Frame containing the generated nodes and edges (no properties).
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass. Summing lists with
    # `sum(..., [])` copies the accumulated list at every step, which is
    # quadratic in the number of edges.
    edge_pairs = [
        pair
        for source in nodes
        for pair in generate_targets(nodes, source, density)
    ]

    # Going through a DataFrame yields a named (source, target) MultiIndex
    # and also copes with an empty edge list.
    edges_df = pd.DataFrame(
        edge_pairs, columns=["@source_id", "@target_id"]
    ).set_index(["@source_id", "@target_id"])
    return PandasPGFrame(nodes=nodes, edges=edges_df.index)

In [11]:
graph_frame = random_pgframe(N, density)

Get nodes and edges as dataframes.

In [12]:
graph_frame.nodes(raw_frame=True).sample(5)

26
62
39
34
11


In [13]:
graph_frame.edges(raw_frame=True).sample(5)

@source_id,@target_id
13,46
0,56
19,25
13,20
51,64


### Add node and edge types

Here we generate random types for nodes and edges.

In [14]:
# Possible node types and the probability of drawing each of them.
types = ["Apple", "Orange", "Carrot"]
type_probabilities = [0.5, 0.4, 0.1]

# Map every node id to an independently drawn type.
node_types = {
    node_id: np.random.choice(types, p=type_probabilities)
    for node_id in range(N)
}

In [15]:
graph_frame.add_node_types(node_types)

In [16]:
graph_frame.nodes(raw_frame=True).sample(5)

Unnamed: 0_level_0,@type
@id,Unnamed: 1_level_1
6,Orange
53,Orange
66,Apple
31,Orange
47,Orange


In [17]:
# Possible edge types and the probability of drawing each of them.
types = ["isFriend", "isEnemy"]
type_probabilities = [0.8, 0.2]

# Map every (source, target) edge to an independently drawn type.
edge_types = {
    edge: np.random.choice(types, p=type_probabilities)
    for edge in graph_frame.edges()
}

In [18]:
graph_frame.add_edge_types(edge_types)

In [19]:
graph_frame.edges(raw_frame=True).sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,@type
@source_id,@target_id,Unnamed: 2_level_1
12,56,isEnemy
32,33,isFriend
4,32,isEnemy
40,57,isFriend
47,54,isFriend


### Add node and edge properties

We add randomly generated node properties of different data types (`numeric`, `category`, `text`).

In [20]:
# One normally distributed weight (mean 35, standard deviation 5) per node.
node_ids = graph_frame.nodes()
weight = pd.DataFrame({
    "@id": node_ids,
    "weight": [np.random.normal(loc=35, scale=5) for _ in node_ids],
})

In [21]:
graph_frame.add_node_properties(weight, prop_type="numeric")

In [22]:
colors = ["red", "green", "blue"]

In [23]:
# The palette is defined in this cell under its own name because `colors` is
# rebound to the DataFrame below: sampling from `colors` itself would make
# the cell fail as soon as it is run a second time.
color_names = ["red", "green", "blue"]

# One uniformly drawn color per node.
colors = pd.DataFrame(
    [
        (n, np.random.choice(color_names))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)

In [24]:
graph_frame.add_node_properties(colors, prop_type="category")

In [25]:
# Load the NLTK word list once: calling `words.words()` inside the
# comprehension would rebuild the full list for every node.
vocabulary = words.words()

# Each node gets a description made of 20 distinct random words.
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

In [26]:
graph_frame.add_node_properties(desc, prop_type="text")

In [27]:
graph_frame.nodes(raw_frame=True).sample(5)

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Apple,38.230987,blue,polycrystalline ornithischian kismet cyclophre...
55,Orange,42.171966,green,mantellone sensibilize cannibalize scrupler do...
67,Apple,36.589363,green,methoxide cubocuneiform Dasyurus lovemonger su...
8,Apple,35.480659,green,accresce adenomalacia enucleator pentyne untam...
32,Orange,35.251585,red,Meccawee bigamize Kirsten sappiness Tim phalan...


In [28]:
graph_frame._node_prop_types

{'@type': 'category', 'weight': 'numeric', 'color': 'category', 'desc': 'text'}

We add randomly generated edge properties of different data types (`numeric`, `category`, `text`).

In [29]:
# One random integer number of years, from 0 to 19, per edge.
edge_list = graph_frame.edges()
years = pd.DataFrame(
    [
        (source, target, np.random.randint(0, 20))
        for source, target in edge_list
    ],
    columns=["@source_id", "@target_id", "n_years"]
)

In [30]:
graph_frame.add_edge_properties(years, prop_type="numeric")

In [31]:
# Line styles to draw from; kept under a separate name so that `shapes`
# only ever refers to the resulting table.
shape_names = ["dashed", "dotted", "solid"]

# One uniformly drawn shape per edge.
shapes = pd.DataFrame(
    [
        (source, target, np.random.choice(shape_names))
        for source, target in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)

In [32]:
graph_frame.add_edge_properties(shapes, prop_type="category")

In [33]:
# Load the NLTK word list once: calling `words.words()` inside the
# comprehension would rebuild the full list for every edge.
vocabulary = words.words()

# Each edge gets a description made of 20 distinct random words.
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

In [34]:
graph_frame.add_edge_properties(desc, prop_type="text")

In [35]:
graph_frame.edges(raw_frame=True).sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,n_years,shapes,desc
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,56,isFriend,16,dashed,vikingship cicely hydrobiplane instinctivist m...
5,8,isFriend,16,dotted,barognosis Neil gaminish cassine boonk subditi...
39,42,isFriend,11,dashed,pleuropericardial helder nonslippery equiposti...
41,55,isFriend,18,solid,inalienable posterial cogwood hunting reciproc...
28,38,isFriend,18,dashed,befinger predetachment worshipfully rightship ...


In [36]:
graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

### Perform semantic encoding of properties

BlueGraph can convert node and edge properties of different data types into numerical vectors.

Create an encoder object for homogeneous encoding, i.e. the properties of all nodes (edges) are encoded as feature vectors of the same length, independently of the node (edge) type.

In [37]:
# Encoder for the node and edge properties added to `graph_frame` above.
# - `heterogeneous=False` selects the homogeneous mode: every node (edge)
#   gets a feature vector of the same length, whatever its type.
# - `text_encoding="tfidf"` and `standardize_numeric=True`: after fitting,
#   the per-property encoders shown further down are a `TfIdfEncoder` for
#   `desc` and a `StandardScaler` for the numeric properties.
# - NOTE(review): `edge_features`, `encode_types` and `drop_types` presumably
#   enable edge encoding and control how `@type` is handled; confirm
#   against the ScikitLearnPGEncoder documentation.
hom_encoder = ScikitLearnPGEncoder(
    node_properties=["weight", "color", "desc"],
    edge_properties=["n_years", "shapes", "desc"],
    edge_features=True,
    heterogeneous=False,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
    standardize_numeric=True)

In [38]:
transformed_frame = hom_encoder.fit_transform(graph_frame)

In [39]:
transformed_frame.nodes(raw_frame=True).sample(5)

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
60,"[-0.5606563439968789, 0.0, 0.0, 1.0, 0.0, 0.0,..."
30,"[-0.5723782773667659, 0.0, 0.0, 1.0, 0.0, 0.0,..."
18,"[-0.4444349383011907, 0.0, 1.0, 0.0, 0.0, 0.0,..."
1,"[1.752355378767732, 0.0, 0.0, 1.0, 0.0, 0.0, 0..."
51,"[-0.013849249565769765, 0.0, 0.0, 1.0, 0.0, 0...."


We can inspect encoding models for different node and edge properties created by BlueGraph.

In [40]:
hom_encoder._node_encoders

{'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': <bluegraph.preprocess.utils.TfIdfEncoder at 0x7f8a4069f790>}

In [41]:
transformed_frame.edges(raw_frame=True).sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,features
@source_id,@target_id,Unnamed: 2_level_1
0,51,"[0.7992979631740463, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
41,67,"[-1.6168694614926307, 1.0, 0.0, 0.0, 0.0, 0.0,..."
7,16,"[-0.4087857491592922, 1.0, 0.0, 0.0, 0.0, 0.0,..."
17,54,"[0.10896441326928144, 1.0, 0.0, 0.0, 0.0, 0.0,..."
6,36,"[-1.099119299064057, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [42]:
hom_encoder._edge_encoders

{'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': <bluegraph.preprocess.utils.TfIdfEncoder at 0x7f8a3f498390>}

### Convert PGFrames to JSON

In [43]:
json_repr = graph_frame.to_json()

In [44]:
json_repr["nodes"][:2]

[{'@id': 0,
  '@type': 'Apple',
  'weight': 36.48081522792947,
  'color': 'blue',
  'desc': 'sertule Saumya unson subrogation mechanist brachycranial cliffsman quinisext transformance unitarily remunerable ceaselessly oppositifolious Zwinglian cytodiagnosis pauper ophthalmophore allalinite reinsult irreticence'},
 {'@id': 1,
  '@type': 'Orange',
  'weight': 43.06895384018701,
  'color': 'red',
  'desc': 'pleiophylly aegagropila nonblack photoinduction Mauri royale Hezron misanthrope hod adiaphorism ingemination unindwellable ratten substantivally Safini casque handflower amphilogy disagreeableness crumpet'}]

In [45]:
json_repr["edges"][:2]

[{'@source_id': 0,
  '@target_id': 33,
  '@type': 'isEnemy',
  'n_years': 16,
  'shapes': 'dashed',
  'desc': 'sapiutan polysyllable architecturally ossification demonstratable smelt dipolarize unpracticability oraculum outdaciousness putelee coenocyte tights goodwife pressingness jaudie pathlessness enherit Aspidobranchia irresistibility'},
 {'@source_id': 0,
  '@target_id': 47,
  '@type': 'isFriend',
  'n_years': 17,
  'shapes': 'dotted',
  'desc': 'vertebra bushlet androdioecious unmechanic celandine philologue Hartmann acritical yashiro Tartar versicular sleech deteriorator pianistically wejack Delsartian apiculus exsanguinate ferromolybdenum merchantman'}]

Create a new `PandasPGFrame` from the generated representation.

In [46]:
new_frame = PandasPGFrame.from_json(json_repr)

In [47]:
new_frame.nodes(raw_frame=True).sample(5)

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,Orange,28.304847,red,recapacitate Emesa unpresumingness truthlikene...
56,Apple,36.598888,blue,generale rhomboideus reavoid overstimulate she...
33,Orange,37.705923,green,appraise nonremanie Ohio mistreatment semitech...
53,Orange,29.818268,blue,subgape mischance Palaeotheriidae outquote ide...
15,Orange,36.693074,blue,tib ethnotechnography celite dicyanine aortocl...
