In [1]:
import random

import numpy as np
import pandas as pd

from nltk.corpus import words

In [2]:
# One-time setup: uncomment and run both lines to fetch the NLTK `words`
# corpus (imported above from `nltk.corpus`) if it is not installed yet.
# import nltk

# nltk.download('words')

In [3]:
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder

In [7]:
def generate_targets(nodes, s, density=0.2):
    """Randomly pick outgoing edges from node ``s``.

    Every node ``t`` of ``nodes`` with ``s < t`` is considered in iteration
    order and kept with probability ``density`` (one ``np.random.choice``
    draw per candidate).

    Returns a list of ``[s, t]`` pairs.
    """
    outcome_probs = [1 - density, density]
    return [
        [s, target]
        for target in nodes
        # The draw only happens for candidates that pass the ordering check.
        if s < target and np.random.choice([0, 1], p=outcome_probs)
    ]


def random_pgframe(n_nodes, density):
    """Build a ``PandasPGFrame`` holding a random graph.

    Parameters
    ----------
    n_nodes : int
        Number of nodes; node ids are the integers ``0 .. n_nodes - 1``.
    density : float
        Probability with which each candidate edge is kept, forwarded to
        ``generate_targets``.

    Returns
    -------
    PandasPGFrame
        Frame built from the node list and a
        ``(@source_id, @target_id)`` multi-index of the sampled edges.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-node edge lists with one comprehension; this avoids
    # the quadratic list copying of ``sum(list_of_lists, [])`` while calling
    # ``generate_targets`` in the same order.
    edges = [
        edge
        for source in nodes
        for edge in generate_targets(nodes, source, density)
    ]
    edges_df = pd.DataFrame(
        edges, columns=["@source_id", "@target_id"]
    ).set_index(["@source_id", "@target_id"])
    return PandasPGFrame(nodes=nodes, edges=edges_df.index)

In [5]:
N = 70          # number of nodes in the random graph
density = 0.13  # probability of keeping each candidate edge

# Seed both random number generators used in this notebook (`random` and
# `np.random`) so that "Restart & Run All" reproduces the same graph,
# types and properties.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate a random graph

In [8]:
graph_frame = random_pgframe(N, density)

In [9]:
graph_frame._nodes

0
1
2
3
4
...
65
66
67
68
69


In [10]:
graph_frame._edges

@source_id,@target_id
0,11
0,19
0,20
0,22
0,27
...,...
62,64
63,64
64,65
66,69


# Add node and edge types

In [11]:
types = ["Apple", "Orange", "Carrot"]

In [12]:
# Draw one type label per node id; the weights follow the order of `types`.
node_type_weights = [0.5, 0.4, 0.1]
node_types = {}
for node_id in range(N):
    node_types[node_id] = np.random.choice(types, p=node_type_weights)

In [14]:
graph_frame.add_node_types(node_types)

In [15]:
graph_frame._nodes

Unnamed: 0_level_0,@type
@id,Unnamed: 1_level_1
0,Orange
1,Apple
2,Apple
3,Apple
4,Apple
...,...
65,Apple
66,Apple
67,Orange
68,Orange


In [16]:
types = ["isFriend", "isEnemy"]

In [17]:
# Draw one type label per edge; the weights follow the order of `types`.
edge_type_weights = [0.8, 0.2]
edge_types = {}
for edge in graph_frame.edges():
    edge_types[edge] = np.random.choice(types, p=edge_type_weights)

In [18]:
graph_frame.add_edge_types(edge_types)

In [19]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type
@source_id,@target_id,Unnamed: 2_level_1
0,11,isEnemy
0,19,isFriend
0,20,isFriend
0,22,isFriend
0,27,isEnemy
...,...,...
62,64,isFriend
63,64,isFriend
64,65,isFriend
66,69,isFriend


# Add node and edge properties

Three kinds of properties are added below: numerical, categorical, and text.

## Add node properties

In [20]:
# Numeric node property: one normally distributed value (loc=35, scale=5)
# per node, keyed by the node id.
weight_records = []
for node_id in graph_frame.nodes():
    weight_records.append((node_id, np.random.normal(loc=35, scale=5)))
weight = pd.DataFrame(weight_records, columns=["@id", "weight"])

In [22]:
graph_frame.add_node_properties(weight, prop_type="numeric")

In [25]:
colors = ["red", "green", "blue"]

In [26]:
# The palette is kept under its own name: this cell binds `colors` to the
# property table, so sampling from a list that is also called `colors`
# would fail as soon as the cell is run a second time.
color_choices = ["red", "green", "blue"]
colors = pd.DataFrame(
    [
        (n, np.random.choice(color_choices))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)

In [27]:
graph_frame.add_node_properties(colors, prop_type="category")

In [28]:
# Text node property: 20 random corpus words joined by spaces per node.
# The word list does not change between nodes, so fetch it once instead of
# calling `words.words()` again for every node.
vocabulary = words.words()
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

In [29]:
graph_frame.add_node_properties(desc, prop_type="text")

In [30]:
graph_frame._nodes

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Orange,33.009618,green,rhombos unkinglike couchmaking coreplastic inf...
1,Apple,35.173856,red,nonplastic insightful Hydrophyllaceae massebah...
2,Apple,35.949917,green,Messinese unpiety moromancy waterbrain pecopte...
3,Apple,45.994004,green,formulatory bestrew angelolatry micht sauriosi...
4,Apple,24.224365,red,amongst subbromid superstructor pentacetate fu...
...,...,...,...,...
65,Apple,40.707186,red,phenakism peristeronic deair esurient Dulangan...
66,Apple,34.152254,blue,Vestas franker counterwind unisolate untasty C...
67,Orange,29.789191,red,charmlessly bud bahnung idiosome shapeshifter ...
68,Orange,32.252627,green,maltose previolate oxytocic cacodemonic rebrus...


In [31]:
graph_frame._node_prop_types

{'@type': 'category', 'weight': 'numeric', 'color': 'category', 'desc': 'text'}

## Add edge properties

In [32]:
# Numeric edge property: one `np.random.randint(0, 20)` draw per edge,
# keyed by the (source, target) pair.
year_records = []
for source, target in graph_frame.edges():
    year_records.append((source, target, np.random.randint(0, 20)))
years = pd.DataFrame(
    year_records, columns=["@source_id", "@target_id", "n_years"])

In [33]:
graph_frame.add_edge_properties(years, prop_type="numeric")

In [34]:
# Categorical edge property. The candidate values get their own name so
# that `shapes` only ever refers to the resulting property table.
shape_choices = ["dashed", "dotted", "solid"]
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shape_choices))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)

In [35]:
graph_frame.add_edge_properties(shapes, prop_type="category")

In [36]:
# Text edge property: 20 random corpus words joined by spaces per edge.
# The word list does not change between edges, so fetch it once instead of
# calling `words.words()` again for every edge.
vocabulary = words.words()
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

In [37]:
graph_frame.add_edge_properties(desc, prop_type="text")

In [38]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,n_years,shapes,desc
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,11,isEnemy,2,solid,Pyrenomycetineae Chroococcus awfu exchequer ob...
0,19,isFriend,13,dashed,proceed proangiosperm unfinical seediness Joub...
0,20,isFriend,18,dashed,roamage bisymmetric oesophagostomiasis microga...
0,22,isFriend,3,dashed,lepidene warree siderean slideableness unshrug...
0,27,isEnemy,14,dashed,theopathy bogart subofficial Arosaguntacook gr...
...,...,...,...,...,...
62,64,isFriend,4,solid,exodist unembarrassedly courtyard rutherfordit...
63,64,isFriend,19,dashed,shelterlessness Negritic ingeniously gallowsma...
64,65,isFriend,15,dashed,Fodientia tonsure fumeless Slavification bulge...
66,69,isFriend,3,dotted,beduke bleakly doloriferous basipoditic psycho...


In [39]:
graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

# Property encoding

In [40]:
# Options for the homogeneous encoder; text properties use TF-IDF.
# NOTE(review): `encode_types` / `drop_types` presumably fold `@type` into
# the feature vector and then drop the column — confirm against bluegraph.
hom_encoder_options = {
    "heterogeneous": False,
    "encode_types": True,
    "drop_types": True,
    "text_encoding": "tfidf",
}
hom_encoder = ScikitLearnPGEncoder(**hom_encoder_options)

In [41]:
graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

In [42]:
transformed_frame = hom_encoder.fit_transform(graph_frame)

In [44]:
transformed_frame._nodes

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
0,"[0.0, 0.0, 1.0, -0.4054220559072771, 0.0, 1.0,..."
1,"[1.0, 0.0, 0.0, 0.03533446823235889, 0.0, 0.0,..."
2,"[1.0, 0.0, 0.0, 0.1933825673112407, 0.0, 1.0, ..."
3,"[1.0, 0.0, 0.0, 2.2389049730847126, 0.0, 1.0, ..."
4,"[1.0, 0.0, 0.0, -2.194577363879286, 0.0, 0.0, ..."
...,...
65,"[1.0, 0.0, 0.0, 1.162221364563694, 0.0, 0.0, 1..."
66,"[1.0, 0.0, 0.0, -0.17271921450297287, 1.0, 0.0..."
67,"[0.0, 0.0, 1.0, -1.0612762004891056, 0.0, 0.0,..."
68,"[0.0, 0.0, 1.0, -0.5595866152961186, 0.0, 1.0,..."


In [45]:
hom_encoder._node_encoders

{'@type': MultiLabelBinarizer(),
 'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=64, stop_words='english', sublinear_tf=True)}

In [36]:
hom_encoder._edge_encoders

{'@type': MultiLabelBinarizer(),
 'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=64, stop_words='english', sublinear_tf=True)}

In [37]:
transformed_frame._nodes

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
0,"[1.0, 0.0, 0.0, -1.140728778341095, 1.0, 0.0, ..."
1,"[1.0, 0.0, 0.0, -0.72790544229623, 0.0, 0.0, 1..."
2,"[0.0, 0.0, 1.0, -0.118436847924602, 0.0, 0.0, ..."
3,"[0.0, 0.0, 1.0, 1.0084074828161862, 0.0, 0.0, ..."
4,"[1.0, 0.0, 0.0, 0.34550704907778085, 0.0, 1.0,..."
...,...
65,"[0.0, 0.0, 1.0, 1.1600737519075413, 1.0, 0.0, ..."
66,"[0.0, 0.0, 1.0, -0.11434403548223476, 0.0, 0.0..."
67,"[0.0, 0.0, 1.0, -1.5864541768629998, 0.0, 1.0,..."
68,"[0.0, 0.0, 1.0, 0.5106035495345334, 0.0, 1.0, ..."


In [38]:
transformed_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,features
@source_id,@target_id,Unnamed: 2_level_1
0,7,"[0.0, 1.0, 0.7458675902680008, 0.0, 0.0, 1.0, ..."
0,19,"[0.0, 1.0, 0.20787356667396992, 1.0, 0.0, 0.0,..."
0,27,"[0.0, 1.0, 1.642524296258052, 0.0, 1.0, 0.0, 0..."
0,34,"[0.0, 1.0, -0.6887831393160815, 0.0, 1.0, 0.0,..."
0,38,"[0.0, 1.0, 0.028542225475959648, 0.0, 1.0, 0.0..."
...,...,...
65,66,"[0.0, 1.0, -1.5854398453061327, 0.0, 1.0, 0.0,..."
65,67,"[0.0, 1.0, -1.764771186504143, 0.0, 1.0, 0.0, ..."
66,67,"[0.0, 1.0, 0.5665362490699904, 0.0, 1.0, 0.0, ..."
66,69,"[0.0, 1.0, -0.5094517981180712, 0.0, 1.0, 0.0,..."


In [39]:
# Shape of the first node's feature vector (no need to list the whole column).
transformed_frame._nodes["features"].iloc[0].shape

(71,)

In [40]:
# Shape of the first edge's feature vector (no need to list the whole column).
transformed_frame._edges["features"].iloc[0].shape

(70,)

In [41]:
# Options for the heterogeneous encoder; text properties use TF-IDF.
hetero_encoder_options = {
    "heterogeneous": True,
    "text_encoding": "tfidf",
}
hetero_encoder = ScikitLearnPGEncoder(**hetero_encoder_options)

In [42]:
hetero_encoder.fit(graph_frame)

In [44]:
# Encode the graph with the already fitted heterogeneous encoder.
# NOTE: this rebinds `transformed_frame`, replacing the result of
# `hom_encoder.fit_transform(graph_frame)` from the earlier cell.
transformed_frame = hetero_encoder.transform(graph_frame)

In [46]:
transformed_frame._nodes

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
0,"[0.0, 0.0, 1.0, -0.4054220559072771, 0.0, 1.0,..."
1,"[1.0, 0.0, 0.0, 0.03533446823235889, 0.0, 0.0,..."
2,"[1.0, 0.0, 0.0, 0.1933825673112407, 0.0, 1.0, ..."
3,"[1.0, 0.0, 0.0, 2.2389049730847126, 0.0, 1.0, ..."
4,"[1.0, 0.0, 0.0, -2.194577363879286, 0.0, 0.0, ..."
...,...
65,"[1.0, 0.0, 0.0, 1.162221364563694, 0.0, 0.0, 1..."
66,"[1.0, 0.0, 0.0, -0.17271921450297287, 1.0, 0.0..."
67,"[0.0, 0.0, 1.0, -1.0612762004891056, 0.0, 0.0,..."
68,"[0.0, 0.0, 1.0, -0.5595866152961186, 0.0, 1.0,..."


In [46]:
transformed_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,features,@type
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7,"[0.7360007733457389, 0.0, 0.0, 1.0, 0.0, 0.0, ...",isFriend
0,19,"[0.19716717964858332, 1.0, 0.0, 0.0, 0.0, 0.0,...",isFriend
0,27,"[1.634056762840998, 0.0, 1.0, 0.0, 0.0, 0.0, 0...",isFriend
0,34,"[-0.7008888098466759, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
0,38,"[0.017555981749531473, 0.0, 1.0, 0.0, 0.0, 0.0...",isFriend
...,...,...,...
65,66,"[-1.5989447993419352, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
65,67,"[-1.778555997240987, 0.0, 1.0, 0.0, 0.0, 0.0, ...",isFriend
66,67,"[0.556389575446687, 0.0, 1.0, 0.0, 0.0, 0.0, 0...",isFriend
66,69,"[-0.5212776119476241, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
