In [4]:
import random

import numpy as np
import pandas as pd

from nltk.corpus import words

In [5]:
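# One-time setup: the NLTK `words` corpus imported above must be available
# locally. Uncomment and run the lines below if it has not been downloaded yet.
# import nltk

# nltk.download('words')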

In [6]:
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder

In [7]:
def generate_targets(nodes, s, density=0.2):
    """Sample the outgoing edges of source node ``s``.

    Each node ``t`` of ``nodes`` with ``s < t`` is visited once, in order,
    and the pair ``[s, t]`` is kept with probability ``density``.

    Parameters
    ----------
    nodes : iterable
        Candidate target nodes; each must be comparable with ``s``.
    s : object
        Source node.
    density : float, default 0.2
        Probability of keeping each candidate edge.

    Returns
    -------
    list
        The kept edges as ``[s, t]`` two-element lists.
    """
    keep_probs = [1 - density, density]
    return [
        [s, target]
        for target in nodes
        # Short-circuiting ensures a random draw happens only when s < target.
        if s < target and np.random.choice([0, 1], p=keep_probs)
    ]


def random_pgframe(n_nodes, density):
    """Build a random ``PandasPGFrame`` on ``n_nodes`` integer nodes.

    Nodes are ``0 .. n_nodes - 1``. Candidate edges are the pairs
    ``(s, t)`` with ``s < t``; each one is kept with probability
    ``density`` by ``generate_targets``.

    Parameters
    ----------
    n_nodes : int
        Number of nodes to create.
    density : float
        Probability of keeping each candidate edge.

    Returns
    -------
    PandasPGFrame
        Frame built from the node list and the sampled edge index.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass. A flat
    # comprehension is linear, whereas ``sum(lists, [])`` copies the
    # accumulated list once per source node.
    edge_pairs = [
        pair
        for source in nodes
        for pair in generate_targets(nodes, source, density)
    ]
    edges = pd.DataFrame(
        edge_pairs, columns=["@source_id", "@target_id"])
    edges_df = edges.set_index(["@source_id", "@target_id"])
    # Echo the sampled edge index so it shows up in the cell output.
    print(edges_df.index)
    frame = PandasPGFrame(nodes=nodes, edges=edges_df.index)
    return frame

In [8]:
N = 70  # number of nodes in the generated graph
density = 0.13  # probability of keeping each candidate edge

# Seed both random generators used in this notebook (`random` for the
# text descriptions, `np.random` for everything else) so that
# Restart & Run All reproduces the same graph and properties.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate a random graph

In [9]:
graph_frame = random_pgframe(N, density)

MultiIndex([( 0,  2),
            ( 0,  4),
            ( 0,  8),
            ( 0, 18),
            ( 0, 23),
            ( 0, 30),
            ( 0, 32),
            ( 0, 37),
            ( 0, 57),
            ( 0, 58),
            ...
            (59, 60),
            (59, 62),
            (60, 66),
            (61, 64),
            (61, 68),
            (62, 69),
            (63, 64),
            (63, 68),
            (65, 67),
            (67, 68)],
           names=['@source_id', '@target_id'], length=316)


# Add node and edge types

In [7]:
types = ["Apple", "Orange", "Carrot"]

In [8]:
# Assign each of the N nodes one type, drawn from `types` with
# probabilities 0.5 / 0.4 / 0.1 (in the order the types are listed).
node_types = {
    node_id: np.random.choice(types, p=[0.5, 0.4, 0.1])
    for node_id in range(N)
}

In [9]:
graph_frame.add_node_types(node_types)

In [10]:
graph_frame._nodes

Unnamed: 0_level_0,@type
@id,Unnamed: 1_level_1
0,Apple
1,Apple
2,Orange
3,Orange
4,Apple
...,...
65,Orange
66,Orange
67,Orange
68,Orange


In [11]:
types = ["isFriend", "isEnemy"]

In [12]:
# Assign each edge one type, drawn from `types` with
# probabilities 0.8 / 0.2 (in the order the types are listed).
edge_types = {
    edge: np.random.choice(types, p=[0.8, 0.2])
    for edge in graph_frame.edges()
}

In [13]:
graph_frame.add_edge_types(edge_types)

In [14]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type
@source_id,@target_id,Unnamed: 2_level_1
0,7,isFriend
0,19,isFriend
0,27,isFriend
0,34,isFriend
0,38,isFriend
...,...,...
65,66,isFriend
65,67,isFriend
66,67,isFriend
66,69,isFriend


# Add node and edge properties

Properties can be of three kinds: numerical, categorical and text.

## Add node properties

In [15]:
# One normally distributed weight per node (mean 35, standard deviation 5),
# laid out as (@id, weight) records.
weight = pd.DataFrame(
    [
        (node_id, np.random.normal(loc=35, scale=5))
        for node_id in graph_frame.nodes()
    ],
    columns=["@id", "weight"],
)

In [16]:
graph_frame.add_node_properties(weight, prop_type="numeric")

In [17]:
colors = ["red", "green", "blue"]

In [18]:
# Sample from a local palette rather than from the global `colors` name:
# this cell rebinds `colors` to a DataFrame, so reading the palette from
# `colors` would break as soon as the cell is run a second time.
color_choices = ["red", "green", "blue"]
colors = pd.DataFrame(
    [
        (n, np.random.choice(color_choices))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)

In [19]:
graph_frame.add_node_properties(colors, prop_type="category")

In [20]:
# Fetch the word list once: `words.words()` does not depend on the node,
# so calling it inside the comprehension rebuilds the list for every node.
vocabulary = words.words()

# Each node gets a description made of 20 words sampled without replacement.
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

In [21]:
graph_frame.add_node_properties(desc, prop_type="text")

In [22]:
graph_frame._nodes

Unnamed: 0_level_0,@type,weight,color,desc
@id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Apple,29.674754,blue,acridinic unmummied valiancy humour Cydonia pr...
1,Apple,31.723168,red,bridgework perenniality sklinter incarnadine o...
2,Orange,34.747329,red,indevout rinceau Mcintosh radiogoniometry supe...
3,Orange,40.338689,red,perspiry reissue slum Acer cystolith fulminura...
4,Apple,37.049401,green,phytologically retinoid Calandridae gonion fri...
...,...,...,...,...
65,Orange,41.091252,blue,chromid fragmented mildewy frass Gerridae Acma...
66,Orange,34.767637,red,Dionysia admonishment spannel nervily waterwis...
67,Orange,27.463080,green,abusious subdue saltly reinclination photosphe...
68,Orange,37.868604,green,dietotoxicity tartrate merrymaking antiknock p...


In [23]:
graph_frame._node_prop_types

{'weight': 'numeric', 'color': 'category', 'desc': 'text'}

## Add edge properties

In [24]:
# One integer per edge, drawn uniformly from 0..19 (upper bound exclusive),
# laid out as (@source_id, @target_id, n_years) records.
years = pd.DataFrame(
    [
        (source, target, np.random.randint(0, 20))
        for source, target in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "n_years"],
)

In [25]:
graph_frame.add_edge_properties(years, prop_type="numeric")

In [26]:
# Keep the list of choices under its own name so that `shapes` only ever
# refers to the resulting DataFrame (it used to be a list first, then a frame).
shape_choices = ["dashed", "dotted", "solid"]
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shape_choices))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)

In [27]:
graph_frame.add_edge_properties(shapes, prop_type="category")

In [28]:
# Fetch the word list once: `words.words()` does not depend on the edge,
# so calling it inside the comprehension rebuilds the list for every edge.
vocabulary = words.words()

# Each edge gets a description made of 20 words sampled without replacement.
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

In [29]:
graph_frame.add_edge_properties(desc, prop_type="text")

In [30]:
graph_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,@type,n_years,shapes,desc
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7,isFriend,14,solid,resultingly overveil pentahydric polyideic rhy...
0,19,isFriend,11,dashed,explicitly vegetality zoogeographically Bilocu...
0,27,isFriend,19,dotted,syllogist unwilted Delichon pottled gypsyesque...
0,34,isFriend,6,dotted,arsenicophagy phlebotomical pseudomorph allomo...
0,38,isFriend,10,dotted,stringful radiophotograph alkalimeter Neandert...
...,...,...,...,...,...
65,66,isFriend,1,dotted,Alebion nitrocotton Jethronian holohyaline phy...
65,67,isFriend,0,dotted,licorne deutomerite macrotous Chechehet multit...
66,67,isFriend,13,dotted,unspell churlish Narraganset uncost outwith Le...
66,69,isFriend,7,dotted,mystify Mbuba backstroke Parnellite undeformed...


In [31]:
graph_frame._edge_prop_types

{'n_years': 'numeric', 'shapes': 'category', 'desc': 'text'}

# Property encoding

In [32]:
# Encoder used in homogeneous mode: node/edge types are encoded as well
# (and the type columns dropped), text properties use TF-IDF.
hom_encoder = ScikitLearnPGEncoder(
    heterogeneous=False,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
)

In [33]:
graph_frame._edge_prop_types

{'n_years': 'numeric', 'shapes': 'category', 'desc': 'text'}

In [34]:
transformed_frame = hom_encoder.fit_transform(graph_frame)

In [35]:
hom_encoder._node_encoders

{'@type': MultiLabelBinarizer(),
 'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=64, stop_words='english', sublinear_tf=True)}

In [36]:
hom_encoder._edge_encoders

{'@type': MultiLabelBinarizer(),
 'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=64, stop_words='english', sublinear_tf=True)}

In [37]:
transformed_frame._nodes

Unnamed: 0_level_0,features
@id,Unnamed: 1_level_1
0,"[1.0, 0.0, 0.0, -1.140728778341095, 1.0, 0.0, ..."
1,"[1.0, 0.0, 0.0, -0.72790544229623, 0.0, 0.0, 1..."
2,"[0.0, 0.0, 1.0, -0.118436847924602, 0.0, 0.0, ..."
3,"[0.0, 0.0, 1.0, 1.0084074828161862, 0.0, 0.0, ..."
4,"[1.0, 0.0, 0.0, 0.34550704907778085, 0.0, 1.0,..."
...,...
65,"[0.0, 0.0, 1.0, 1.1600737519075413, 1.0, 0.0, ..."
66,"[0.0, 0.0, 1.0, -0.11434403548223476, 0.0, 0.0..."
67,"[0.0, 0.0, 1.0, -1.5864541768629998, 0.0, 1.0,..."
68,"[0.0, 0.0, 1.0, 0.5106035495345334, 0.0, 1.0, ..."


In [38]:
transformed_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,features
@source_id,@target_id,Unnamed: 2_level_1
0,7,"[0.0, 1.0, 0.7458675902680008, 0.0, 0.0, 1.0, ..."
0,19,"[0.0, 1.0, 0.20787356667396992, 1.0, 0.0, 0.0,..."
0,27,"[0.0, 1.0, 1.642524296258052, 0.0, 1.0, 0.0, 0..."
0,34,"[0.0, 1.0, -0.6887831393160815, 0.0, 1.0, 0.0,..."
0,38,"[0.0, 1.0, 0.028542225475959648, 0.0, 1.0, 0.0..."
...,...,...
65,66,"[0.0, 1.0, -1.5854398453061327, 0.0, 1.0, 0.0,..."
65,67,"[0.0, 1.0, -1.764771186504143, 0.0, 1.0, 0.0, ..."
66,67,"[0.0, 1.0, 0.5665362490699904, 0.0, 1.0, 0.0, ..."
66,69,"[0.0, 1.0, -0.5094517981180712, 0.0, 1.0, 0.0,..."


In [39]:
transformed_frame._nodes["features"].to_list()[0].shape

(71,)

In [40]:
transformed_frame._edges["features"].to_list()[0].shape

(70,)

In [41]:
# Encoder used in heterogeneous mode, with TF-IDF for text properties.
hetero_encoder = ScikitLearnPGEncoder(
    heterogeneous=True,
    text_encoding="tfidf",
)

In [42]:
hetero_encoder.fit(graph_frame)

In [44]:
transformed_frame = hetero_encoder.transform(graph_frame)

In [45]:
transformed_frame._nodes

Unnamed: 0_level_0,features,@type
@id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[-1.1367564524120004, 1.0, 0.0, 0.0, 0.0, 0.0,...",Apple
1,"[-0.6889777579992171, 0.0, 0.0, 1.0, 0.0, 0.0,...",Apple
2,"[-0.18327594143750522, 0.0, 0.0, 1.0, 0.0, 0.0...",Orange
3,"[0.8219906455377133, 0.0, 0.0, 1.0, 0.0, 0.707...",Orange
4,"[0.47532474621190834, 0.0, 1.0, 0.0, 0.0, 0.37...",Apple
...,...,...
65,"[0.9572933031714441, 1.0, 0.0, 0.0, 0.0, 0.0, ...",Orange
66,"[-0.17962471164220295, 0.0, 0.0, 1.0, 0.0, 0.0...",Orange
67,"[-1.4929056122121913, 0.0, 1.0, 0.0, 0.0, 0.0,...",Orange
68,"[0.37789588697962, 0.0, 1.0, 0.0, 0.0, 0.0, 0....",Orange


In [46]:
transformed_frame._edges

Unnamed: 0_level_0,Unnamed: 1_level_0,features,@type
@source_id,@target_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7,"[0.7360007733457389, 0.0, 0.0, 1.0, 0.0, 0.0, ...",isFriend
0,19,"[0.19716717964858332, 1.0, 0.0, 0.0, 0.0, 0.0,...",isFriend
0,27,"[1.634056762840998, 0.0, 1.0, 0.0, 0.0, 0.0, 0...",isFriend
0,34,"[-0.7008888098466759, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
0,38,"[0.017555981749531473, 0.0, 1.0, 0.0, 0.0, 0.0...",isFriend
...,...,...,...
65,66,"[-1.5989447993419352, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
65,67,"[-1.778555997240987, 0.0, 1.0, 0.0, 0.0, 0.0, ...",isFriend
66,67,"[0.556389575446687, 0.0, 1.0, 0.0, 0.0, 0.0, 0...",isFriend
66,69,"[-0.5212776119476241, 0.0, 1.0, 0.0, 0.0, 0.0,...",isFriend
