# GrowSmart Graph ML Proof of Concept

## Setup and imports

In [1]:
%load_ext graph_notebook.magics

The graph_notebook.magics extension is already loaded. To reload it, use:
  %reload_ext graph_notebook.magics


In [2]:
%graph_notebook_host growsmart-neptune.cluster-custom-cgogeml0cuty.eu-west-3.neptune.amazonaws.com

set host to growsmart-neptune.cluster-custom-cgogeml0cuty.eu-west-3.neptune.amazonaws.com


In [3]:
import os
import io
os.environ['DGLBACKEND'] = 'pytorch'
import pandas as pd
import dgl
import numpy as np
from neo4j import GraphDatabase
from neo4j.graph import Node, Relationship
from torch import tensor, stack
from torch.nn import Module
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from pandas.api.types import is_numeric_dtype
import torch.nn.functional as F
from dgl.nn import GraphConv
import torch
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from dgl.nn import DeepWalk
from dgl.nn import DeepWalk
from torch.optim import SparseAdam
from torch.utils.data import DataLoader
import random

In [4]:
uri = "bolt://growsmart-neptune.cluster-custom-cgogeml0cuty.eu-west-3.neptune.amazonaws.com:8182"
driver = GraphDatabase.driver(uri, auth=("username", "password"), encrypted=True)

In [5]:
def run_query(query, parameters=None):
    with driver.session() as session:
        result = session.run(query, parameters)
        return [record for record in result]

## Let's do a Proof of Concept

https://julsimon.medium.com/a-primer-on-graph-neural-networks-with-amazon-neptune-and-the-deep-graph-library-5ce64984a276  

The advantage of creating a Graph Neural Network and exploit it for regression or classification is that the Graph stores the underlying connections between the nodes. This makes them more powerful than classic DM or ML approaches where we don't have this advantage.  
In the case of GrowSmart, we can exploit the graph in many ways depending on the end user:
- For internal usage: Analysts within the company can learn trends, patterns, and evaluate the status of the plants and gardens.
- For customer usage: The model can serve directly the customer's application and tell them if their plants/gardens are healthy or not based on the previous data and on the weather forecasts.

We can stem many model ideas from these perspectives:
- For the users:
    - Are my plants humid enough?
    - Tomorrow it's going to be sunny, what is the expected ph level of my tomates?
- For the business:
    - It has been rainy in Barcelona this week, to which users should we issue a warning for their plants?
    - Which plants are healthy based on their sensor data?
    
    
For this Proof of Concept, let's apply Node Binary Classification to predict if a plant is Healthy or not. To do this we will use DGL (Deep Graph Library) and Pytorch to train a simple CGN (Convolutional Graph Network).

In [6]:
%%oc --store-to labels

MATCH (n)
WITH labels(n)[0] AS lbl
RETURN DISTINCT lbl

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

In [7]:
%%oc --store-to labels_edge
MATCH (n)-[e]->(m)
WITH type(e) as lbl
RETURN DISTINCT lbl

Tab(children=(Output(layout=Layout(max_height='600px', max_width='940px', overflow='scroll')), Output(layout=L…

In [8]:
run_query("MATCH (n) RETURN n LIMIT 1")

[<Record n=<Node id=16777998 labels=frozenset({'Plant'}) properties={'iot_plant_id': '580b5c4d-933a-4b01-9b16-6ca2f81a3a1a', 'status': 1.0}>>]

In [9]:
def get_nodess_v2():
    def get_nodes():
        query = f"""
            MATCH (p:Plant)<-[m:MEASURES]-(s:Sensor)-[r:REGISTERS]->(e:Event)-[c:CONTAINS]->(sd)
            WITH p, m, s, r, e, c, sd
            WHERE sd['value'] IS NOT NULL AND labels(sd)[0] <> 'iot_light' AND labels(sd)[0] <> 'iot_motion'
            RETURN ID(p) AS plant_id, ID(s) AS sensor_id, ID(sd) AS sdid, p['status'] AS plant_status, ID(e) AS event_id, sd['value'] AS value, labels(sd)[0] AS value_label
        """
        result = run_query(query)
        data = []
        label = None
        for record in result:
            row = dict(record)
            data.append(row)
        df = pd.DataFrame(data)
        stream = io.StringIO()
        df.to_csv(stream, index=False)
        stream.seek(0)
        df = pd.read_csv(stream)
        stream.close()
        return df
    node_dfs = get_nodes()
    node_dfs = node_dfs.reset_index(drop=True)
    grouped = node_dfs.groupby("value_label")
    groups = []
    scaler = MinMaxScaler()
    node_dfs['value'] = grouped['value'].transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten())
    one_hot = OneHotEncoder(sparse=False)
    encoded = one_hot.fit_transform(node_dfs['value_label'].values.reshape(-1, 1))
    one_hot_df = pd.DataFrame(encoded, columns=one_hot.get_feature_names_out(['value_label']))
    node_dfs = pd.concat([node_dfs.drop('value_label', axis=1), one_hot_df], axis=1)
    cols = node_dfs.columns
    return node_dfs


nodes_df2 = get_nodess_v2()
nodes_df2 = nodes_df2.reset_index(drop=True)

# Separate 5 observations for final demonstration
sample_df = nodes_df2.sample(5)
sample_df = sample_df.reset_index(drop=True)
sample_df.to_csv("sample.csv", header=True, index=False)

node_indices = dict()
for idcol in ['plant_id', 'sensor_id', 'sdid', 'event_id']:
    for i, nodeid in enumerate(nodes_df2[idcol]):
        node_indices[nodeid] = i

sample_node_indices = dict()
for idcol in ['plant_id', 'sensor_id', 'sdid', 'event_id']:
    for i, nodeid in enumerate(sample_df[idcol]):
        sample_node_indices[nodeid] = i

#node_indices = {id: i for i, id in enumerate(nodes_df2['plant_id'])}
#sample_node_indices = {id:i for i, id in enumerate(sample_df['plant_id'])}
nodes_df2.head()


Unnamed: 0,plant_id,sensor_id,sdid,plant_status,event_id,value,value_label_iot_co,value_label_iot_humidity,value_label_iot_lpg,value_label_iot_rainfall,value_label_iot_smoke,value_label_iot_soil_humidity,value_label_iot_soil_nitrogen,value_label_iot_soil_ph,value_label_iot_soil_phosporous,value_label_iot_soil_potassium,value_label_iot_soil_temp,value_label_iot_temp
0,PLANT_2d04335e-db7d-4f83-a21c-104be59d1c85,SENSOR_az:12:rb:ss:34:gh,SENSORDATA_0dce1170-17fe-52f6-bec1-f84786203e67,0.0,EVENT_5ac10a11-ebe1-5f3a-a080-18ab29e1351b,0.539419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,PLANT_3f6c4278-9698-4e08-b386-459fb50008b8,SENSOR_az:12:rb:ss:34:gh,SENSORDATA_0dce1170-17fe-52f6-bec1-f84786203e67,0.0,EVENT_5ac10a11-ebe1-5f3a-a080-18ab29e1351b,0.539419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,PLANT_e8f48e96-1fd0-4435-b02a-d5e14f39196f,SENSOR_az:12:rb:ss:34:gh,SENSORDATA_0dce1170-17fe-52f6-bec1-f84786203e67,0.0,EVENT_5ac10a11-ebe1-5f3a-a080-18ab29e1351b,0.539419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,PLANT_5a508647-c0e4-421f-8296-f0ff1ed02ea4,SENSOR_az:12:rb:ss:34:gh,SENSORDATA_0dce1170-17fe-52f6-bec1-f84786203e67,0.0,EVENT_5ac10a11-ebe1-5f3a-a080-18ab29e1351b,0.539419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,PLANT_2d04335e-db7d-4f83-a21c-104be59d1c85,SENSOR_az:12:rb:ss:34:gh,SENSORDATA_fbdaa1cd-3d66-5f67-aae6-d98f34ea6f10,0.0,EVENT_5ac10a11-ebe1-5f3a-a080-18ab29e1351b,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def get_edges(label):
    query = f"MATCH(n)-[e:{label}]->(m) RETURN type(e) AS lbl, id(e) AS idedge, id(n) AS start, id(m) AS end"
    result = run_query(query)
    data = []
    for record in result:
        id_ = record['idedge']
        start_node = record['start']
        end_node = record['end']
        label = record['lbl']
        row = {'id': id_, 'start_node': start_node, 'end_node': end_node, 'lbl': label}
        data.append(row)
    df = pd.DataFrame(data)
    return df

edge_labels = [lbl for lbl in ['MEASURES', 'REGISTERS', 'CONTAINS']]
edge_dfs = [get_edges(lbl) for lbl in edge_labels]
edges_df = pd.concat(edge_dfs)
edges_df = edges_df.reset_index(drop=True)

sample_edge_labels = [lbl for lbl in ['MEASURES', 'REGISTERS', 'CONTAINS']]
sample_edge_dfs = [get_edges(lbl) for lbl in sample_edge_labels]
sample_edges_df = pd.concat(sample_edge_dfs)
sample_edges_df = sample_edges_df.reset_index(drop=True)

edges_df['source_idx'] = edges_df['start_node'].map(node_indices)
edges_df['target_idx'] = edges_df['end_node'].map(node_indices)

sample_edges_df['source_idx'] = sample_edges_df['start_node'].map(sample_node_indices)
sample_edges_df['target_idx'] = sample_edges_df['end_node'].map(sample_node_indices)

encoder = LabelEncoder()
encoded_labels_edges = encoder.fit_transform(edges_df['lbl'].unique())
encoded_labels_edges_map = {lbl: enclbl for lbl, enclbl in zip(edges_df['lbl'].unique(), encoded_labels_edges)}
edges_df['label_encoded'] = edges_df['lbl'].map(encoded_labels_edges_map)

sample_encoder = LabelEncoder()
sample_encoded_labels_edges = sample_encoder.fit_transform(sample_edges_df['lbl'].unique())
sample_encoded_labels_edges_map = {lbl: enclbl for lbl, enclbl in zip(sample_edges_df['lbl'].unique(), sample_encoded_labels_edges)}
sample_edges_df['label_encoded'] = sample_edges_df['lbl'].map(sample_encoded_labels_edges_map)

sample_edges_df.to_csv("sample-edges.csv", header=True, index=False)

edges_df.head()
#sample_edges_df.head()

Unnamed: 0,id,start_node,end_node,lbl,source_idx,target_idx,label_encoded
0,E_baea41ef-a4a6-54b8-afa2-e96a1f9e19a8,SENSORDATA_d20cacf8-a4f7-5593-a6e4-256840de8847,PLANT_580b5c4d-933a-4b01-9b16-6ca2f81a3a1a,MEASURES,1505.0,,1
1,E_6135a13d-579d-5c21-b9c7-f6611cab7296,SENSORDATA_8cb19054-4215-5c58-82a1-0d5cacc90a2e,PLANT_580b5c4d-933a-4b01-9b16-6ca2f81a3a1a,MEASURES,1478.0,,1
2,E_55ccc6b7-3ec0-5263-a13e-9eaa5544fcb3,SENSORDATA_567be274-f348-5aed-b279-4db507089126,PLANT_580b5c4d-933a-4b01-9b16-6ca2f81a3a1a,MEASURES,1523.0,,1
3,E_836f13b3-4ed4-5e02-b8e1-f03a0ad292f7,SENSORDATA_72591c58-28a7-56e2-92ae-e70a3f4b1940,PLANT_580b5c4d-933a-4b01-9b16-6ca2f81a3a1a,MEASURES,1469.0,,1
4,E_31ed5b9b-e6a3-5ee3-b095-47ece53a12f0,SENSORDATA_dc058ba9-5284-52e4-a0f3-34a3750a4073,PLANT_580b5c4d-933a-4b01-9b16-6ca2f81a3a1a,MEASURES,1433.0,,1


### Defining the graph

In [11]:
g = dgl.graph((edges_df['source_idx'], edges_df['target_idx']))
for v in nodes_df2.columns:
    if 'value' in v:
        print(v)
        g.ndata[v] = tensor(nodes_df2[v])
g.ndata['plant_status'] = tensor(nodes_df2['plant_status'])
g = dgl.add_self_loop(g)

value
value_label_iot_co
value_label_iot_humidity
value_label_iot_lpg
value_label_iot_rainfall
value_label_iot_smoke
value_label_iot_soil_humidity
value_label_iot_soil_nitrogen
value_label_iot_soil_ph
value_label_iot_soil_phosporous
value_label_iot_soil_potassium
value_label_iot_soil_temp
value_label_iot_temp


### Training the Network

In [12]:
class GCNModel(Module):
    def __init__(self, in_feats, h_feats):
        super(GCNModel, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, 1)

    def forward(self, g, in_feat):
        h =  self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h.squeeze()

In [13]:
feature_cols = [valcol for valcol in g.ndata if valcol != 'label_encoded']
features = stack([g.ndata[feat] for feat in g.ndata], dim=1)
model_labels = g.ndata['plant_status'].float()
num_classes = len(model_labels.unique())
train_mask, test_mask = train_test_split(range(len(features)), test_size=0.2, random_state=42)
sample_mask = random.sample(test_mask, 5)
test_mask = [m for m in test_mask if m not in sample_mask]
train_mask = tensor(train_mask).numpy()
test_mask = tensor(test_mask).numpy()
input_features = features.shape[1]
h_feats = 6
num_epochs = 30
learning_rate = 0.005

nodes_df2.loc[sample_mask].to_csv("sample.csv", index=True, header=True)
#feature_cols = [valcol for valcol in g.ndata if valcol != 'label_encoded']
#features = stack([g.ndata[valcol] for valcol in g.ndata if valcol != 'label_encoded'], dim=1)
#model_labels = g.ndata['status_Plant'].long()
#num_classes = len(model_labels.unique())
#train_mask, test_mask = train_test_split(range(len(features[0])), test_size=0.2, random_state=42)
#train_mask = tensor(train_mask).numpy()
#test_mask = tensor(test_mask).numpy()
#input_features = features.shape[1]
#h_feats = 200
#num_epochs = 5
#learning_rate = 1

In [14]:
model = GCNModel(input_features, h_feats).double()

In [15]:
def train(g, model, features=features, labels=model_labels, 
          train_mask=train_mask, test_mask=test_mask, lr=learning_rate, num_epochs=num_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.BCELoss()
    
    best_val_acc = 0
    best_test_acc = 0
    for e in range(num_epochs):
        model.train()
        # Forward
        logits = model(g, features)
        
        # Compute loss
        probs = torch.sigmoid(logits)
        loss = criterion(probs[train_mask], labels[train_mask].double())

        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute accuracy on training/validation/test
        train_pred = (probs[train_mask] > 0.5).float()
        test_pred = (probs[test_mask] > 0.5).float()
        train_acc = (train_pred == labels[train_mask]).float().mean().item()
        test_acc = (test_pred == labels[test_mask]).float().mean().item()
        print(f"Epoch {e}, Loss {loss}, Train acc {train_acc}, Test acc. {test_acc}")
    return model

model = train(g, model)
torch.save(model.state_dict(), 'model.pth')

#fig = plt.figure()
#ax1 = fig.add_subplot()
#ax1.scatter(x=[i for i in range(len(losses))], c='b',y=losses, label='test', marker='s')
#plt.show()

Epoch 0, Loss 0.657881443485794, Train acc 0.5220916271209717, Test acc. 0.5549226999282837
Epoch 1, Loss 0.6550750201375659, Train acc 0.6128901243209839, Test acc. 0.6265256404876709
Epoch 2, Loss 0.6522113261717174, Train acc 0.644912838935852, Test acc. 0.6533767580986023
Epoch 3, Loss 0.6493159105662806, Train acc 0.6657884120941162, Test acc. 0.6729047894477844
Epoch 4, Loss 0.6464121266255195, Train acc 0.6751114726066589, Test acc. 0.6818551421165466
Epoch 5, Loss 0.6434359929121634, Train acc 0.7026753425598145, Test acc. 0.7070789337158203
Epoch 6, Loss 0.6404155990515542, Train acc 0.7423996925354004, Test acc. 0.7477623820304871
Epoch 7, Loss 0.6373992305663986, Train acc 0.7521280646324158, Test acc. 0.7575264573097229
Epoch 8, Loss 0.6343538493949212, Train acc 0.769152820110321, Test acc. 0.7681041359901428
Epoch 9, Loss 0.6312188784099431, Train acc 0.8344142436981201, Test acc. 0.8323840498924255
Epoch 10, Loss 0.6280126619483621, Train acc 0.8481962084770203, Test acc

This model can be used to use classification to predict if a Plant is Healthy or not, based on sensor data.

We can now use a function to simulate incoming new data, and predicting values for it.

In [21]:
sample_df = pd.read_csv("sample.csv", index_col=0)

model.eval()  # set the model to evaluation mode

# Pass the new data through the model
with torch.no_grad():  # disable gradient computation to save memory
    logits = model(g, features)
    probs = torch.sigmoid(logits)
    print(probs[sample_mask])
    predictions = (probs[sample_mask] > 0.5).float()
    print(predictions)
    print(sample_df['plant_status'])
    sample_df['predicted_status'] = predictions

sample_df.to_csv("sample-results.csv", header=True, index=False)
sample_df

tensor([0.5189, 0.4738, 0.4264, 0.5933, 0.5151], dtype=torch.float64)
tensor([1., 0., 0., 1., 1.])
3968    1.0
1487    0.0
2098    0.0
5723    1.0
1425    0.0
Name: plant_status, dtype: float64


Unnamed: 0,plant_id,sensor_id,sdid,plant_status,event_id,value,value_label_iot_co,value_label_iot_humidity,value_label_iot_lpg,value_label_iot_rainfall,value_label_iot_smoke,value_label_iot_soil_humidity,value_label_iot_soil_nitrogen,value_label_iot_soil_ph,value_label_iot_soil_phosporous,value_label_iot_soil_potassium,value_label_iot_soil_temp,value_label_iot_temp,predicted_status
3968,PLANT_6328e6bf-cb0b-4c74-87c3-fd2bcdb07251,SENSOR_18:24:as:kf:24:00,SENSORDATA_58e5cf72-ee27-5cad-b4f2-230f4f761cf3,1.0,EVENT_901666b0-8de7-5fb1-9654-12ff070a6ced,0.714286,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1487,PLANT_e237c159-b17d-40ee-b584-03259b0478b7,SENSOR_kd:sd:3a:33:69:42,SENSORDATA_7117282c-337c-5109-829e-a2d9adac4825,0.0,EVENT_1db9690c-4734-51ee-9fb7-15510def9bfb,0.468399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2098,PLANT_1b5a55fc-517e-4878-9ab9-533a3939caf2,SENSOR_kd:sd:3a:33:69:42,SENSORDATA_bb6e222a-9cc0-5d80-8373-181745c5c010,0.0,EVENT_f75a3060-415d-58c3-97af-499a29d2bcf4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5723,PLANT_b874295f-5878-497c-99ff-789be0da6bde,SENSOR_o0:4e:ve:rt:1l:l1,SENSORDATA_5f3bb94f-37b3-5f1c-bc99-8ad4a9a317df,1.0,EVENT_db5f197c-0800-52fb-b206-9d42c80323a1,0.957772,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1425,PLANT_1508f8b3-72d0-45fb-a3fa-c1ce84d27810,SENSOR_kd:sd:3a:33:69:42,SENSORDATA_dc058ba9-5284-52e4-a0f3-34a3750a4073,0.0,EVENT_1db9690c-4734-51ee-9fb7-15510def9bfb,0.4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
