In [73]:
pip install onnx

Note: you may need to restart the kernel to use updated packages.


In [107]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import cassandra
import pyspark
import re
import os
import random
from random import randint, randrange
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeans
import seaborn as sns
from pyspark.ml.stat import Correlation
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import PCA, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, asc
from pyspark.sql.functions import isnan
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple, buildInputDictSimple
from pyspark.ml import Pipeline
import onnx
import onnxruntime

In [2]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows =  5, truncate = True):
    if(truncate):
        pd.set_option('display.max_colwidth', 50)
    else:
        pd.set_option('display.max_colwidth', -1)
    pd.set_option('display.max_rows', limitRows)
    display(df.limit(limitRows).toPandas())
    pd.reset_option('display.max_rows')

In [3]:
def correlation_matrix(df, corr_columns, method='pearson'):
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)

    result = matrix.collect()[0]["pearson({})".format(vector_col)].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)), columns=corr_columns, index=corr_columns)

In [4]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Model 1')
    plt.xlabel('Model 2')

In [5]:
from cassandra.cluster import Cluster

cluster = Cluster(['dse'])
session = cluster.connect()

In [6]:
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS accelerate 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
)

<cassandra.cluster.ResultSet at 0x7f917fd49e90>

In [7]:
session.set_keyspace('accelerate')

In [8]:
query = "CREATE TABLE IF NOT EXISTS diabetes \
                                   (Id int, timesPregnant int, plasmaGlucose int, bloodPressure int, \
                                   tricepThickness int, serumInsulin int, bmi float, diabetesPedegree float, \
                                   age int, label int, PRIMARY KEY (Id))"
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f917c0d5790>

In [9]:
fileName = 'data/pima-indians-diabetes.csv'
input_file = open(fileName, 'r')
i = 1
for line in input_file:
    iD = i
    row = line.split(',')

    query = "INSERT INTO diabetes (Id, timesPregnant, plasmaGlucose, bloodPressure, \
                                   tricepThickness, serumInsulin, bmi, diabetesPedegree, \
                                   age, label)"
    query = query + " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    session.execute(query, (int(iD), int(row[0]), int(row[1]), int(row[2]), int(row[3]), int(row[4]), float(row[5]), float(row[6]), int(row[7]), int(row[8])))
    i = i + 1

In [10]:
spark = SparkSession.builder.appName('demo').master("local").getOrCreate()
diabetesDF = spark.read.format("org.apache.spark.sql.cassandra").options(table="diabetes", keyspace="accelerate").load()

print ("Table Row Count: ")
print (diabetesDF.count())

Table Row Count: 
768


In [11]:
print(diabetesDF.schema.names)
print([diabetesDF.where((col(c_name) == 0)).count() for c_name in diabetesDF.schema.names])

['id', 'age', 'bloodpressure', 'bmi', 'diabetespedegree', 'label', 'plasmaglucose', 'seruminsulin', 'timespregnant', 'tricepthickness']
[0, 0, 35, 11, 0, 500, 5, 374, 111, 227]


In [12]:
diabetesDF = diabetesDF.withColumn("bloodpressure", F.when(F.col("bloodpressure")==0, float("nan")).otherwise(F.col("bloodpressure")))
diabetesDF = diabetesDF.withColumn("plasmaglucose", F.when(F.col("plasmaglucose")==0, float("nan")).otherwise(F.col("plasmaglucose")))
diabetesDF = diabetesDF.withColumn("tricepthickness", F.when(F.col("tricepthickness")==0, float("nan")).otherwise(F.col("tricepthickness")))
diabetesDF = diabetesDF.withColumn("seruminsulin", F.when(F.col("seruminsulin")==0, float("nan")).otherwise(F.col("seruminsulin")))
diabetesDF = diabetesDF.withColumn("bmi", F.when(F.col("bmi")==0, float("nan")).otherwise(F.col("bmi")))

In [13]:
print(diabetesDF.schema.names)
print([diabetesDF.where((col(c_name) == 0)).count() for c_name in diabetesDF.schema.names])

['id', 'age', 'bloodpressure', 'bmi', 'diabetespedegree', 'label', 'plasmaglucose', 'seruminsulin', 'timespregnant', 'tricepthickness']
[0, 0, 0, 0, 0, 500, 0, 0, 111, 0]


In [14]:
imputer = Imputer()
imputer.setInputCols(["plasmaglucose", "bloodpressure", "bmi", "tricepthickness", "seruminsulin"])
imputer.setOutputCols(["out_plasmaglucose", "out_bloodpressure", "out_bmi", "out_tricepthickness", "out_seruminsulin"])
model = imputer.fit(diabetesDF)
#model.setInputCols(["plasmaglucose", "bloodpressure", "bmi"])
diabetesDF_imputed = model.transform(diabetesDF)
showDF(diabetesDF_imputed,100)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0
5,91,21,55.0,19.1,0.258,0,80.0,,1,,19.1,80.0,29.15342,155.548223,55.0
6,128,23,58.0,33.299999,0.261,0,118.0,94.0,1,36.0,33.299999,118.0,36.0,94.0,58.0
7,363,65,108.0,39.200001,0.305,0,103.0,,5,37.0,39.200001,103.0,37.0,155.548223,108.0
8,251,42,52.0,31.200001,0.38,0,106.0,,9,,31.200001,106.0,29.15342,155.548223,52.0
9,744,45,94.0,32.700001,0.734,1,140.0,,9,,32.700001,140.0,29.15342,155.548223,94.0


In [18]:
initial_types = buildInitialTypesSimple(diabetesDF.drop("label").drop("id").drop("age").drop("diabetespedegree").drop("timespregnant"))
print(initial_types)
onnx_model = convert_sparkml(model, 'Pyspark Imputer model', initial_types, spark_session = spark)
#print(onnx_model.SerializeToString())

The maximum opset needed by this model is only 4.


[('bloodpressure', FloatTensorType(shape=[1, 1])), ('bmi', FloatTensorType(shape=[1, 1])), ('plasmaglucose', FloatTensorType(shape=[1, 1])), ('seruminsulin', FloatTensorType(shape=[1, 1])), ('tricepthickness', FloatTensorType(shape=[1, 1]))]
b'\x08\x03\x12\x0bOnnxMLTools\x1a\x051.7.0"\x14onnxconverter-common(\x002\x00:\x83\x06\np\n\rplasmaglucose\n\rbloodpressure\n\x03bmi\n\x0ftricepthickness\n\x0cseruminsulin\x12\rconcat_tensor\x1a\x06Concat"\x06Concat*\x0b\n\x04axis\x18\x01\xa0\x01\x02:\x00\n\x91\x01\n\rconcat_tensor\x12\x0eimputed_tensor\x1a\x07Imputer"\x07Imputer*2\n\x14imputed_value_floats=\x9f_\xf3B=t\xcf\x90B=q\xd4\x01B=4:\xe9A=X\x8c\x1bC\xa0\x01\x06*\x1e\n\x14replaced_value_float\x15\x00\x00\xc0\x7f\xa0\x01\x01:\nai.onnx.ml\n\x97\x01\n\x0eimputed_tensor\x12\x11out_plasmaglucose\x12\x11out_bloodpressure\x12\x07out_bmi\x12\x13out_tricepthickness\x12\x10out_seruminsulin\x1a\x05Split"\x05Split*\x0b\n\x04axis\x18\x01\xa0\x01\x02*\x12\n\x05split@\x01@\x02@\x03@\x04\xa0\x01\x07:\x00\x

In [26]:
with open(os.path.join("/home/jovyan/work/models/", "imputer_model.onnx"), "wb") as f:
    f.write(onnx_model.SerializeToString())

In [29]:
tempDF = diabetesDF_imputed.filter(F.col("out_tricepthickness") != float("nan"))
tempDF = diabetesDF_imputed.filter(F.col("out_seruminsulin") != float("nan"))

print ("Table Row Count: ")
print (tempDF.count())

showDF(tempDF,100)

Table Row Count: 
768


Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0
5,91,21,55.0,19.1,0.258,0,80.0,,1,,19.1,80.0,29.15342,155.548223,55.0
6,128,23,58.0,33.299999,0.261,0,118.0,94.0,1,36.0,33.299999,118.0,36.0,94.0,58.0
7,363,65,108.0,39.200001,0.305,0,103.0,,5,37.0,39.200001,103.0,37.0,155.548223,108.0
8,251,42,52.0,31.200001,0.38,0,106.0,,9,,31.200001,106.0,29.15342,155.548223,52.0
9,744,45,94.0,32.700001,0.734,1,140.0,,9,,32.700001,140.0,29.15342,155.548223,94.0


In [30]:
assembler = VectorAssembler(
    inputCols=['age', 'out_bloodpressure', 'out_bmi', 'diabetespedegree', 'out_plasmaglucose', 'out_seruminsulin', 'timespregnant', 'out_tricepthickness'],
    outputCol='features', handleInvalid = "keep")

dDF = assembler.transform(diabetesDF_imputed)
showDF(dDF)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0,"[41.0, 90.0, 39.79999923706055, 0.451000005006..."
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0,"[25.0, 62.0, 34.0, 0.39100000262260437, 76.0, ..."
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0,"[27.0, 82.0, 34.20000076293945, 1.292000055313..."
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0,"[30.0, 66.0, 24.399999618530273, 0.34200000762..."
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0,"[24.0, 85.0, 37.400001525878906, 0.24699999392..."


In [33]:
initial_types = buildInitialTypesSimple(diabetesDF_imputed.drop('label','id','bloodpressure', 'bmi', 'seruminsulin', 'plasmaglucose', 'tricepthickness'))
print(initial_types)
onnx_model = convert_sparkml(assembler, 'Pyspark Vector Assembler model', initial_types, spark_session = spark)
#print(onnx_model.SerializeToString())

The maximum opset needed by this model is only 4.


[('age', FloatTensorType(shape=[1, 1])), ('diabetespedegree', FloatTensorType(shape=[1, 1])), ('timespregnant', FloatTensorType(shape=[1, 1])), ('out_bmi', FloatTensorType(shape=[1, 1])), ('out_plasmaglucose', FloatTensorType(shape=[1, 1])), ('out_tricepthickness', FloatTensorType(shape=[1, 1])), ('out_seruminsulin', FloatTensorType(shape=[1, 1])), ('out_bloodpressure', FloatTensorType(shape=[1, 1]))]


In [34]:
with open(os.path.join("/home/jovyan/work/models/", "vector_assembler_model.onnx"), "wb") as f:
    f.write(onnx_model.SerializeToString())

In [35]:
splits = dDF.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

print ("Train Dataframe Row Count: ")
print (train.count())
print ("Test Datafram Row Count: ")
print (test.count())

Train Dataframe Row Count: 
527
Test Datafram Row Count: 
241


In [36]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rf_model = rf.fit(train)

rf_predictions = rf_model.transform(test)
showDF(rf_predictions)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features,rawPrediction,probability,prediction
0,5,33,40.0,43.099998,2.288,1,137.0,168.0,0,35.0,43.099998,137.0,35.0,168.0,40.0,"[33.0, 40.0, 43.099998474121094, 2.28800010681...","[2.5457817436052976, 7.454218256394702]","[0.25457817436052976, 0.7454218256394702]",1.0
1,28,22,66.0,23.200001,0.487,0,97.0,140.0,1,15.0,23.200001,97.0,15.0,140.0,66.0,"[22.0, 66.0, 23.200000762939453, 0.48699998855...","[9.674142262222148, 0.32585773777785104]","[0.967414226222215, 0.03258577377778511]",0.0
2,30,38,92.0,34.099998,0.337,0,117.0,,5,,34.099998,117.0,29.15342,155.548223,92.0,"[38.0, 92.0, 34.099998474121094, 0.33700001239...","[5.9262314463222365, 4.073768553677764]","[0.5926231446322237, 0.40737685536777646]",0.0
3,42,37,84.0,40.200001,0.696,0,133.0,,7,,40.200001,133.0,29.15342,155.548223,84.0,"[37.0, 84.0, 40.20000076293945, 0.695999979972...","[3.029922137428934, 6.970077862571066]","[0.3029922137428934, 0.6970077862571066]",1.0
4,52,26,50.0,24.200001,0.526,0,101.0,36.0,1,15.0,24.200001,101.0,15.0,36.0,50.0,"[26.0, 50.0, 24.200000762939453, 0.52600002288...","[9.338394907724794, 0.6616050922752055]","[0.9338394907724794, 0.06616050922752055]",0.0


In [53]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
rf_score = evaluator.evaluate(rf_predictions)
print("Test set accuracy for Random Forest Classifier = " + str(rf_score))

Test set accuracy for Random Forest Classifier = 0.7717842323651453


In [54]:
pipeline = Pipeline(stages=[imputer, assembler, rf])

In [55]:
splits = diabetesDF.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

print ("Train Dataframe Row Count: ")
print (train.count())
print ("Test Datafram Row Count: ")
print (test.count())

Train Dataframe Row Count: 
563
Test Datafram Row Count: 
205


In [56]:
model = pipeline.fit(train)

In [58]:
prediction = model.transform(test)
showDF(prediction)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features,rawPrediction,probability,prediction
0,13,57,80.0,27.1,1.441,0,139.0,,10,,27.1,139.0,29.023499,158.450704,80.0,"[57.0, 80.0, 27.100000381469727, 1.44099998474...","[8.96963459360539, 1.030365406394612]","[0.8969634593605388, 0.10303654063946117]",0.0
1,28,22,66.0,23.200001,0.487,0,97.0,140.0,1,15.0,23.200001,97.0,15.0,140.0,66.0,"[22.0, 66.0, 23.200000762939453, 0.48699998855...","[9.430299366199796, 0.5697006338002034]","[0.9430299366199797, 0.05697006338002035]",0.0
2,46,25,66.0,42.0,1.893,1,180.0,,0,39.0,42.0,180.0,39.0,158.450704,66.0,"[25.0, 66.0, 42.0, 1.8930000066757202, 180.0, ...","[3.2459606383754984, 6.754039361624501]","[0.32459606383754985, 0.6754039361624501]",1.0
3,60,22,64.0,41.5,0.173,0,105.0,142.0,0,41.0,41.5,105.0,41.0,142.0,64.0,"[22.0, 64.0, 41.5, 0.17299999296665192, 105.0,...","[7.652376523677515, 2.347623476322486]","[0.7652376523677515, 0.2347623476322486]",0.0
4,73,42,90.0,43.400002,0.583,1,126.0,,13,,43.400002,126.0,29.023499,158.450704,90.0,"[42.0, 90.0, 43.400001525878906, 0.58300000429...","[6.156169546254334, 3.8438304537456665]","[0.6156169546254334, 0.38438304537456663]",0.0


In [163]:
showDF(train)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness
0,1,50,72.0,33.599998,0.627,1,148.0,,6,35.0
1,2,31,66.0,26.6,0.351,0,85.0,,1,29.0
2,5,33,40.0,43.099998,2.288,1,137.0,168.0,0,35.0
3,8,29,,35.299999,0.134,0,115.0,,10,
4,10,54,96.0,,0.232,1,125.0,,8,


In [164]:
initial_types = buildInitialTypesSimple(diabetesDF.drop("label", "id"))
print(initial_types)
onnx_model = convert_sparkml(model, 'Pyspark Pipeline model', initial_types, spark_session = spark)

[('age', FloatTensorType(shape=[1, 1])), ('bloodpressure', FloatTensorType(shape=[1, 1])), ('bmi', FloatTensorType(shape=[1, 1])), ('diabetespedegree', FloatTensorType(shape=[1, 1])), ('plasmaglucose', FloatTensorType(shape=[1, 1])), ('seruminsulin', FloatTensorType(shape=[1, 1])), ('timespregnant', FloatTensorType(shape=[1, 1])), ('tricepthickness', FloatTensorType(shape=[1, 1]))]


The maximum opset needed by this model is only 4.


In [61]:
with open(os.path.join("/home/jovyan/work/models/", "pipeline_model.onnx"), "wb") as f:
    f.write(onnx_model.SerializeToString())

In [83]:
session.execute("drop table models")

<cassandra.cluster.ResultSet at 0x7f9174c64110>

In [84]:
query = "CREATE TABLE IF NOT EXISTS models \
                                   (Id int, name text, model blob, description text, PRIMARY KEY (Id))"
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f915e7cec50>

In [85]:
query = "INSERT INTO models (Id, name, model, description)"
query = query + " VALUES (%s, %s, %s, %s)"
session.execute(query, (int(0), str("Pipeline Model"), (onnx_model.SerializeToString()), str("Full Pima Indians Diabetes Pipeline Model")))

<cassandra.cluster.ResultSet at 0x7f915e7ceed0>

In [105]:
print(session.execute("SELECT * FROM models where Id = 0").current_rows[0].model)

b'\x08\x03\x12\x0bOnnxMLTools\x1a\x051.7.0"\x14onnxconverter-common(\x002\x00:\xc1u\np\n\rplasmaglucose\n\rbloodpressure\n\x03bmi\n\x0ftricepthickness\n\x0cseruminsulin\x12\rconcat_tensor\x1a\x06Concat"\x06Concat*\x0b\n\x04axis\x18\x01\xa0\x01\x02:\x00\n\x91\x01\n\rconcat_tensor\x12\x0eimputed_tensor\x1a\x07Imputer"\x07Imputer*2\n\x14imputed_value_floats=up\xf4B=v\x94\x90B=\xb0\xee\x00B= 0\xe8A=as\x1eC\xa0\x01\x06*\x1e\n\x14replaced_value_float\x15\x00\x00\xc0\x7f\xa0\x01\x01:\nai.onnx.ml\n\x97\x01\n\x0eimputed_tensor\x12\x11out_plasmaglucose\x12\x11out_bloodpressure\x12\x07out_bmi\x12\x13out_tricepthickness\x12\x10out_seruminsulin\x1a\x05Split"\x05Split*\x0b\n\x04axis\x18\x01\xa0\x01\x02*\x12\n\x05split@\x01@\x02@\x03@\x04\xa0\x01\x07:\x00\n\xa6\x01\n\x03age\n\x11out_bloodpressure\n\x07out_bmi\n\x10diabetespedegree\n\x11out_plasmaglucose\n\x10out_seruminsulin\n\rtimespregnant\n\x13out_tricepthickness\x12\x08features\x1a\x07Concat1"\x06Concat*\x0b\n\x04axis\x18\x01\xa0\x01\x02:\x00\n\x

In [200]:
loaded_model = onnx.load_from_string(session.execute("SELECT * FROM models where Id = 0").current_rows[0].model)
print(loaded_model)

ir_version: 3
producer_name: "OnnxMLTools"
producer_version: "1.7.0"
domain: "onnxconverter-common"
model_version: 0
doc_string: ""
graph {
  node {
    input: "plasmaglucose"
    input: "bloodpressure"
    input: "bmi"
    input: "tricepthickness"
    input: "seruminsulin"
    output: "concat_tensor"
    name: "Concat"
    op_type: "Concat"
    attribute {
      name: "axis"
      i: 1
      type: INT
    }
    domain: ""
  }
  node {
    input: "concat_tensor"
    output: "imputed_tensor"
    name: "Imputer"
    op_type: "Imputer"
    attribute {
      name: "imputed_value_floats"
      floats: 122.21964263916016
      floats: 72.28996276855469
      floats: 32.23309326171875
      floats: 29.02349853515625
      floats: 158.45069885253906
      type: FLOATS
    }
    attribute {
      name: "replaced_value_float"
      f: nan
      type: FLOAT
    }
    domain: "ai.onnx.ml"
  }
  node {
    input: "imputed_tensor"
    output: "out_plasmaglucose"
    output: "out_bloodpressure"
    o

In [199]:
onnxsession = onnxruntime.InferenceSession("/home/jovyan/work/models/pipeline_model.onnx", None)
output = onnxsession.get_outputs()[0]
inputs = onnxsession.get_inputs()
print(inputs[7])
print("\n")
#test_row = test.drop("label", "id").collect()[1]
#print(test_row)
#test_row_DF = spark.createDataFrame(spark.sparkContext.parallelize(test_row), test.drop("label", "id").schema)
print(test.drop("label", "id").schema)
input_data = {}
input_data['age'] = np.float32(test.select("age").toPandas().values[1].reshape(1,1))
input_data['bloodpressure'] = np.float32(test.select("bloodpressure").toPandas().values[1].reshape(1,1))
print(input_data["bloodpressure"].dtype)
input_data['bmi'] = np.float32(test.select("bmi").toPandas().values[1].reshape(1,1))
input_data['diabetespedegree'] = np.float32(test.select("diabetespedegree").toPandas().values[1].reshape(1,1))
input_data['plasmaglucose'] = np.float32(test.select("plasmaglucose").toPandas().values[1].reshape(1,1))
input_data['seruminsulin'] = np.float32(test.select("seruminsulin").toPandas().values[1].reshape(1,1))
input_data['timespregnant'] = np.float32(test.select("timespregnant").toPandas().values[1].reshape(1,1))
input_data['tricepthickness'] = np.float32(test.select("tricepthickness").toPandas().values[1].reshape(1,1))
print(input_data)
#input_data= buildInputDictSimple(test_row_DF)
results = onnxsession.run(None, input_data)

NodeArg(name='tricepthickness', type='tensor(float)', shape=[1, 1])


StructType(List(StructField(age,IntegerType,true),StructField(bloodpressure,DoubleType,true),StructField(bmi,DoubleType,true),StructField(diabetespedegree,FloatType,true),StructField(plasmaglucose,DoubleType,true),StructField(seruminsulin,DoubleType,true),StructField(timespregnant,IntegerType,true),StructField(tricepthickness,DoubleType,true)))
float32
{'age': array([[22.]], dtype=float32), 'bloodpressure': array([[66.]], dtype=float32), 'bmi': array([[23.2]], dtype=float32), 'diabetespedegree': array([[0.487]], dtype=float32), 'plasmaglucose': array([[97.]], dtype=float32), 'seruminsulin': array([[140.]], dtype=float32), 'timespregnant': array([[1.]], dtype=float32), 'tricepthickness': array([[15.]], dtype=float32)}


Fail: [ONNXRuntimeError] : 1 : FAIL : Non-zero status code returned while running Split node. Name:'Split' Status Message: Cannot split using values in 'split' attribute. Axis=1 Input shape={1,5} NumOutputs=5 Num entries in 'split' (must equal number of outputs) was 4 Sum of sizes in 'split' (must equal size of selected axis) was 10