# Prediction test

In [35]:
import json
import numpy as np
from google.cloud import aiplatform
from google.cloud import bigquery
from google.api import httpbody_pb2

In [4]:
# Notebook-wide configuration
REGION = 'us-central1'                  # region used to build the Vertex AI API host name
PROJECT = "gavb-poc-bu-mlops-f-store"   # GCP project id
DATASET = "vertexai_teste"              # BigQuery dataset
TABLE = "iris-vertexai_transformed"     # BigQuery table with the transformed iris data

In [4]:
# Fetch five rows of the VAL split to use as example prediction inputs;
# the label and split-marker columns are removed so only the features remain.
client = bigquery.Client(project=PROJECT)
pred_ = (
    client
    .query(query=f'SELECT * FROM `{PROJECT}.{DATASET}.{TABLE}` WHERE splits="VAL" LIMIT 5')
    .result()
    .to_dataframe()
    .drop(columns=['Species', 'splits'])
)
pred_.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.985734,0.267498,-1.19904,-0.999564
1,-1.227138,-0.316134,-1.31145,-1.447966
2,-0.502925,2.893844,-1.367655,-1.335865
3,-1.589245,-0.024318,-1.367655,-1.335865
4,0.221287,-1.191583,0.093675,-0.102759


In [27]:
# Turn every DataFrame row into a plain list of its values (one list per instance)
pred = [list(row.values()) for row in pred_.to_dict(orient='records')]


In [7]:
# Locate the endpoint whose display name starts with 'vertexai'.
# When several match, the last one listed is used (same choice as the plain loop made).
matching_endpoints = [
    candidate
    for candidate in aiplatform.Endpoint.list()
    if candidate.display_name.startswith('vertexai')
]

# Fail loudly instead of leaving `endpoint` undefined (or stale from an earlier run)
if not matching_endpoints:
    raise RuntimeError("No Vertex AI endpoint with a display name starting with 'vertexai' was found.")
endpoint = matching_endpoints[-1]

print(endpoint.display_name)
print(endpoint.resource_name)


vertexai-test-endpoint
projects/52716924283/locations/us-central1/endpoints/5413493877866758144


In [8]:
# Low-level prediction client pointed at the regional API host
client_options = dict(api_endpoint=f"{REGION}-aiplatform.googleapis.com")
predictor = aiplatform.gapic.PredictionServiceClient(
    client_options=client_options,
)

In [32]:
# Assemble the JSON request body: the instances plus the "predict" signature name
instances = dict(instances=pred, signature_name="predict")
http_body = httpbody_pb2.HttpBody(
    data=json.dumps(instances).encode("utf-8"),
    content_type="application/json",
)

In [34]:
# Send the raw request to the endpoint, then decode the JSON response body
prediction = predictor.raw_predict(
    endpoint=endpoint.resource_name,
    http_body=http_body,
)
json.loads(prediction.data)

{'predictions': [0, 0, 0, 0, 1]}

# Test Monitoring - example pipeline

In [35]:
# BigQuery client for the project
client = bigquery.Client(project=PROJECT)

# Fully-qualified name of the source table
table = f"{PROJECT}.{DATASET}.{TABLE}"

# Load the TRAIN split and discard the split-marker column
data = (
    client.query(f"SELECT * FROM `{table}` WHERE splits = 'TRAIN'")
    .result()
    .to_dataframe()
    .drop(columns='splits')
)

# Write the training rows back to BigQuery, replacing the table if it already exists
data.to_gbq(
    destination_table=f"{PROJECT}.{DATASET}.train_transformed",
    project_id=PROJECT,
    if_exists="replace",
)

In [36]:
# Preview the first rows of the TRAIN frame loaded above
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,-1.167448,0.178659,-1.350002,-1.508552,0
1,-0.794025,2.510627,-1.350002,-1.508552,0
2,-1.042973,0.878249,-1.350002,-1.37231,0
3,-0.794025,1.111446,-1.350002,-1.37231,0
4,-0.669551,1.57784,-1.350002,-1.37231,0


### Mudando a distribuição dos dados.

In [37]:
# First 50 training rows, features only (label column removed)
sample = data.head(50).drop(columns='Species')
sample.shape

(50, 4)

In [43]:
import random
import json
import numpy as np
from google.cloud import aiplatform
from google.cloud import bigquery
from google.api import httpbody_pb2

def test_monitoring(df):
    """Send a randomly rescaled copy of ``df`` to the deployed endpoint and return its predictions.

    Every row is multiplied by its own random integer drawn from [-3, 5], which
    shifts the feature distribution before the rows are sent for prediction.
    The caller's DataFrame is left untouched, so the call can be repeated
    without compounding the perturbation.

    Args:
        df: DataFrame of numeric values; each row becomes one prediction instance.

    Returns:
        The endpoint's response body decoded from JSON.

    Raises:
        RuntimeError: If no endpoint has a display name starting with 'vertexai'.
    """
    # One random integer factor per row
    randomlist = [random.randint(-3, 5) for _ in range(df.shape[0])]
    print(randomlist)

    # Shift the distribution on a new frame; axis=0 applies factor i to every column of row i
    shifted = df.mul(randomlist, axis=0)

    # Format the rows for the test prediction: one plain list of values per row
    samples = [list(row.values()) for row in shifted.to_dict(orient='records')]

    # Find the endpoint (last match wins); fail clearly when nothing matches
    endpoint = None
    for candidate in aiplatform.Endpoint.list():
        if candidate.display_name.startswith('vertexai'):
            endpoint = candidate
    if endpoint is None:
        raise RuntimeError("No Vertex AI endpoint with a display name starting with 'vertexai' was found.")

    # Prediction service client
    client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
    predictor = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

    # Build the request payload
    instances = {"instances": samples, "signature_name": "predict"}
    http_body = httpbody_pb2.HttpBody(
        data=json.dumps(instances).encode("utf-8"),
        content_type="application/json",
    )

    # Prediction
    prediction = predictor.raw_predict(endpoint=endpoint.resource_name, http_body=http_body)

    return json.loads(prediction.data)

In [53]:
# Run the drift simulation on the 50-row feature sample and show the decoded endpoint response
test_monitoring(df=sample)

[3, -3, 3, 3, -2, -3, 3, 2, 3, 2, 3, 3, 2, -2, 0, 2, 5, -1, -3, 3, -3, -2, 2, 2, 2, 2, -3, -1, 0, 3, 1, 2, 0, -2, 3, 1, 4, 3, -1, -1, -3, 5, 0, -1, -2, -1, -3, -3, -2, -1]


{'predictions': [1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  2,
  0,
  0,
  1,
  1,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  0,
  2,
  1,
  2,
  2,
  2,
  1,
  1,
  1,
  0,
  1,
  1,
  2,
  1,
  2,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1]}

# Testing TFDV

In [1]:
import tensorflow_data_validation as tfdv

2022-05-31 16:40:13.202403: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-31 16:40:13.202430: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### Dados de treino como dataframe

In [5]:
# BigQuery client for the project
client = bigquery.Client(project=PROJECT)

# Fully-qualified name of the source table
table = f"{PROJECT}.{DATASET}.{TABLE}"

# Load the TRAIN split as a DataFrame
data = client.query(f"SELECT * FROM `{table}` WHERE splits = 'TRAIN'").result().to_dataframe()

# Training frame without the split-marker column
data_train = data.drop(columns='splits')

In [9]:
# Compute TFDV statistics over the training DataFrame
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=data_train)

In [12]:
# Infer a schema from the training statistics and render it as a table
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'SepalLengthCm',FLOAT,required,,-
'SepalWidthCm',FLOAT,required,,-
'PetalLengthCm',FLOAT,required,,-
'PetalWidthCm',FLOAT,required,,-
'Species',INT,required,,-


In [18]:
# Visualize the training statistics.
# NOTE(review): `visualizer` is not used by any later cell shown here — confirm whether
# the return value of visualize_statistics is worth keeping.
visualizer = tfdv.visualize_statistics(train_stats)

#### Dados Teste como dataframe

In [14]:
# BigQuery client for the project
client = bigquery.Client(project=PROJECT)

# Fully-qualified name of the source table
table = f"{PROJECT}.{DATASET}.{TABLE}"

# Load the TEST split as a DataFrame
data = client.query(f"SELECT * FROM `{table}` WHERE splits = 'TEST'").result().to_dataframe()

# Test frame without the split-marker column
data_test = data.drop(columns='splits')

In [24]:
!pip install tensorflow_metadata

You should consider upgrading via the '/home/brunovn/projetos_gavb/produtos_digitais_template/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [28]:
from tensorflow_metadata.proto.v0 import schema_pb2

# Compute statistics for the test frame and infer a schema from them.
# NOTE(review): this replaces any schema inferred earlier, and the schema is inferred from
# the same statistics it is validated against below — confirm that inferring it from the
# training statistics was not the intent.
test_stats = tfdv.generate_statistics_from_dataframe(data_test)
schema = tfdv.infer_schema(test_stats)

# Constrain SepalLengthCm to the float range [-12.1, 4.9]
tfdv.set_domain(schema, 'SepalLengthCm', schema_pb2.FloatDomain(name='SepalLengthCm', min=-12.1, max=4.9))

# Validate the test statistics against the schema and display any anomalies
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

In [29]:
# Show the schema again, now including the SepalLengthCm domain set above
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'SepalLengthCm',FLOAT,required,,min: -12.100000; max: 4.900000
'SepalWidthCm',FLOAT,required,,-
'PetalLengthCm',FLOAT,required,,-
'PetalWidthCm',FLOAT,required,,-
'Species',INT,required,,-


In [30]:
def changing_df(df):
    """Return a copy of ``df`` in which every row is scaled by its own random factor.

    One integer is drawn from [-5, -2] for each row and all columns of that row are
    multiplied by it, which shifts the distribution of the data. The input frame
    is not modified, so calling this repeatedly on the same frame does not
    compound the perturbation.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        A new DataFrame with the same index and columns holding the rescaled values.
    """
    # One random integer factor per row
    randomlist = [random.randint(-5, -2) for _ in range(df.shape[0])]
    print(randomlist)

    # Shift the distribution: axis=0 applies factor i to every column of row i,
    # and `mul` returns a new frame instead of writing into the caller's object
    return df.mul(randomlist, axis=0)

# Build the perturbed version of the test data used for the validation that follows
new_data_test = changing_df(data_test)

[-3, -5, -4, -3, -5, -5, -4, -4, -5, -5, -2, -2, -4, -5, -4, -5]


In [31]:
# Compute statistics for the perturbed test data
new_test_stats = tfdv.generate_statistics_from_dataframe(new_data_test)
# The existing schema is reused on purpose (it is not re-inferred from the perturbed data),
# so the new statistics are checked against the previously defined domain
anomalies = tfdv.validate_statistics(statistics=new_test_stats, schema=schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'SepalLengthCm',Multiple errors,Unexpectedly low values: -23.9534<-12.1(upto six significant digits) Unexpectedly high value: 44.349>4.9(upto six significant digits)


#### Testando o loading dos stats

In [32]:
import tensorflow_data_validation as tfdv


# Load statistics from the given Cloud Storage location.
# NOTE(review): the path points at one specific run's output — confirm it still exists before re-running.
stats = tfdv.load_stats_text("gs://test-bucket-vertexai/produtos-digitais-template/52716924283/produtos-digitais-teste-v9-20220601141739/generate-stats-component_-4719261686333177856/results")

In [34]:
# Visualize the loaded statistics (fixes the misspelled `visualizet_statistics`)
tfdv.visualize_statistics(stats)