In [1]:
import pandas as pd
import numpy as np
import random
import math
import pickle
import json
import os
import requests
import datetime
import boto3
from botocore.exceptions import NoCredentialsError
from domino.training_sets import TrainingSetClient, model

#set DMM vars
bucket = 'wine-quality-monitoring'

#Load in data

In [2]:
print('Reading in data for batch scoring')
df = TrainingSetClient.get_training_set_version('mlops-training-', number = 1).load_raw_pandas()

Reading in data for batch scoring


In [3]:
df.head()

Unnamed: 0,id,density,volatile_acidity,chlorides,is_red,alcohol,quality
0,0,1.001,0.27,0.045,0,8.8,5.58
1,1,0.994,0.3,0.049,0,9.5,5.04
2,2,0.9951,0.28,0.05,0,10.1,5.34
3,3,0.9956,0.23,0.058,0,9.9,4.92
4,4,0.9956,0.23,0.058,0,9.9,5.16


In [4]:
df2 = df.append(df).reset_index(drop=True)

  df2 = df.append(df).reset_index(drop=True)


In [5]:
df.columns

Index(['id', 'density', 'volatile_acidity', 'chlorides', 'is_red', 'alcohol',
       'quality'],
      dtype='object')

In [27]:
##For each input feature adjust data and round/cast as necessary
#Density - 50%-150
densityJitter = df2.density.apply(lambda x : x*(random.randrange(50,150))/100).round(4)
#volatile acidity - 70%-130%
volatileAcidityJitter = df2.volatile_acidity.apply(lambda x : x*(random.randrange(70,130)/100)).round(2)
#Chlorides - 80%-120%
chloridesJitter = df2.chlorides.apply(lambda x : x*(random.randrange(80,120)/100)).round(3)
#is_red - 40%-160%
is_redJitter = df2.is_red.apply(lambda x : x*(random.randrange(40,160)/100)).round(0)
#alcohol - 90%-110%
alcoholJitter = df2.alcohol.apply(lambda x : x*(random.randrange(90,110)/100)).round(1)

#Take all the new 'jittered' variables and write to a new df
#Keep original custid and churn_Y fields
df3 = pd.DataFrame({'id': df2.id,
       'density': densityJitter, 
       'volatile_acidity': volatileAcidityJitter,
       'chlorides': chloridesJitter,
       'is_red': is_redJitter,
       'alcohol': alcoholJitter,
       'quality': df2.quality
                   })
df3=df3.astype({"is_red": int})

In [28]:
df3

Unnamed: 0,id,density,volatile_acidity,chlorides,is_red,alcohol,quality
0,0,0.8909,0.30,0.047,0,9.1,5.58
1,1,0.6362,0.37,0.041,0,9.2,5.04
2,2,1.1145,0.36,0.050,0,10.1,5.34
3,3,0.8662,0.20,0.057,0,10.3,4.92
4,4,1.1151,0.20,0.059,0,9.6,5.16
...,...,...,...,...,...,...,...
12921,6491,0.9566,0.50,0.066,1,9.2,5.10
12922,6492,0.8855,0.68,0.075,2,10.6,5.60
12923,6494,0.5875,0.52,0.069,2,10.0,6.18
12924,6495,0.8362,0.77,0.068,1,9.4,5.65


In [29]:
#Grab between 50 and 500 random rows from jittered data
df_inf = df3.sample(n = random.randint(50,100)).reset_index(drop=True)

#set up clean customer_ids
setup_ids = list(range(0, df_inf.shape[0]))
ids = list()
for i in setup_ids:
    ids.append(str(datetime.date.today())+'_'+str(setup_ids[i]))

In [30]:
df_inf['wine_id']=ids    
print('Sending {} records to model API endpoint for scoring'.format(df_inf.shape[0]))

#Set up dictionaries and lists for loops
setup_dict = {}
scoring_request = {}
results = list()

inputs = df_inf[['wine_id','density', 'volatile_acidity', 'chlorides', 'is_red', 'alcohol']]


Sending 50 records to model API endpoint for scoring


In [31]:
inputs

Unnamed: 0,wine_id,density,volatile_acidity,chlorides,is_red,alcohol
0,2024-04-03_0,1.489,0.53,0.08,1,9.9
1,2024-04-03_1,0.9784,0.15,0.053,0,8.2
2,2024-04-03_2,1.0382,0.37,0.037,0,8.5
3,2024-04-03_3,1.0664,0.16,0.031,0,9.4
4,2024-04-03_4,1.1088,0.48,0.085,1,9.3
5,2024-04-03_5,0.8809,0.61,0.092,1,12.4
6,2024-04-03_6,0.6467,0.23,0.056,0,10.5
7,2024-04-03_7,0.579,0.28,0.042,0,9.3
8,2024-04-03_8,1.118,0.41,0.029,0,13.8
9,2024-04-03_9,1.3467,0.23,0.026,0,12.6


{'wine_id': '2024-04-03_1',
 'density': 0.8318,
 'volatile_acidity': 0.2,
 'chlorides': 0.037,
 'is_red': 0.0,
 'alcohol': 11.2}

In [36]:
#for n in range(inputs.shape[0]):
for i in list(inputs.columns):
    setup_dict.update({i :list(inputs[1:1+1].to_dict().get(i).values())[0]})
    scoring_request = {'data' : setup_dict}
              
response = requests.post("https://se-demo.domino.tech:443/models/660d4f56399d9148750c3716/latest/model",
auth=(
    "7yQ8pczVs9LQFslKlKpg3DgSpN19G5KlqfMJWyd3OcWFNev8FEc8psmUV4mP6oeD",
    "7yQ8pczVs9LQFslKlKpg3DgSpN19G5KlqfMJWyd3OcWFNev8FEc8psmUV4mP6oeD"
),
    json=scoring_request
)
    #results.append(response.json().get('result').get('prediction'))

print(response.json().get('result')[0])

4.231100159581267


In [12]:
df_ground_truth=df_inf[['wine_id', 'quality']].rename({'wine_id': 'event_id', 'quality' : 'quality_GT'}, axis=1)
print(df_ground_truth.shape[0]==inputs.shape[0])
print((df_ground_truth.event_id==inputs.wine_id).sum()==df_ground_truth.shape[0])

True
True


In [13]:
gt_file_name = str('GT_Data_') + str(datetime.date.today())+str('.csv')
gt_file_path = str('/domino/datasets/local/ground_truth_data/')+gt_file_name
df_ground_truth.to_csv(gt_file_path, index=False)

In [14]:
def s3_upload(local_file, bucket):
    s3 = boto3.client('s3', aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                      aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])
    
    s3_file_name = '{}'.format(os.path.basename(local_file))
    
    try:
        s3.upload_file(local_file, bucket, s3_file_name)
        print(str(s3_file_name) + " Upload Successful")
        return True
    except FileNotFoundError:
        print("The file was not found")
        return False
    except NoCredentialsError:
        print("Credentials not available")
        return False
    
s3_upload(gt_file_path, bucket)

print('Data Uploaded to s3 bucket at s3://{}/{}'.format(bucket, gt_file_name))
print('Done!')

GT_Data_2023-05-22.csv Upload Successful
Data Uploaded to s3 bucket at s3://wine-quality-monitoring/GT_Data_2023-05-22.csv
Done!
