In [5]:
import sagemaker
import boto3
from sagemaker import get_execution_role

In [6]:
region = boto3.Session().region_name
sm_rt = boto3.Session().client('runtime.sagemaker', region_name=region)

ep_name = 'd-scs-model'
id_name = "ID"
target = 'TARGET'

file_name = "./d-scs/test_Santander Customer Satisfaction.csv"
test_file_name = "./d-scs/test_Santander Customer Satisfaction-no-label.csv"

kaggle_file_name = "./d-scs/kaggle-test-Santander Customer Satisfaction.csv"
kaggle_test_file_name = "./d-scs/kaggle-test-Santander Customer Satisfaction-no-label.csv"
kaggle_pred_file_name = "./d-scs/kaggle-test-Santander Customer Satisfaction-predictions.csv"

In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv(file_name)

df = df.drop([target],axis=1) 
print(df.head())
print(df.shape)

df.to_csv(test_file_name, index=False)

       ID  var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0   48416     2     37                 0.0                      0.0   
1   33612     2     23                 0.0                      0.0   
2   75760     2     23                 0.0                      0.0   
3   25656     2     50                 0.0                      0.0   
4  110161     2     23                 0.0                      0.0   

   imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  imp_op_var40_comer_ult3  \
0                      0.0                      0.0                      0.0   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   
3                      0.0                      0.0                      0.0   
4                      0.0                      0.0                      0.0   

   imp_op_var40_efect_ult1  imp_op_var40_efect_ult3  ...  \
0                      0.0      

In [8]:
f = open(test_file_name)
count = 0
pred_label = []
pred_neg_prpb = []
pred_pos_prpb = []

for line in f:
    if count==0:
        count=1
        continue
    response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=line.encode('utf-8'))
    response = response['Body'].read().decode("utf-8")
    #print(response)
    pred_label.append(int(response[0]))
    ind1 = response.index("[")
    ind2 = response.index(",", ind1)
    neg_prob = float(response[ind1+1:ind2])
    #print(neg_prob)
    pred_neg_prpb.append(neg_prob)
    ind1 = response.index("]", ind2)
    pos_prob = float(response[ind2+2:ind1])
    #print(pos_prob)
    pred_pos_prpb.append(pos_prob)    


In [10]:
print(pred_label[:10])
print(pred_neg_prpb[:10])
print(pred_pos_prpb[:10])

import pandas as pd
import numpy as np
pd = pd.read_csv(file_name)
y_true = np.array(pd[target])
y_pred = np.array(pred_label)
print(np.mean(y_true==y_pred))

from sklearn import metrics
#fpr, tpr, thresholds = metrics.roc_curve(y_true, pred_pos_prpb, pos_label=1)
#metrics.auc(fpr, tpr)
metrics.roc_auc_score(y_true, pred_pos_prpb)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
[0.9758619740605354, 0.9873773753643036, 0.9964950431603938, 0.774812862277031, 0.9961056509055197, 0.2833617329597473, 0.5005277693271637, 0.9982742230640724, 0.9971376592293382, 0.9423586539924145]
[0.02413802593946457, 0.012622624635696411, 0.003504956839606166, 0.22518713772296906, 0.003894349094480276, 0.7166382670402527, 0.4994722306728363, 0.0017257769359275699, 0.002862340770661831, 0.057641346007585526]
0.9373848987108656


0.9125375999390304

In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv(kaggle_file_name)
kaggle_test_ids = np.array(df[id_name])
print(df.head())
print(df.shape)

df.to_csv(kaggle_test_file_name, index=False)

f = open(kaggle_test_file_name)
count = 0
kaggle_pred = []

for line in f:
    if count==0:
        count=1
        continue
    response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=line.encode('utf-8'))
    response = response['Body'].read().decode("utf-8")
    test_id = kaggle_test_ids[count-1]
    pred = int(response[0])
    kaggle_pred.append([test_id, pred])
    count = count + 1

kaggle_pred_df = pd.DataFrame(kaggle_pred, columns = [id_name, target])

kaggle_pred_df.to_csv(kaggle_pred_file_name, index=False)

   ID  var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0   2     2     32                 0.0                      0.0   
1   5     2     35                 0.0                      0.0   
2   6     2     23                 0.0                      0.0   
3   7     2     24                 0.0                      0.0   
4   9     2     23                 0.0                      0.0   

   imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  imp_op_var40_comer_ult3  \
0                      0.0                      0.0                      0.0   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   
3                      0.0                      0.0                      0.0   
4                      0.0                      0.0                      0.0   

   imp_op_var40_efect_ult1  imp_op_var40_efect_ult3  ...  \
0                      0.0                      0.0  ...

In [16]:
print(kaggle_pred_df.head())
print(kaggle_pred_df.shape)

   ID  TARGET
0   2       0
1   5       0
2   6       0
3   7       0
4   9       0
(75818, 2)


In [17]:
import boto3
sm = boto3.client('sagemaker')
#sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_endpoint(EndpointName=ep_name)

ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Could not find endpoint "arn:aws:sagemaker:eu-central-1:438013780556:endpoint/d-scs-model".