In [1]:
import json
import numpy as np
import pandas as pd

import os
from sklearn.externals import joblib


def init(model_path="model.pkl"):
    """Load a persisted estimator and publish it as the module-level ``model``.

    ``predict`` and ``predict_proba`` read that global, so this function must
    run before either of them can return real predictions.

    Args:
        model_path: Path of the serialized model file handed to ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code -- only point this at files from a trusted source.
    """
    global model
    loaded_model = joblib.load(model_path)
    model = loaded_model

def predict(data):
    """Run ``model.predict`` on ``data`` and return its result.

    ``data`` is forwarded to the model unchanged (no JSON decoding happens
    here).

    Args:
        data: Feature rows in whatever form the loaded model accepts.

    Returns:
        The value returned by ``model.predict(data)``. If anything raises
        (including ``model`` not having been loaded by ``init`` yet), the
        exception is swallowed and its message is returned as a ``str``
        instead, so callers must check for a string before using the result.
    """
    try:
        outcome = model.predict(data)
    except Exception as exc:
        # Failures are reported in-band: the caller gets the message text.
        outcome = str(exc)
    return outcome


def predict_proba(data):
    """Run ``model.predict_proba`` on ``data`` and return its result.

    ``data`` is forwarded to the model unchanged (no JSON decoding happens
    here).

    Args:
        data: Feature rows in whatever form the loaded model accepts.

    Returns:
        The value returned by ``model.predict_proba(data)``. If anything
        raises (including ``model`` not having been loaded by ``init`` yet),
        the exception is swallowed and its message is returned as a ``str``
        instead, so callers must check for a string before indexing the result.
    """
    try:
        outcome = model.predict_proba(data)
    except Exception as exc:
        # Failures are reported in-band: the caller gets the message text.
        outcome = str(exc)
    return outcome



In [2]:
# Configuration: column names and the artefacts evaluated in this notebook.
id_name = 'activity_id'
target = "outcome"
model_path = "d_pbv_model.pkl"
test_file = "test_d_pbv.csv"

# Load the persisted estimator into the global used by predict()/predict_proba().
init(model_path)

test_df = pd.read_csv(test_file)
y_true = test_df[target]
# Build the feature frame once; the label column must not be fed to the model.
test_features = test_df.drop(columns=[target])

y_pred = predict(test_features)
# predict() reports failures by returning the error text instead of raising.
# Comparing that string with y_true would print a meaningless accuracy rather
# than an error, so surface the real failure here.
if isinstance(y_pred, str):
    raise RuntimeError("predict() failed: " + y_pred)

print("accuracy:")
print(np.mean(y_pred==y_true))


from sklearn import metrics

y_pred_prob = predict_proba(test_features)
# Same in-band error convention as predict(); indexing a string with [:, 1]
# would otherwise fail with an unrelated TypeError.
if isinstance(y_pred_prob, str):
    raise RuntimeError("predict_proba() failed: " + y_pred_prob)

# Column 1 is used as the score of the positive class (label 1).
# NOTE(review): assumes the model's classes are ordered [0, 1] -- confirm.
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred_prob[:,1], pos_label=1)
print("auc:")
print(metrics.auc(fpr, tpr))

accuracy:
0.6202794338506349
auc:
0.6897163430903834


In [3]:
## Predict the Kaggle test dataset

# Column order of the submission file: identifier first, predicted label second.
kaggle_submit_head = [id_name, target]

kaggle_test_file = "kaggle-test-d_pbv.csv"
kaggle_pred_file = "kaggle-test-d_pbv-predictions.csv"

kaggle_test_df = pd.read_csv(kaggle_test_file)
# Keep the ids as a plain array so they can be paired with the predictions later.
kaggle_test_id = np.array(kaggle_test_df[id_name])
print(kaggle_test_df.head())
print(kaggle_test_df.shape)

kaggle_y_pred = predict(kaggle_test_df)
# predict() returns the error text instead of raising; stop here with the real
# message rather than letting a later cell trip over a string.
if isinstance(kaggle_y_pred, str):
    raise RuntimeError(
        "predict() failed on " + kaggle_test_file + ": " + kaggle_y_pred
    )

    people_id   activity_id        date activity_category   char_1   char_2  \
0  ppl_100004   act1_249281  2022-07-20            type 1   type 5  type 10   
1  ppl_100004   act2_230855  2022-07-20            type 5      NaN      NaN   
2   ppl_10001   act1_240724  2022-10-14            type 1  type 12   type 1   
3   ppl_10001    act1_83552  2022-11-27            type 1  type 20  type 10   
4   ppl_10001  act2_1043301  2022-10-15            type 5      NaN      NaN   

   char_3  char_4  char_5  char_6  char_7   char_8   char_9    char_10  
0  type 5  type 1  type 6  type 1  type 1   type 7   type 4        NaN  
1     NaN     NaN     NaN     NaN     NaN      NaN      NaN   type 682  
2  type 5  type 4  type 6  type 1  type 1  type 13  type 10        NaN  
3  type 5  type 4  type 6  type 1  type 1   type 5   type 5        NaN  
4     NaN     NaN     NaN     NaN     NaN      NaN      NaN  type 3015  
(498687, 14)


In [4]:
print(kaggle_test_id.shape)

# np.shape() also works on non-array values. If predict() returned an error
# string its shape is (), so the assertion fails with the message below
# instead of an AttributeError on `.shape`.
kaggle_pred_shape = np.shape(kaggle_y_pred)
assert kaggle_pred_shape == kaggle_test_id.shape, (
    "prediction shape {} does not match id shape {}; predict() returned: {!r}".format(
        kaggle_pred_shape, kaggle_test_id.shape, kaggle_y_pred
    )
)

# One row per test id: identifier column first, predicted label second.
id_column, outcome_column = kaggle_submit_head
kaggle_pred_df = pd.DataFrame({id_column: kaggle_test_id, outcome_column: kaggle_y_pred})

print(kaggle_pred_df.head())

# index=False keeps the CSV to exactly the two submission columns.
kaggle_pred_df.to_csv(kaggle_pred_file, index=False)

(498687,)
    activity_id  outcome
0   act1_249281        0
1   act2_230855        0
2   act1_240724        1
3    act1_83552        0
4  act2_1043301        1
