In [29]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.metrics import mean_squared_error, max_error, r2_score
from sklearn.metrics import log_loss, roc_auc_score
from scipy.stats import ks_2samp
import pickle

from pydantic import BaseModel

In [None]:
import models

In [10]:
# Load the train and test splits from CSV files in the working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [92]:
# Load test5.csv; the following cells recode its `target` column and write the
# frame back to the same file.
df = pd.read_csv("test5.csv")

In [95]:
# Encode the label as 1 for "yes" and 0 for anything else. A value that is
# already 1 stays 1, so re-running this cell on a file that was previously
# saved with the encoded column no longer collapses every label to 0.
df['target'] = df['target'].isin(["yes", 1]).astype(int)

In [96]:
# NOTE(review): this writes to the same path the frame was read from, so the
# original file is replaced by the version with the recoded `target` column.
# Confirm that overwriting the input is intended.
df.to_csv('test5.csv', index=False)

In [30]:
class InputSchema(BaseModel):
    """One input record for the model, declared as a pydantic model.

    The six ``int`` fields are the ones listed in ``Preprocessor.num_cols`` and
    the nine ``str`` fields are the ones listed in ``Preprocessor.cat_cols``.
    ``Preprocessor`` builds its DataFrame columns from the key order of
    ``.dict()``, so reordering the fields here changes the column order of the
    frame it produces.
    """
    age: int
    job: str 
    marital: str
    education: str
    default: str
    balance: int
    housing: str
    loan: str
    contact: str
    day: int
    month: str
    campaign: int
    pdays: int
    previous: int
    poutcome: str

In [62]:
class Preprocessor:
    """Turn input records into the feature frame passed to the model.

    Numeric columns are passed through a pickled scaler and categorical columns
    through a pickled encoder (presumably fitted scikit-learn transformers;
    only ``transform`` and ``get_feature_names_out`` are required of them).
    """

    def __init__(self, scaler_path="model/scaler4.pkl", encoder_path="model/encoder4.pkl"):
        """Load the fitted scaler and encoder from disk.

        Args:
            scaler_path: Pickle file holding the object used to scale
                ``num_cols``. Defaults to the path that used to be hard-coded.
            encoder_path: Pickle file holding the object used to encode
                ``cat_cols``. Defaults to the path that used to be hard-coded.
        """
        self.num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
        self.cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

        # SECURITY: pickle.load can execute arbitrary code stored in the file.
        # Only point these paths at artifacts produced by a trusted pipeline.
        self.scaler = self._load_pickle(scaler_path)
        self.encoder = self._load_pickle(encoder_path)

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        with open(path, "rb") as file:
            return pickle.load(file)

    @staticmethod
    def _to_record(item):
        """Return the fields of ``item`` as a plain dict.

        Uses ``model_dump()`` when the object provides it (pydantic v2) and
        falls back to ``dict()`` (pydantic v1) otherwise.
        """
        dump = getattr(item, "model_dump", None)
        return dump() if callable(dump) else item.dict()

    def _transform(self, df):
        """Scale the numeric columns and encode the categorical ones.

        Returns a new DataFrame whose columns are the encoder's output columns
        followed by every non-categorical column of ``df``; ``df`` itself is
        not modified.
        """
        df_ready = df.copy()
        df_ready[self.num_cols] = self.scaler.transform(df[self.num_cols])

        df_encoded = pd.DataFrame(self.encoder.transform(df_ready[self.cat_cols]))
        df_encoded.columns = self.encoder.get_feature_names_out(self.cat_cols)

        # Replace the categorical columns with their encoded counterparts.
        df_ready = df_ready.drop(self.cat_cols, axis=1)
        return pd.concat([df_encoded, df_ready], axis=1)

    def preprocess(self, raw_data):
        """Preprocess a single record.

        Args:
            raw_data: Object exposing ``dict()`` or ``model_dump()`` (for
                example an ``InputSchema`` instance).

        Returns:
            A one-row DataFrame with named feature columns.
        """
        # A single record is simply a batch of one; sharing the code path keeps
        # the two entry points from drifting apart.
        return self.batch_preprocess([raw_data])

    def batch_preprocess(self, items):
        """Preprocess several records at once.

        Args:
            items: Iterable of objects exposing ``dict()`` or ``model_dump()``.

        Returns:
            A DataFrame with one row per item and named feature columns.

        Raises:
            IndexError: If ``items`` is empty.
        """
        records = [self._to_record(item) for item in items]
        if not records:
            raise IndexError("batch_preprocess() requires at least one item")

        # Column order follows the key order of the first record.
        df = pd.DataFrame(records, columns=list(records[0].keys()))
        return self._transform(df)

In [61]:
class Model:
    """Thin wrapper around a pickled estimator exposing yes/no predictions.

    The wrapped object must provide ``predict`` and ``predict_proba``.
    """

    def __init__(self, model_path="model/model4.pkl"):
        """Load the estimator from disk.

        Args:
            model_path: Pickle file holding the estimator. Defaults to the path
                that used to be hard-coded.
        """
        # SECURITY: pickle.load can execute arbitrary code stored in the file.
        # Only load artifacts produced by a trusted pipeline.
        with open(model_path, "rb") as file:
            self.model = pickle.load(file)

    def model_predict(self, data):
        """Return the 'yes'/'no' label for the first row of ``data``."""
        return self.model_batch_predict(data)[0]

    def model_batch_predict(self, batch_data):
        """Return a list of 'yes'/'no' labels, one per row of ``batch_data``.

        A raw prediction equal to 1 becomes 'yes'; every other value becomes
        'no'.
        """
        batch_pred = self.model.predict(batch_data)
        return ['yes' if label == 1 else 'no' for label in batch_pred]

    def model_predict_proba(self, data):
        """Return the estimator's probability output for the first row of ``data``."""
        return self.model_batch_predict_proba(data)[0]

    def model_batch_predict_proba(self, batch_data):
        """Return the estimator's ``predict_proba`` output for ``batch_data`` unchanged.

        NOTE(review): the order of the probability columns is decided by the
        wrapped estimator, not by this class - confirm which column is the
        'yes' class before indexing into the result.
        """
        return self.model.predict_proba(batch_data)

In [2]:
def gini(actual, pred):
    """Return the (unnormalised) Gini coefficient of ``pred`` against ``actual``.

    Rows are ranked by descending ``pred``; ties keep their original order.
    The running total of ``actual`` along that ranking is summed, divided by
    the total of ``actual``, shifted by ``(n + 1) / 2`` and divided by ``n``.

    Args:
        actual: Sequence of observed values.
        pred: Sequence of scores, same length as ``actual``.

    Returns:
        The Gini value as a float.
    """
    n_obs = len(actual)
    assert (n_obs == len(pred))

    # Columns: observed value, score, original position (tie-breaker).
    table = np.asarray(np.c_[actual, pred, np.arange(n_obs)], dtype=np.float64)

    # lexsort treats the LAST key as primary: negated score first, then position.
    ranking = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[ranking][:, 0]

    total = ranked_actual.sum()
    cumulative_share = ranked_actual.cumsum().sum() / total

    return (cumulative_share - (n_obs + 1) / 2.) / n_obs


def gini_normalized(actual, pred):
    """Return ``gini(actual, pred)`` divided by ``gini(actual, actual)``.

    The denominator is the value obtained when the observed values are used as
    their own scores.
    """
    achieved = gini(actual, pred)
    reference = gini(actual, actual)
    return achieved / reference

In [15]:
# Module-level copies of the numeric / categorical column lists; the same lists
# are also set on every Preprocessor instance in Preprocessor.__init__.
num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [9]:
# Unpickle a scaler and an encoder into module-level names.
# NOTE(review): these are the "2" artifacts, while Preprocessor loads
# scaler4.pkl / encoder4.pkl - confirm which version is meant to be used.
# pickle.load can execute arbitrary code, so only load trusted files.
with open("model/scaler2.pkl", "rb") as file:
    scaler = pickle.load(file)

with open("model/encoder2.pkl", "rb") as file:
    encoder = pickle.load(file)

In [8]:
# Unpickle model2.pkl into `model`.
# NOTE(review): `model` is rebound further down by `model = Model()`, which
# loads model4.pkl, so the object loaded here is discarded at that point.
with open("model/model2.pkl", "rb") as file:
    model = pickle.load(file)

In [13]:
# Count how many training rows carry each `deposit` label.
df_train['deposit'].value_counts()

no     5299
yes    4746
Name: deposit, dtype: int64

In [63]:
# Build the preprocessor; its constructor unpickles the scaler and encoder.
processor = Preprocessor()

In [64]:
# Build the model wrapper. This rebinds `model`, replacing the object that an
# earlier cell unpickled from model/model2.pkl.
model = Model()

In [74]:
# Build one InputSchema per test row from every column except the `deposit`
# label. 'records' is the full name of the list-of-dicts layout; abbreviated
# orient names such as 'row' are deprecated and rejected by pandas >= 2.0.
raw_data = [InputSchema(**row) for row in df_test.drop(columns='deposit').to_dict(orient='records')]

  raw_data = [InputSchema(**row) for row in df_test.drop(columns='deposit').to_dict(orient='row')]


In [75]:
# Show the first three parsed records.
raw_data[:3]

[InputSchema(age=31, job='admin.', marital='single', education='secondary', default='no', balance=554, housing='yes', loan='no', contact='cellular', day=5, month='feb', campaign=1, pdays=-1, previous=0, poutcome='unknown'),
 InputSchema(age=43, job='blue-collar', marital='married', education='secondary', default='no', balance=514, housing='yes', loan='yes', contact='cellular', day=20, month='apr', campaign=2, pdays=-1, previous=0, poutcome='unknown'),
 InputSchema(age=26, job='blue-collar', marital='single', education='secondary', default='no', balance=277, housing='no', loan='no', contact='unknown', day=14, month='may', campaign=2, pdays=-1, previous=0, poutcome='unknown')]

In [76]:
# Scale and encode all test records in one batch.
data = processor.batch_preprocess(raw_data)

In [77]:
# 'yes'/'no' label for every preprocessed test row.
pred = model.model_batch_predict(data)

In [78]:
# Probability output for the same rows; indexed below as pred_proba[:, 1].
pred_proba = model.model_batch_predict_proba(data)

In [79]:
# NOTE(review): this displays the whole prediction list; a slice such as
# pred[:10] would keep the cell output short.
pred

['yes',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no

In [80]:
# Encode the `deposit` label as 1 for "yes" and 0 otherwise.
# NOTE(review): the labels are taken from df_train, whereas the predictions
# above were made on df_test - confirm which split was intended.
y_true = (df_train['deposit'] == "yes").astype(int).tolist()

In [81]:
# NOTE(review): this displays the whole label list; a slice such as
# y_true[:10] would keep the cell output short.
y_true

[0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,


In [85]:
# Binary ground truth for the test split: 1 for "yes", 0 otherwise, matching
# the 1 -> 'yes' mapping used by Model. The encoding is spelled out here rather
# than delegated to `convert_cat_to_int`, which is only defined in a later cell
# and therefore raises NameError when the notebook is run top to bottom.
temp = [1 if target == "yes" else 0 for target in df_test['deposit'].values]

In [86]:
# Column 1 of the probability output; this is the score passed to the metrics below.
pred_proba[:, 1]

array([0.64, 0.37, 0.56, ..., 0.37, 0.63, 0.64])

In [90]:
# ROC AUC of the column-1 probabilities against the encoded test labels.
roc_auc_score(temp, pred_proba[:, 1])

0.6494696517604481

In [91]:
# Normalised Gini of the same scores against the same labels.
gini_normalized(temp, pred_proba[:, 1])

0.2984772941652069

In [56]:
def convert_cat_to_int(lst):
    """Replace each value in ``lst`` with an integer code.

    Codes are assigned to the distinct values in sorted order, so the result is
    the same on every run (for example "no" -> 0 and "yes" -> 1). Enumerating a
    bare ``set`` would make the codes depend on hash ordering, which for
    strings can change between interpreter sessions.

    Args:
        lst: Sequence of hashable values.

    Returns:
        A list of ints, one per element of ``lst``, in the same order.
    """
    try:
        ordered = sorted(set(lst))
    except TypeError:
        # Values that cannot be compared with each other (e.g. str and int
        # mixed): fall back to order of first appearance, which is still
        # deterministic for a given input.
        ordered = list(dict.fromkeys(lst))

    codes = {value: code for code, value in enumerate(ordered)}
    return [codes[value] for value in lst]
