In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
import pandas as pd

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive/ are reachable.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
%cd /content/drive/My Drive/Colab Notebooks

In [None]:
from utils import prepare_data, cost_func, evaluate_model

In [None]:
from custom_estimator import IdentifyRepeats, TimeDiff

In [None]:
# Load the train/test split through the project helper imported from utils.
# NOTE(review): prepare_data's internals are not visible here; the four return values
# are assumed to be (train features, test features, train labels, test labels) — confirm.
X_train, X_test, y_train, y_test = prepare_data()

In [None]:
# Categorical columns go through a one-step pipeline that one-hot encodes them.
pip_cat = Pipeline(steps=[("encoder", OneHotEncoder())])

# Route each group of raw columns to its own transformer:
#   - age / purchase_value are passed through untouched,
#   - sex / browser / source are one-hot encoded,
#   - device_id (given as a bare string, not a list) feeds IdentifyRepeats,
#   - signup_time / purchase_time feed TimeDiff.
selector = ColumnTransformer(
    transformers=[
        ("numerical", "passthrough", ["age", "purchase_value"]),
        ("categorical", pip_cat, ["sex", "browser", "source"]),
        ("repated_devices", IdentifyRepeats(), "device_id"),
        ("time_diff", TimeDiff(), ["signup_time", "purchase_time"]),
    ]
)

# Full model: feature preparation followed by a depth-limited decision tree.
model = Pipeline(
    steps=[
        ("selector", selector),
        ("classifier", DecisionTreeClassifier(max_depth=5)),
    ]
)

In [None]:
# Search space: tree depths 1..19, each tried with and without balanced class weights.
param_grid = {
    "classifier__max_depth": range(1, 20),
    "classifier__class_weight": [None, "balanced"],
}

# 5-fold cross-validated search; candidates are ranked by the project's cost_func scorer.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=cost_func,
    cv=5,
    verbose=1,
)

In [None]:
# Pick up edits made to custom_estimator.py without restarting the kernel.
import importlib

import custom_estimator

importlib.reload(custom_estimator)

# reload() re-executes the module but does NOT rebind names that were pulled in with
# `from custom_estimator import ...`, so re-import them here; otherwise later cells
# would keep instantiating the stale class objects.
from custom_estimator import IdentifyRepeats, TimeDiff

# NOTE: pipelines that were constructed before this cell still hold instances of the
# old classes; re-run the cells that build them to use the reloaded code.

In [None]:
# Run the cross-validated grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Evaluate the fitted search on the training split, tagged "Training".
# NOTE(review): evaluate_model comes from utils; what it prints/returns is not shown here.
evaluate_model(grid_search, X_train, y_train, "Training")

In [None]:
# cost_func score of the fitted search on the training split, averaged over its row count.
# NOTE(review): cost_func is the scorer passed to GridSearchCV; its sign convention
# (cost vs. negated cost) is defined in utils — confirm before interpreting the number.
cost_func(grid_search, X_train, y_train)/ X_train.shape[0]

In [None]:
# Evaluate the fitted search on the held-out test split, tagged "Testing".
# NOTE(review): evaluate_model comes from utils; what it prints/returns is not shown here.
evaluate_model(grid_search, X_test, y_test, "Testing")

In [None]:
# cost_func score of the fitted search on the test split, averaged over its row count.
# NOTE(review): cost_func's sign convention is defined in utils — confirm before interpreting.
cost_func(grid_search, X_test, y_test)/ X_test.shape[0]

In [None]:
# Hyper-parameter combination that scored best during the grid search.
grid_search.best_params_

## The Effect of the Threshold

In [None]:
import numpy as np

# 100 evenly spaced candidate decision thresholds covering [0, 1], both endpoints included.
thresholds = np.linspace(start=0, stop=1, num=100)

In [None]:
def predict_labels(model, X, threshold=0.5):
    """
    Turn positive-class probabilities into hard 0/1 labels.

    A row is labelled 1 when the second column of ``model.predict_proba(X)``
    is strictly greater than ``threshold``; otherwise it is labelled 0.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    is_positive = positive_proba > threshold
    return is_positive.astype(int)

In [None]:
# Hard 0/1 labels for the training split at the default 0.5 threshold.
predict_labels(grid_search, X_train)

In [None]:
def cost_func_w_threshold(model, X, y_true, threshold=0.5):
    """
    Score a model by the business cost of its mistakes at a given threshold.

    Cost = 7 * (number of false positives)
           + (sum of ``purchase_value`` over the false negatives)

    The value returned is the NEGATED cost, so a larger value is better.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X : pandas.DataFrame containing a "purchase_value" column.
    y_true : array-like of 0/1 labels, aligned row-by-row (positionally) with ``X``.
    threshold : float, default 0.5
        A row is predicted fraudulent when its positive-class probability is
        strictly greater than this value.

    Returns
    -------
    ``-(7 * FP + FN_value)`` as a number.
    """
    y_pred = np.asarray(model.predict_proba(X))[:, 1] > threshold
    # Work with positional boolean arrays: this sidesteps pandas index alignment and
    # the bitwise meaning of `~` on integer labels.
    actual = np.asarray(y_true).astype(bool)

    # FP: predicted fraudulent, truly legitimate.
    false_pos = y_pred & ~actual
    # FN: predicted legitimate, truly fraudulent.
    false_neg = ~y_pred & actual

    FP = false_pos.sum()
    # Select ONLY the false-negative rows with the boolean mask. Taking `.index` of the
    # mask would return every row label and sum the purchase value of the whole dataset.
    FN = X.loc[false_neg, "purchase_value"].sum()

    return -(7 * FP + FN)

In [None]:
# Thresholded score of the tuned search on the training split at the default
# threshold, averaged over the number of training rows.
cost_func_w_threshold(grid_search, X_train, y_train) / X_train.shape[0]

In [None]:
# I should probably change how the best model is selected:
# for any given set of hyper-parameters there is an optimal threshold,
# where "optimal" means the threshold value that yields the lowest cost.

In [None]:
# Override the tree depth on the standalone pipeline (not the grid search) to 2
# for the threshold experiment in the following cells.
model.set_params(classifier__max_depth=2)

In [None]:
# Fit the standalone pipeline on the training split.
model.fit(X_train, y_train)

In [None]:
# Second column of predict_proba for every training row — the same column that
# predict_labels compares against the threshold.
model.predict_proba(X_train)[:,1]

In [None]:
# Thresholded score of the depth-2 pipeline at threshold 0.4, averaged over training rows.
cost_func_w_threshold(model, X_train, y_train, threshold=0.4) / X_train.shape[0]

In [None]:
# Sweep every candidate threshold and record the fitted pipeline's thresholded score,
# averaged over the number of training rows.
costs = np.array(
    [cost_func_w_threshold(model, X_train, y_train, threshold=t) for t in thresholds]
) / X_train.shape[0]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Score as a function of the decision threshold, drawn as a red dashed line.
plt.plot(thresholds, costs, color='r', linestyle='--')

## Trying Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Categorical columns go through a one-step pipeline that one-hot encodes them.
pip_cat = Pipeline([
    ("encoder", OneHotEncoder()),
])

# Same column routing as the decision-tree pipeline: numeric columns pass through,
# categoricals are one-hot encoded, device_id feeds IdentifyRepeats and the two
# timestamp columns feed TimeDiff.
selector = ColumnTransformer([
    ("numerical", "passthrough", ["age", "purchase_value"]),
    ("categorical", pip_cat, ["sex", "browser", "source"]),
    ("repated_devices", IdentifyRepeats(), "device_id"),
    ("time_diff", TimeDiff(), ["signup_time", "purchase_time"])
])

# Random forests are stochastic (bootstrap samples, random feature subsets); pin the
# seed so that Restart & Run All reproduces the same search results.
model = Pipeline([
    ("selector", selector),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [None]:
# Search space: forest depths 1..19, each tried with and without balanced class weights.
param_grid = {
    "classifier__max_depth": range(1, 20),
    "classifier__class_weight": [None, "balanced"],
}

# 5-fold cross-validated search scored by cost_func; verbose=3 gives detailed progress output.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=cost_func,
    cv=5,
    verbose=3,
)

In [None]:
# Run the cross-validated grid search for the random-forest pipeline on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Baseline: the business-as-usual approach, in which no transaction is flagged.
# Its cost is the total purchase value of all fraudulent transactions.

In [None]:
# Business-as-usual baseline on the training split: total purchase value of the
# fraudulent rows (label == 1), averaged over all training rows.
# Divide by the row count (shape[0]); X_train[0] would look up a column labelled 0.
X_train.assign(label=y_train).query("label == 1")["purchase_value"].sum() / X_train.shape[0]

In [None]:
# Business-as-usual baseline on the test split: total purchase value of the
# fraudulent rows (label == 1), averaged over all test rows.
# Divide by the row count (shape[0]); X_test[0] would look up a column labelled 0.
X_test.assign(label=y_test).query("label == 1")["purchase_value"].sum() / X_test.shape[0]