In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
%cd /content/drive/My Drive/Colab Notebooks

In [None]:
from utils import cost_func, prepare_data

X_train, X_test, y_train, y_test = prepare_data()

In [None]:
# Assume a Naive Model
# Baseline to beat: a DummyClassifier configured to output the constant label 1
# for every row, regardless of the features.
# NOTE(review): label 1 presumably means "fraudulent" — confirm against prepare_data().
from sklearn.dummy import DummyClassifier

model_naive = DummyClassifier(strategy="constant", constant=1).fit(X_train, y_train)

In [None]:
from utils import evaluate_model

In [None]:
evaluate_model(model_naive, X_train, y_train, "Training")

Training recall: 1
Training F-1 score: 0.172046


In [None]:
evaluate_model(model_naive, X_test, y_test, "Testing")

Testing recall: 1
Testing F-1 score: 0.168081


In [None]:
cost_func(model_naive, X_train, y_train) / X_train.shape[0]

-43.309110010009185

In [None]:
cost_func(model_naive, X_test, y_test) / X_test.shape[0]

-43.16282301558416

## My first serious model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Numerical features: standardise with StandardScaler.
pipe_num = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical features: one-hot encode.
# handle_unknown="ignore" makes a category that was not seen during fit encode
# as all zeros instead of raising at transform/predict time (e.g. on the test
# set or inside a cross-validation fold).
pip_cat = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# First real model: scale the numerical columns, one-hot encode the categorical
# ones, then fit a class-weight-balanced logistic regression on the result.
model = Pipeline(
    steps=[
        (
            "selector",
            ColumnTransformer(
                transformers=[
                    ("numerical", pipe_num, ["age", "purchase_value"]),
                    ("categorical", pip_cat, ["sex", "browser", "source"]),
                ]
            ),
        ),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser',
                                                   'source'])])),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.503691
Training F-1 score: 0.167873


In [None]:
evaluate_model(model, X_test, y_test, "Testing")

Testing recall: 0.514605
Testing F-1 score: 0.166201


In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

-39.93085392384749

In [None]:
# Per-row cost of the logistic-regression pipeline on the test split
# (this cell previously re-evaluated model_naive by mistake).
cost_func(model, X_test, y_test) / X_test.shape[0]

-43.16282301558416

## Feature Engineering

In [None]:
# Repeat customer/device
# Time diff
# Country

In [None]:
from custom_estimator import IdentifyRepeats

In [None]:
# Same pipeline as before plus a repeated-device feature derived from
# "device_id" by the project-local IdentifyRepeats transformer.
# The step name is part of the parameter path (selector__repeated_devices...),
# so the earlier "repated_devices" misspelling is corrected here.
model = Pipeline([
    ("selector", ColumnTransformer([
        ("numerical", pipe_num, ["age", "purchase_value"]),
        ("categorical", pip_cat, ["sex", "browser", "source"]),
        ("repeated_devices", IdentifyRepeats(), "device_id"),
    ])),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [None]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser', 'source']),
                                                 ('repated_devices',
                                                  IdentifyRepeats(),
                                                  'device_id')])),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.676745
Training F-1 score: 0.61357


In [None]:
evaluate_model(model, X_test, y_test, "Testing")

Testing recall: 0.562928
Testing F-1 score: 0.693162


In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

-37.31658794431255

In [None]:
cost_func(model, X_test, y_test) / X_test.shape[0]

-36.84445620884757

### Time difference

In [None]:
X_train.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address
32979,145352,2015-02-16 13:01:04,2015-06-12 23:40:56,30,IKWJBCUANABZN,Ads,IE,F,34,2811704000.0
30067,288944,2015-02-02 10:07:32,2015-02-14 06:02:02,18,SEXQLWLZEMBFC,Ads,IE,M,36,3180896000.0
46626,202506,2015-02-06 04:43:54,2015-03-03 20:56:41,43,QCSEKSYMTDQEW,SEO,Chrome,F,35,3730920000.0
9053,321763,2015-08-10 23:26:07,2015-08-27 03:34:18,24,JKLAOBCAXKVTB,Ads,Chrome,F,26,3488115000.0
15635,91968,2015-02-13 07:15:35,2015-05-01 22:20:20,55,YSIZOCQHNEGSE,SEO,Safari,M,22,855083300.0


In [None]:
import pandas as pd

(pd.to_datetime(X_train['purchase_time']) - pd.to_datetime(X_train['signup_time'])).dt.total_seconds()

32979     10060792.0
30067      1022070.0
46626      2218367.0
9053       1397291.0
15635      6707085.0
             ...    
41993      2077411.0
97639      7751332.0
95939      9931149.0
117952     6270166.0
43567      2329635.0
Length: 120889, dtype: float64

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TimeDiff(BaseEstimator, TransformerMixin):
    """Stateless transformer returning the elapsed seconds between two timestamp columns.

    Parameters
    ----------
    fmt : str or None, default=None
        Forwarded to ``pd.to_datetime`` as its ``format`` argument; ``None``
        lets pandas infer the format.
    """

    def __init__(self, fmt=None):
        self.fmt = fmt

    def _convert_to_datetime(self, series):
        """Parse one column into datetimes using the configured format."""
        return pd.to_datetime(series, format=self.fmt)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``second column - first column`` in seconds as an ``(n_rows, 1)`` array.

        X must provide at least two columns; only the first two are used
        (column 0 is the earlier timestamp, column 1 the later one).
        Accepts a DataFrame or anything ``pd.DataFrame`` can wrap (e.g. a 2-D
        numpy array), so the transformer also works when it is not fed a
        DataFrame.

        Raises
        ------
        ValueError
            If X has fewer than two columns.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        if frame.shape[1] < 2:
            raise ValueError(
                "TimeDiff expects two timestamp columns, got %d" % frame.shape[1]
            )

        t_start = self._convert_to_datetime(frame.iloc[:, 0])
        t_end = self._convert_to_datetime(frame.iloc[:, 1])

        # reshape to a single-column 2-D array so the result can be stacked
        # with the other feature blocks.
        return (t_end - t_start).dt.total_seconds().to_numpy().reshape(-1, 1)

In [None]:
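# Quick check: TimeDiff takes the two timestamp columns as its input (not as constructor arguments):
# TimeDiff().fit_transform(X_train[["signup_time", "purchase_time"]])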

In [None]:
def MyColumnTransform(name, transformer, columns, X=None):
    """Fit `transformer` on a subset of columns and return the transformed data.

    Parameters
    ----------
    name : str
        Label for this transform; currently unused, kept for signature
        compatibility.
    transformer : object
        Anything exposing ``fit_transform(data)``.
    columns : column label or list of labels
        Selection applied to ``X`` via ``X[columns]``.
    X : indexable, optional
        Input data. When omitted, a module-level variable named ``X`` is used,
        which is what the original implementation implicitly relied on.

    Returns
    -------
    Whatever ``transformer.fit_transform`` returns (previously the result was
    computed and discarded).

    Raises
    ------
    NameError
        If ``X`` is not passed and no global ``X`` exists.
    """
    if X is None:
        # Backward-compatible fallback to the global the old body referenced.
        try:
            X = globals()["X"]
        except KeyError:
            raise NameError(
                "name 'X' is not defined; pass the input data via the X argument"
            ) from None
    return transformer.fit_transform(X[columns])

In [None]:
# One-hot encode categoricals. handle_unknown="ignore" encodes a category that
# was not seen during fit as all zeros instead of raising at transform time,
# which matters once this selector is refit on cross-validation folds.
pip_cat = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Feature assembly. Numerical columns are passed through untouched here because
# the pipeline-level StandardScaler below scales every assembled column,
# including the engineered ones.
selector = ColumnTransformer([
    ("numerical", "passthrough", ["age", "purchase_value"]),
    ("categorical", pip_cat, ["sex", "browser", "source"]),
    # step name fixed from the misspelled "repated_devices"
    ("repeated_devices", IdentifyRepeats(), "device_id"),
    # seconds elapsed between signup and purchase
    ("time_diff", TimeDiff(), ["signup_time", "purchase_time"]),
])

model = Pipeline([
    ("selector", selector),
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [None]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser', 'source']),
                                                 ('repated_devices',
                                                  IdentifyRepeats(),
                                                  'device_id'),
                                                 ('time_diff', TimeDiff(),
                                                  ['signup_time',
                                                   'purchase_time'])])),
                ('scaler', StandardScaler()),
                ('classifier', LogisticReg

In [None]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.676745
Training F-1 score: 0.61357


In [None]:
# Held-out split: label the report "Testing" (it was mislabelled "Training").
evaluate_model(model, X_test, y_test, "Testing")

Training recall: 0.562928
Training F-1 score: 0.693162


In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

-37.31658794431255

In [None]:
cost_func(model, X_test, y_test) / X_test.shape[0]

-36.84445620884757

### Tune hyperparameters

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Search space: 50 log-spaced values of C (np.logspace default) x 2 class_weight
# settings = 100 candidates.
param_grid = dict(
    C=np.logspace(-2, 2),
    class_weight=["balanced", None],
)
# No `scoring` is given, so candidates are ranked by the classifier's default score.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

# The search is the final pipeline step, so only the classifier is
# cross-validated; the selector and scaler are fitted once on all of the
# training data before the search runs.
model = Pipeline(
    steps=[
        ("selector", selector),
        ("scaler", StandardScaler()),
        ("classifier", grid_search),
    ]
)

In [None]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.2min finished


Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser', 'source']),
                                                 ('repated_devices',
                                                  IdentifyRepeats(),
                                                  'device_id'),
                                                 ('time_diff', TimeDiff(),
                                                  ['signup_time',
                                                   'purchase_time'])])),
                ('scaler', StandardScaler()),
                ('classifi...
       1.930

In [None]:
model[-1].best_params_

{'C': 0.01, 'class_weight': None}

In [None]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.591668
Training F-1 score: 0.657743


In [None]:
evaluate_model(model, X_test, y_test, "Testing")

Testing recall: 0.557519
Testing F-1 score: 0.704488


In [None]:
cost_func(model, X_train, y_train) / X_train.shape[0]

-37.10460008768374

In [None]:
cost_func(model, X_test, y_test) / X_test.shape[0]

-36.82129504020117

In [None]:
GridSearchCV?

### Let's try our custom loss function

In [None]:
def cost_func(model, X, y_true):
    """
    Return the (negated) business cost of a model's mistakes on (X, y_true).

    Cost = 7 * (number of false positives)
         + sum of "purchase_value" over the false negatives

    The result is negated so that "greater is better", which lets the function
    be passed directly as a ``scoring`` callable (signature: estimator, X, y).

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : pandas.DataFrame containing a "purchase_value" column.
    y_true : array-like of 0/1 (or boolean) labels aligned row-by-row with X.
        NOTE(review): assumes 1 / True marks a fraudulent transaction.

    Returns
    -------
    Negative total cost (0 for a perfect model).
    """
    # Work on plain boolean arrays so `~` is a logical NOT and the masks are
    # positional, independent of the DataFrame / Series index.
    y_pred = np.asarray(model.predict(X)).astype(bool)
    y_actual = np.asarray(y_true).astype(bool)

    # FP: predicted fraudulent, actually legitimate.
    false_positives = (y_pred & ~y_actual).sum()

    # FN: predicted NOT fraudulent (y_pred = 0) AND truly fraudulent (y_true = 1).
    # Select rows with the boolean mask itself; using the mask's `.index`
    # would select every row and sum the purchase value of the whole dataset.
    fn_mask = ~y_pred & y_actual
    false_negative_value = X.loc[fn_mask, "purchase_value"].sum()

    return -(7 * false_positives + false_negative_value)

In [None]:
# Here the whole pipeline is the estimator being searched, so preprocessing is
# refit inside every CV fold; hyperparameters are addressed through the
# "classifier__" step prefix.
model = Pipeline(
    steps=[
        ("selector", selector),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression()),
    ]
)

param_grid = dict(
    classifier__C=np.logspace(-2, 2),
    classifier__class_weight=["balanced", None],
)

# Candidates are ranked by the custom business cost (cost_func returns the
# negated cost, so larger is better).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=cost_func,
    cv=5,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  4.8min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selector',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         'passthrough',
                                                                         ['age',
                                                                          'purchase_value']),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                         ['sex',
                                                                          'browser',
                                                                          'source']),
                             

In [None]:
grid_search.best_score_

-894262.8

In [None]:
cost_func(grid_search, X_train, y_train) / X_train.shape[0]

-37.10460008768374

In [None]:
cost_func(grid_search, X_test, y_test) / X_test.shape[0]

-36.82129504020117