In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
%cd /content/drive/My Drive/Colab Notebooks

/content/drive/My Drive/Colab Notebooks


In [42]:
import pandas as pd

In [43]:
df = pd.read_csv('Fraud_Data.csv')

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
X = df.drop('class', axis=1)

In [None]:
X.shape

In [None]:
y = df['class']

In [None]:
# Most of the raw features we had don't seem to correlate so well with fraud vs not fraud

# [Featre Engineering]
# The time difference could be useful
# The country information could also be useful

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
def prepare_data():
    df = pd.read_csv('Fraud_Data.csv')
    X = df.drop('class', axis=1) 
    y = df['class']
    
    return train_test_split(X, y, test_size=0.2, random_state=0)

In [58]:
from utils import prepare_data

X_train, X_test, y_train, y_test = prepare_data()

In [8]:
# Assume a Naive Model

from sklearn.dummy import DummyClassifier

DummyClassifier?

In [None]:
# What is our problem?
# Well, we want to detect fraud in commercial transactions
# So we can either say something is Fraud or not Fraud

In [None]:
# We can say that it's really important that we identify all instances of fraud
# So in general, it's OK if we misclassify a transaction as fraudulent when it's not really
# If we call a fraudulent transaction as +
# Then what do we care about more? Precision or Recall?
# Given our problem statement, we want high recall

In [None]:
DummyClassifier

In [9]:
model_naive = DummyClassifier(strategy="constant", constant=1).fit(X_train, y_train)
y_pred = model_naive.predict(X_train)

In [10]:
from sklearn.metrics import recall_score, f1_score

In [None]:
recall_score(y_train, y_pred)

In [None]:
DummyClassifier?

In [None]:
y_train.mean()

In [None]:
f1_score(y_test, y_pred)

In [11]:
def evaluate_model(model, X, y, mode):
    y_pred = model.predict(X)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    
    print(f"{mode} recall: {recall :g}")
    print(f"{mode} F-1 score: {f1 :g}")

In [None]:
evaluate_model(model_naive, X_train, y_train, "Training")

In [None]:
evaluate_model(model_naive, X_test, y_test, "Testing")

In [None]:
X_train

In [None]:
# our first model will be built using the features:
# purchse_value, age, source, browser, and sex

In [12]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

In [None]:
# We have numerical and categorical data

In [13]:
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


model = Pipeline([
    ("selector", ColumnTransformer([("numerical", "passthrough", ["purchase_value", "age"])])),
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [14]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['purchase_value', 'age'])])),
                ('scaler', StandardScaler()),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
#model[-1].C_

In [15]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.481631
Training F-1 score: 0.159678


In [None]:
evaluate_model(model, X_test, y_test, "Testing")

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_train.mean()

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
model = Pipeline([
    ("selector", ColumnTransformer([("numerical", "passthrough", ["sex", "browser", "source"])])),
    ("encoder", OneHotEncoder()),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [None]:
model.fit(X_train, y_train)

In [None]:
evaluate_model(model, X_train, y_train, "Training")

In [None]:
evaluate_model(model, X_test, y_test, "Testing")

In [None]:
## Using Both numerical and Categorical

In [18]:
pipe_num = Pipeline([
    ("scaler", StandardScaler()),
])

pip_cat = Pipeline([
    ("encoder", OneHotEncoder()),
])

In [19]:
model = Pipeline([
    ("selector", ColumnTransformer([
        ("numerical", pipe_num, ["age", "purchase_value"]),
        ("categorical", pip_cat, ["sex", "browser", "source"]),
    ])),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [45]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser',
                                                   'source'])])),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [46]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.503691
Training F-1 score: 0.167873


In [22]:
evaluate_model(model, X_test, y_test, "Testing")

Testing recall: 0.514605
Testing F-1 score: 0.166201


In [36]:
threshold = 0.25

y_pred = (model.predict_proba(X_test)[:, 1] > threshold).astype(int)

In [37]:
recall_score(y_test, y_pred)

1.0

In [35]:
from sklearn.metrics import precision_score

In [38]:
precision_score(y_test, y_pred)

0.09175131522350528

In [None]:
#We can achieve any level of recall by changing the threshold
#Maybe there's another metric we can use to choose our model

In [None]:
#Two types of mistakes we can make:
#False + : when we predict an observation is FRAUD but it's not
#False - : When we predict an observation is FRAUD and it's really FRAUD (Costly in our case)

In [None]:
#if False + is costly then you want HIGH PRECISION
#if False _ is costly then you want HIGH RECALL

In [None]:
#We can say that if a transaction is Fraud and we fail to identify it 
#THEN we lose out however much money the transaction is 


In [None]:
#if we have a False +, then we run the risk of annoying our customer
#and there is some probability P that they will not go through the transaction

In [40]:
0.2 * X_train['purchase_value'].mean()

7.393589160304081

In [None]:
#let's populate that the cost of a False + is $7

In [None]:
#Our Cost based on the two types we can make is as follows:
#C = 7 * FP + purchase value of each FN

## Feature Engineering

In [None]:
# Repeat customer/device
# Time diff
# Country

In [67]:
X_train.columns

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address'],
      dtype='object')

In [48]:
X_train['device_id'].nunique()

111409

In [68]:
X_train.shape

(120889, 10)

In [55]:
repeated_device = set((X_train['device_id'].value_counts() > 1).to_frame().query("device_id == True").index)

In [57]:
X_train['device_id'].isin(repeated_device).sum()

13721

In [59]:
from sklearn.base import BaseEstimator, TransformerMixin

class IdentifyRepeats(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        #construct repeated devices
        self.repeated_devices = set(
            (X.value_counts() > 1)
            .to_frame()
            .query("device_id == True")
            .index
            
            )

        return self

    def transform(self, X):
        """X is a series, FYI"""
        return X.isin(self.repeated_devices).values.reshape(-1,1)

In [62]:
IdentifyRepeats().fit_transform(X_train['device_id']).sum()

13721

In [63]:
IdentifyRepeats().fit(X_train['device_id']).transform(X_test['device_id'])

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [70]:
model = Pipeline([
                  ("selector", ColumnTransformer([
                      ("numerical", pipe_num, ["age", "purchase_value"]),
                      ("categorical", pip_cat, ["sex", "browser", "source"]),
                      ("repeated_devices", IdentifyRepeats(), "device_id")
                  ])),
                  ("classifier", LogisticRegression(class_weight="balanced"))


])

In [71]:
model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser', 'source']),
                                                 ('repeated_devices',
                                                  IdentifyRepeats(),
                                                  'device_id')])),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [72]:
evaluate_model(model, X_train, y_train, "Training")

Training recall: 0.676745
Training F-1 score: 0.61357


In [73]:
evaluate_model(model, X_test, y_test, "Testing")

Testing recall: 0.562928
Testing F-1 score: 0.693162


In [74]:
y_pred = model.predict(X_train)

In [75]:
y_pred & ~y_train

32979     0
30067     0
46626     0
9053      0
15635     0
         ..
41993     0
97639     0
95939     0
117952    1
43567     0
Name: class, Length: 120889, dtype: int64

In [76]:
from sklearn.metrics import confusion_matrix

In [77]:
confusion_matrix(y_train, y_pred)

array([[103490,   6021],
       [  3678,   7700]])

In [78]:
X_train.loc[(~y_pred & y_train).index, "purchase_value"]

32979     30
30067     18
46626     43
9053      24
15635     55
          ..
41993     70
97639     37
95939      9
117952    11
43567     24
Name: purchase_value, Length: 120889, dtype: int64

In [84]:
def cost_func(model, X, y_true):
    """
    Return cost of model based upon FP and FN
    Cost = 7 * FP + purchase value of each FN
    """
    y_pred = model.predict(X)
    FP = (y_pred & ~y_true).sum()

    # FN: we say it's NOT fraudulent (y=0) AND it's truely Fraudulent (y_true = 1)
    FN = X.loc[(~y_pred & y_true).index, "purchase_value"].sum()

    return 7 * FP + FN



In [87]:
cost_func(model, X_train, y_train) / X_train.shape[0]

37.31658794431255

In [89]:
cost_func(model_naive, X_train, y_train) / X_train.shape[0]

43.309110010009185

In [92]:
cost_func(model, X_test, y_test) / X_test.shape[0]

36.84445620884757

In [93]:
cost_func(model_naive, X_test, y_test) / X_test.shape[0]

43.16282301558416