In [None]:
import pandas as pd

In [None]:
# Load the fraud dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Fraud_Data.csv')

In [None]:
# Display the (rows, columns) of the loaded frame as a quick sanity check.
df.shape

(151112, 11)

In [None]:
# Feature matrix: every column of df except the 'class' label column.
X = df.drop(columns='class')

In [None]:
# Display the feature matrix shape (one column fewer than df after dropping 'class').
X.shape

(151112, 10)

In [None]:
# Target vector: the 'class' column that was dropped from X.
y = df['class']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer

In [None]:
# Most of the raw features we had don't seem to correlate so well with fraud vs not fraud

# [Feature Engineering]
# The time difference could be useful
# The country information could also be useful

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test split. A fixed seed makes the split (and every score derived
# from it) reproducible across kernel restarts; stratifying on y keeps the
# positive-class rate (about 9% in this data) the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Assume a Naive Model

# DummyClassifier provides the naive baseline that is fitted further down.
# (The interactive `DummyClassifier?` help lookup was removed: it is IPython-only
# syntax and not part of the analysis.)
from sklearn.dummy import DummyClassifier

In [None]:
# What is our problem?
# Well, we want to detect fraud in commercial transactions
# So we can either say something is Fraud or not Fraud

In [None]:
# It's really important that we identify all instances of fraud,
# so in general it's OK if we misclassify a legitimate transaction as fraudulent.
# If we treat a fraudulent transaction as the positive (+) class,
# then what do we care about more: precision or recall?
# Given our problem statement, we want high recall.

In [None]:
# NOTE(review): bare class reference, apparently left over from exploration; it only
# echoes the class object and can be removed.
DummyClassifier

In [None]:
# Naive baseline: always predict class 1, regardless of the features.
model_naive = DummyClassifier(strategy="constant", constant=1).fit(X_train, y_train)
# Test-set predictions of the baseline. The recall_score / f1_score cells that
# follow read `y_pred`, so it must be assigned here for a top-to-bottom run to work.
y_pred = model_naive.predict(X_test)

In [None]:
from sklearn.metrics import recall_score, f1_score

In [None]:
# Recall on the test split; expects `y_pred` to hold test-set predictions
# (the recorded 1.0 is what an always-positive predictor yields).
recall_score(y_test, y_pred)

1.0

In [None]:
# (Interactive `DummyClassifier?` help lookup removed: it is IPython-only syntax and
# not part of the analysis. See the scikit-learn docs for the `strategy` options.)

In [None]:
# Mean of the training labels; presumably labels are 0/1, in which case this is
# the share of positive (fraud) rows — TODO confirm label encoding.
y_train.mean()

0.09310533467450191

In [None]:
# F1 on the test split for the same `y_pred` predictions scored for recall above.
f1_score(y_test, y_pred)

0.173961379510356

In [None]:
def evaluate_model(model, X, y, mode):
    """Print the recall and F1 score of ``model``'s predictions on ``X``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature data passed to ``model.predict``.
        y: True labels that the predictions are scored against.
        mode: Label prefixed to each printed line (e.g. "Training").

    Returns:
        None. The two scores are only printed, each formatted with ``:g``.
    """
    predicted = model.predict(X)
    # Compute both metrics before printing anything.
    recall_value, f1_value = (
        metric(y, predicted) for metric in (recall_score, f1_score)
    )

    print(f"{mode} recall: {recall_value :g}")
    print(f"{mode} F-1 score: {f1_value :g}")

In [None]:
# Baseline scores on the training split.
evaluate_model(model=model_naive, X=X_train, y=y_train, mode="Training")

Training recall: 1
Training F-1 score: 0.17035


In [None]:
# Baseline scores on the held-out test split.
evaluate_model(model=model_naive, X=X_test, y=y_test, mode="Testing")

Testing recall: 1
Testing F-1 score: 0.173961


In [None]:
# Preview the first rows of the training features instead of echoing the whole frame.
X_train.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address
14017,396324,2015-05-14 20:04:14,2015-09-11 07:33:18,36,IFEEJSFSPORAV,SEO,Chrome,M,27,1.976885e+09
55601,145984,2015-04-28 17:24:46,2015-06-29 03:02:25,45,UVBMLYAOPJVAC,Ads,Safari,F,26,2.796086e+09
121348,203232,2015-03-06 08:33:15,2015-05-31 07:45:21,52,XASNZETNPFZTQ,Ads,IE,M,41,1.310155e+09
69508,205260,2015-05-28 23:50:10,2015-06-14 11:51:21,54,HVSZDGPDBKCII,SEO,Chrome,M,31,5.263339e+08
82142,66995,2015-04-09 03:48:52,2015-04-12 20:31:01,41,EWAXBHQORMYLB,SEO,Safari,M,39,3.756027e+09
...,...,...,...,...,...,...,...,...,...,...
31215,154824,2015-05-28 00:09:10,2015-09-03 23:06:20,51,PENHVDYZCLYMR,Ads,FireFox,M,37,2.643337e+09
147009,233710,2015-06-27 21:00:07,2015-09-06 21:05:13,33,QTXKRYHRDEODM,Direct,IE,M,33,3.091727e+09
115801,193981,2015-05-18 23:11:34,2015-08-23 19:04:21,63,USLGMTIPBXGTI,Ads,IE,F,32,2.559315e+09
68314,60385,2015-05-30 06:27:42,2015-07-20 11:08:04,38,UUTJJVXNUKVDB,Ads,Chrome,M,45,3.260783e+09


In [None]:
# our first model will be built using the features:
# purchase_value, age, source, browser, and sex

In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

In [None]:
# We have numerical and categorical data

In [None]:
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Numeric-only model: select the two numeric columns, standardize them, then fit a
# class-weighted logistic regression.
model = Pipeline(
    steps=[
        (
            "selector",
            ColumnTransformer(
                transformers=[("numerical", "passthrough", ["purchase_value", "age"])]
            ),
        ),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
# Fit the pipeline on the training split.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['purchase_value', 'age'])])),
                ('scaler', StandardScaler()),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
# Training-split scores for the pipeline fitted in the previous cell.
evaluate_model(model=model, X=X_train, y=y_train, mode="Training")

Training recall: 0.483321
Training F-1 score: 0.158906


In [None]:
# Test-split scores for the same fitted pipeline.
evaluate_model(model=model, X=X_test, y=y_test, mode="Testing")

Testing recall: 0.480133
Testing F-1 score: 0.161367


In [None]:
# Test-set predictions from the currently fitted pipeline.
y_pred = model.predict(X=X_test)

In [None]:
# Mean of the training labels, re-displayed here (same expression as an earlier cell).
y_train.mean()

0.09310533467450191

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Categorical-only model: select the three categorical columns, one-hot encode
# them, then fit a class-weighted logistic regression.
model = Pipeline([
    # These columns are categorical, so the selector is labelled accordingly
    # (it was previously mislabelled "numerical").
    ("selector", ColumnTransformer([("categorical", "passthrough", ["sex", "browser", "source"])])),
    # Encode categories unseen during fit as all-zeros instead of raising at predict time.
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [None]:
# Fit the pipeline on the training split.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['sex', 'browser',
                                                   'source'])])),
                ('encoder', OneHotEncoder()),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
# Training-split scores for the pipeline fitted in the previous cell.
evaluate_model(model=model, X=X_train, y=y_train, mode="Training")

Training recall: 0.499716
Training F-1 score: 0.165651


In [None]:
# Test-split scores for the same fitted pipeline.
evaluate_model(model=model, X=X_test, y=y_test, mode="Testing")

Testing recall: 0.496249
Testing F-1 score: 0.167936


In [None]:
## Using Both numerical and Categorical

In [None]:
# Per-type preprocessing sub-pipelines, plugged into the ColumnTransformer of the
# combined model below.

# Numeric columns: standardize.
pipe_num = Pipeline([
    ("scaler", StandardScaler()),
])

# Categorical columns: one-hot encode. Categories unseen during fit are encoded as
# all-zeros instead of raising at predict time.
pip_cat = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Combined model: numeric columns go through pipe_num, categorical columns through
# pip_cat, and the joined features feed a class-weighted logistic regression.
model = Pipeline(
    steps=[
        (
            "selector",
            ColumnTransformer(
                transformers=[
                    ("numerical", pipe_num, ["age", "purchase_value"]),
                    ("categorical", pip_cat, ["sex", "browser", "source"]),
                ]
            ),
        ),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
# Fit the pipeline on the training split.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('selector',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'purchase_value']),
                                                 ('categorical',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  ['sex', 'browser',
                                                   'source'])])),
                ('classifier', LogisticRegression(class_weight='balanced'))])

In [None]:
# Training-split scores for the pipeline fitted in the previous cell.
evaluate_model(model=model, X=X_train, y=y_train, mode="Training")

Training recall: 0.509666
Training F-1 score: 0.168436


In [None]:
# Test-split scores for the same fitted pipeline.
evaluate_model(model=model, X=X_test, y=y_test, mode="Testing")

Testing recall: 0.496249
Testing F-1 score: 0.167936


## Feature Engineering

In [None]:
# Repeat customer/device
# Time diff
# Country