In [1]:
import pandas as pd

fraud = pd.read_csv("../Data/processed/fraud_processed.csv")
fraud.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,hour_of_day,day_of_week,time_since_signup,user_txn_count
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0,2,5,1251.856111,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0,1,0,4.984444,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,18,3,0.000278,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0,13,0,136.690278,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,18,2,1211.516944,1


In [2]:
X = fraud.drop(
    columns=[
        'class',
        'signup_time',
        'purchase_time',
        'device_id'
    ]
)
y = fraud['class']
X.columns

Index(['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age',
       'ip_address', 'hour_of_day', 'day_of_week', 'time_since_signup',
       'user_txn_count'],
      dtype='object')

In [3]:
X = pd.get_dummies(
    X,
    columns=['source', 'browser', 'sex'],
    drop_first=True
)
X.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('bool')], dtype=object)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120889, 15), (30223, 15), (120889,), (30223,))

In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
y_train_res.value_counts(normalize=True)

class
0    0.5
1    0.5
Name: proportion, dtype: float64

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_recall_curve, auc, confusion_matrix

lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:,1]
f1 = f1_score(y_test, y_pred)
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

cm = confusion_matrix(y_test, y_pred)
f1, pr_auc, cm

(0.2921365672794686,
 0.28582128767856985,
 array([[19168,  8225],
        [  939,  1891]]))

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_res, y_train_res)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

f1_rf = f1_score(y_test, y_pred_rf)
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_proba_rf)
pr_auc_rf = auc(recall_rf, precision_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)

f1_rf, pr_auc_rf, cm_rf

(0.6011446615354253,
 0.6207642793661154,
 array([[26679,   714],
        [ 1307,  1523]]))

In [8]:
X_test_sample = X_test.sample(n=1000, random_state=42)

In [9]:
import shap

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test_sample)

In [15]:
X_train_res.dtypes

user_id                int64
purchase_value         int64
age                    int64
ip_address             int64
hour_of_day            int64
day_of_week            int64
time_since_signup    float64
user_txn_count         int64
source_Direct           bool
source_SEO              bool
browser_FireFox         bool
browser_IE              bool
browser_Opera           bool
browser_Safari          bool
sex_M                   bool
dtype: object

In [16]:
X_test_sample.dtypes

user_id                int64
purchase_value         int64
age                    int64
ip_address             int64
hour_of_day            int64
day_of_week            int64
time_since_signup    float64
user_txn_count         int64
source_Direct           bool
source_SEO              bool
browser_FireFox         bool
browser_IE              bool
browser_Opera           bool
browser_Safari          bool
sex_M                   bool
dtype: object

In [17]:
X_train_res_num = X_train_res.astype(float)
X_test_sample_num = X_test_sample.astype(float)

In [20]:
X_train_res_num = X_train_res.apply(pd.to_numeric, errors="coerce")
X_test_sample_num = X_test_sample.apply(pd.to_numeric, errors="coerce")

In [21]:
print(X_train_res_num.isna().sum()[X_train_res_num.isna().sum() > 0])

Series([], dtype: int64)


In [23]:
X_train_res_num = X_train_res_num.fillna(X_train_res_num.mean())
X_test_sample_num = X_test_sample_num.fillna(X_train_res_num.mean())

In [24]:
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test_sample_num)

In [26]:
X_test_sample_num = X_test_sample_num[X_train_res_num.columns]

In [27]:
shap.summary_plot(
    shap_values[1],   
    X_test_sample_num,       
    plot_type="bar",
    feature_names=X_test_sample_num.columns
)

AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.