In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, fbeta_score, f1_score, make_scorer, \
                            average_precision_score, precision_recall_curve, confusion_matrix,balanced_accuracy_score, cohen_kappa_score
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import fbeta_score, make_scorer

### **Load Data**

In [7]:
# # Read every column except 'device_fraud_count' as its value is a constant 0
df = pd.read_csv('../data/raw/Base.csv', usecols=lambda x: x != 'device_fraud_count')

### **Handle Missing Values**

In [8]:
# Features with missing values represented by negative values according to documentation
missing_features = ['prev_address_months_count', 'current_address_months_count', 'intended_balcon_amount',
                    'bank_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w']

# Replace negative values with NaN
for feature in missing_features:
    df[feature] = df[feature].apply(lambda x: x if x >= 0 else np.nan)

### **Encode missing values**

In [9]:
features_to_drop = ['prev_address_months_count', 'intended_balcon_amount', 'bank_months_count']
for col in features_to_drop:
    missing_column_name = f'{col}_missing'
    df[missing_column_name] = np.where(df[col].isna(), 1, 0)

Drop features with a high percentage of missing values, and have very weak correlation with fraud status.

In [10]:
df.drop(features_to_drop, axis=1, inplace=True)

Drop rows with missing values as a very small percentage of the remaining observations have missing values.

In [11]:
df.dropna(inplace=True)

### **Handle Categorical Features**

Perform dummy encoding. Very similar to one-hot encoding, but the first encoded column is dropped to reduce correlation between encoded columns.

In [12]:
# Only features with String data type need to be encoded
encoded_features = [feature for feature in df.columns if df[feature].dtype == 'object']

df = pd.get_dummies(df, columns=encoded_features, drop_first=True, dtype=int)

### **Train-Test Split**

In [13]:
# Separate the feature matrix and target variable
X = df.drop('fraud_bool', axis=1)
y = df['fraud_bool']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20, stratify=y)

### **Feature Scaling**

#### Min-Max Scaling (Normalization)

From EDA, numerical features were identified. Min-max scaling is applied as parametric models are sensitive to scale.

In [14]:
numeric_features = ['income', 'name_email_similarity', 'current_address_months_count', 'customer_age', 'days_since_request', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
                    'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'proposed_credit_limit', 'session_length_in_minutes']

scaler = MinMaxScaler()

# Fit only on the training data
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

### **Feature Selection - Backward Stepwise (logistic model)**

In [15]:
def backward_stepwise_selection(X, y, p_threshold=0.05):
    features = X.columns.tolist()
    num_features = len(features)

    for i in range(num_features, 0, -1):
        model = sm.Logit(y, X[features]).fit()
        p_values = model.pvalues
        max_p_value = p_values.max()
        if max_p_value > p_threshold:
            remove_feature = p_values.idxmax()
            print(f"Removing '{remove_feature}' with p-value: {max_p_value:.4f}")
            features.remove(remove_feature)
        else:
            break

    return features

selected_features = backward_stepwise_selection(X_train, y_train)
print("Selected Features:", selected_features) 

In [16]:
# selected_features = ['income', 'name_email_similarity', 'customer_age', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
# 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
# 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
# 'foreign_request', 'session_length_in_minutes', 'keep_alive_session', 'device_distinct_emails_8w', 'month',
# 'payment_type_AC', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE',
# 'employment_status_CF', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF',
# 'source_TELEAPP', 'device_os_macintosh', 'device_os_windows', 'device_os_x11'] + ['prev_address_months_count_missing', 'intended_balcon_amount_missing', 'bank_months_count_missing']

In [None]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

### **Resampling**

Fraud class vs non fraud class

In [17]:
ratio = y.value_counts() / len(y) * 100
print(f'% of non-fraud class in y: {round(ratio[0],3)}%\n% of fraud class in y: {round(ratio[1],3)}%\n')

ratio_train = y_train.value_counts() / len(y_train) * 100
print(f'% of non-fraud class in y_train: {round(ratio_train[0],3)}%\n% of fraud class in y_train: {round(ratio_train[1],3)}%\n')

ratio_test = y_test.value_counts() / len(y_test) * 100
print(f'% of non-fraud class in y_test: {round(ratio_test[0],3)}%\n% of fraud class in y_test: {round(ratio_test[1],3)}%')

% of non-fraud class in y: 98.893%
% of fraud class in y: 1.107%

% of non-fraud class in y_train: 98.893%
% of fraud class in y_train: 1.107%

% of non-fraud class in y_test: 98.893%
% of fraud class in y_test: 1.107%


SMOTE

In [18]:
smote = SMOTE(random_state=42, sampling_strategy = 0.666) #ratio of minority:majority 40:60

Xt_resampled_SMOTE, yt_resampled_SMOTE = smote.fit_resample(X_train, y_train)

ratio_SMOTE = yt_resampled_SMOTE.value_counts() / len(yt_resampled_SMOTE) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_SMOTE[0],3)}%\n% of fraud class in resampled data: {round(ratio_SMOTE[1],3)}%')

% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%


### **Evaluation metric**

In [19]:
metrics_names = ['Ratio of Classes', 'Accuracy', 'Recall','Precision', 'F2 Score', 'F1.5 Score','F1 Score',
                 'TPR','FNR', "PR-AUC", 'Balanced Accuracy', 'Kappa Statistic']
results = pd.DataFrame(index= metrics_names,columns=['Original Dataset', 'SMOTE'])
class_reports = {}
pr_auc_pts = {}

results.loc['Ratio of Classes','Original Dataset'] = str(round(ratio_train,3)[0]) + '% : ' +str(round(ratio_train,3)[1])+'%'
results.loc['Ratio of Classes','SMOTE'] = str(round(ratio_SMOTE,3)[0]) + '% : ' +str(round(ratio_SMOTE,3)[1])+'%'

In [20]:
def evaluate_results(y_test, y_pred):
    score_results = {}
    score_results["accuracy_score"] = accuracy_score(y_test, y_pred)
    score_results["classification_report"] = classification_report(y_test, y_pred)
    score_results["recall_score"] = recall_score(y_test, y_pred)
    score_results["precision_score"] = precision_score(y_test, y_pred)
    score_results["F2-score"] = fbeta_score(y_test, y_pred, beta =2)
    score_results["F1-score"] = f1_score(y_test, y_pred)
    score_results["average_precision_score"] = average_precision_score(y_test, y_pred)
    score_results["PR-AUC"] = precision_recall_curve(y_test, y_pred)
    score_results["balanced_accuracy_score"] = balanced_accuracy_score(y_test, y_pred)
    score_results["Kappa statistics"] = cohen_kappa_score(y_test,y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    TPR = TP/(TP+FN)
    FNR = FN/(TP+FN)
    score_results["TPR"] = TPR
    score_results["FNR"] = FNR
    return score_results

'''
## To use function
xgb_base_pred = xgb_base.predict(X_test_new)

xgb_base_results = evaluate_results(y_test=y_test_new, y_pred=xgb_base_pred)
'''


'\n## To use function\nxgb_base_pred = xgb_base.predict(X_test_new)\n\nxgb_base_results = evaluate_results(y_test=y_test_new, y_pred=xgb_base_pred)\n'

In [21]:
import pickle

def save_model(model, model_filename):
    pickle.dump(model, open(model_filename,"wb"))

# This function is more specific to the results from lazy classifier. feel free to overwrite it
def save_results(results_df, results_filename):
    results_df = results_df["evaluate_results"].reset_index()
    # Convert the 'Metrics' column into separate columns
    df_metrics = pd.json_normalize(results_df['evaluate_results'])

    # Concatenate the two DataFrames
    results_df = pd.concat([results_df['Model'].rename('Model'), df_metrics], axis=1)
    results_df.to_csv(results_filename, index=False)

def save_model_and_results(model, model_filename, results_df, results_filename):
    save_model(model, model_filename)
    save_results(results_df, results_filename)

'''
## Sample usage
save_model_and_results(clf_smote_encoded,"C:/NUS/Fraud-Hackathon/models/baseline_encoded.pkl", models_smote_encoded, "C:/NUS/Fraud-Hackathon/models/baseline_encoded_results.csv")
'''

'\n## Sample usage\nsave_model_and_results(clf_smote_encoded,"C:/NUS/Fraud-Hackathon/models/baseline_encoded.pkl", models_smote_encoded, "C:/NUS/Fraud-Hackathon/models/baseline_encoded_results.csv")\n'

### **Models**

#### **XGBoost**

In [None]:
# Device agnostic code
from numba import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
device

In [23]:
xgb = XGBClassifier(n_estimators=100,
                    learning_rate=0.01,
                    colsample_bytree=0.8,
                    subsample=0.8,
                    device=device,
                    random_state=42)

xgb.fit(Xt_resampled_SMOTE, yt_resampled_SMOTE)
xgb_pred = xgb.predict(X_test)

In [24]:
save_model(xgb, '../models/xgboost.pkl')
# pickle.dump(xgb, open('../models/xgboost.pkl','wb'))

In [23]:
xgb_results = evaluate_results(y_test, xgb_pred)
xgb_results

{'accuracy_score': 0.983152343474804,
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.99      0.99      0.99    196523\n           1       0.13      0.10      0.11      2199\n\n    accuracy                           0.98    198722\n   macro avg       0.56      0.54      0.55    198722\nweighted avg       0.98      0.98      0.98    198722\n',
 'recall_score': 0.0964074579354252,
 'precision_score': 0.13477431659249842,
 'F2-score': 0.10222779438711543,
 'F1-score': 0.11240721102863202,
 'average_precision_score': 0.022992142183462808,
 'PR-AUC': (array([0.01106571, 0.13477432, 1.        ]),
  array([1.        , 0.09640746, 0.        ]),
  array([0, 1])),
 'balanced_accuracy_score': 0.5447410299452063,
 'Kappa statistics': 0.10413908853985965,
 'TPR': 0.0964074579354252,
 'FNR': 0.9035925420645748}

In [25]:
print(xgb_results['classification_report'])

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    196523
           1       0.13      0.10      0.11      2199

    accuracy                           0.98    198722
   macro avg       0.56      0.54      0.55    198722
weighted avg       0.98      0.98      0.98    198722



##### Tune XGBoost

In [27]:
f2_scorer = make_scorer(fbeta_score, beta=2)

xgb = XGBClassifier(n_estimators=100,
                    learning_rate=0.01,
                    colsample_bytree=0.8,
                    subsample=0.8,
                    device=device,
                    random_state=42)

smote = SMOTE(sampling_strategy = 0.666, random_state=42) #ratio of minority:majority 40:60
pipeline = make_pipeline(smote, xgb)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_params = {'xgbclassifier__max_depth': [2, 4, 6, 8, 10],
              'xgbclassifier__min_child_weight': [1, 2, 3, 4],
              'xgbclassifier__gamma': [i/10.0 for i in range(0,5)]
             }

search_xgb = HalvingGridSearchCV(estimator=pipeline,
                            param_grid=xgb_params,
                            cv=cv,
                            n_jobs=-1,
                            random_state=42,
                            scoring = f2_scorer,
                            verbose=2)

search_xgb.fit(X_train, y_train)

print(f'CV Results: {search_xgb.cv_results_}')
print(f'Best parameters: {search_xgb.best_params_}')
print(f'Best Score: {search_xgb.best_score_}')

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 9813
max_resources_: 794885
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 9813
Fitting 5 folds for each of 100 candidates, totalling 500 fits
----------
iter: 1
n_candidates: 34
n_resources: 29439
Fitting 5 folds for each of 34 candidates, totalling 170 fits
----------
iter: 2
n_candidates: 12
n_resources: 88317
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 3
n_candidates: 4
n_resources: 264951
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 794853
Fitting 5 folds for each of 2 candidates, totalling 10 fits
CV Results: {'iter': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0

In [25]:
# # import pickle
# pickle.dump(search_xgb, open('../models/xgboost_tuned.pkl','wb'))
save_model(search_xgb, '../models/xgboost_tuned.pkl')
search_xgb

In [28]:
search_xgb_pred = search_xgb.predict(X_test)

In [29]:
xgb_tuned_results = evaluate_results(y_test, search_xgb_pred)
xgb_tuned_results

{'accuracy_score': 0.9725445597367176,
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.99      0.98      0.99    196523\n           1       0.08      0.15      0.11      2199\n\n    accuracy                           0.97    198722\n   macro avg       0.54      0.56      0.55    198722\nweighted avg       0.98      0.97      0.98    198722\n',
 'recall_score': 0.14643019554342884,
 'precision_score': 0.08254293770827993,
 'F2-score': 0.12680160667874302,
 'F1-score': 0.10557377049180328,
 'average_precision_score': 0.021532134333066028,
 'PR-AUC': (array([0.01106571, 0.08254294, 1.        ]),
  array([1.       , 0.1464302, 0.       ]),
  array([0, 1])),
 'balanced_accuracy_score': 0.5641092933620524,
 'Kappa statistics': 0.09273301734134598,
 'TPR': 0.14643019554342884,
 'FNR': 0.8535698044565712}

In [30]:
print(xgb_tuned_results['classification_report'])

              precision    recall  f1-score   support

           0       0.99      0.98      0.99    196523
           1       0.08      0.15      0.11      2199

    accuracy                           0.97    198722
   macro avg       0.54      0.56      0.55    198722
weighted avg       0.98      0.97      0.98    198722



In [35]:
# calculate uplift for metrics after tuning
for k,v in xgb_tuned_results.items():
    if k in ['classification_report', 'PR-AUC']:
        continue
    print(f'{k} uplift = {(v - xgb_results[k])*100:.3f}%')

accuracy_score uplift = -1.061%
recall_score uplift = 5.002%
precision_score uplift = -5.223%
F2-score uplift = 2.457%
F1-score uplift = -0.683%
average_precision_score uplift = -0.146%
balanced_accuracy_score uplift = 1.937%
Kappa statistics uplift = -1.141%
TPR uplift = 5.002%
FNR uplift = -5.002%
