In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler

### **Load Data**

In [2]:
# # Read every column except 'device_fraud_count' as its value is a constant 0
df = pd.read_csv('Base.csv', usecols=lambda x: x != 'device_fraud_count')

### **Handle Missing Values**

In [3]:
# Features with missing values represented by negative values according to documentation
missing_features = ['prev_address_months_count', 'current_address_months_count', 'intended_balcon_amount',
                    'bank_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w']

# Replace negative values with NaN
for feature in missing_features:
    df[feature] = df[feature].apply(lambda x: x if x >= 0 else np.nan)

### **Encode missing values**

In [4]:
features_to_drop = ['prev_address_months_count', 'intended_balcon_amount', 'bank_months_count']
for col in features_to_drop:
    missing_column_name = f'{col}_missing'
    df[missing_column_name] = np.where(df[col].isna(), 1, 0)

Drop features with a high percentage of missing values, and have very weak correlation with fraud status.

In [5]:
df.drop(features_to_drop, axis=1, inplace=True)

Drop rows with missing values as a very small percentage of the remaining observations have missing values.

In [6]:
df.dropna(inplace=True)

### **Handle Categorical Features**

Perform dummy encoding. Very similar to one-hot encoding, but the first encoded column is dropped to reduce correlation between encoded columns.

In [7]:
# Only features with String data type need to be encoded
encoded_features = [feature for feature in df.columns if df[feature].dtype == 'object']

df = pd.get_dummies(df, columns=encoded_features, drop_first=True, dtype=int)

### **Train-Test Split**

In [8]:
# Separate the feature matrix and target variable
X = df.drop('fraud_bool', axis=1)
y = df['fraud_bool']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20, stratify=y)

### **Feature Scaling**

#### Min-Max Scaling (Normalization)

From EDA, numerical features were identified. Min-max scaling is applied as parametric models are sensitive to scale.

In [9]:
numeric_features = ['income', 'name_email_similarity', 'current_address_months_count', 'customer_age', 'days_since_request', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
                    'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'proposed_credit_limit', 'session_length_in_minutes']

scaler = MinMaxScaler()

# Fit only on the training data
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

### **Feature Selection - Backward Stepwise (logistic model)**

In [10]:
# import statsmodels.api as sm

# def backward_stepwise_selection(X, y, p_threshold=0.05):
#     features = X.columns.tolist()
#     num_features = len(features)

#     for i in range(num_features, 0, -1):
#         model = sm.Logit(y, X[features]).fit()
#         p_values = model.pvalues
#         max_p_value = p_values.max()
#         if max_p_value > p_threshold:
#             remove_feature = p_values.idxmax()
#             print(f"Removing '{remove_feature}' with p-value: {max_p_value:.4f}")
#             features.remove(remove_feature)
#         else:
#             break

#     return features

# selected_features = backward_stepwise_selection(X_train, y_train)
# print("Selected Features:", selected_features) #35 features
# #['income', 'name_email_similarity', 'customer_age', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
# # 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
# # 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
# # 'foreign_request', 'session_length_in_minutes', 'keep_alive_session', 'device_distinct_emails_8w', 'month',
# # 'payment_type_AC', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE',
# # 'employment_status_CF', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF',
# # 'source_TELEAPP', 'device_os_macintosh', 'device_os_windows', 'device_os_x11']

In [11]:
selected_features = ['income', 'name_email_similarity', 'customer_age', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
'foreign_request', 'session_length_in_minutes', 'keep_alive_session', 'device_distinct_emails_8w', 'month',
'payment_type_AC', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE',
'employment_status_CF', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF',
'source_TELEAPP', 'device_os_macintosh', 'device_os_windows', 'device_os_x11'] + ['prev_address_months_count_missing', 'intended_balcon_amount_missing', 'bank_months_count_missing']

X_train = X_train[selected_features]
X_test = X_test[selected_features]

### **Resampling**

Fraud class vs non fraud class

In [12]:
ratio = y.value_counts() / len(y) * 100
print(f'% of non-fraud class in y: {round(ratio[0],3)}%\n% of fraud class in y: {round(ratio[1],3)}%\n')

ratio_train = y_train.value_counts() / len(y_train) * 100
print(f'% of non-fraud class in y_train: {round(ratio_train[0],3)}%\n% of fraud class in y_train: {round(ratio_train[1],3)}%\n')

ratio_test = y_test.value_counts() / len(y_test) * 100
print(f'% of non-fraud class in y_test: {round(ratio_test[0],3)}%\n% of fraud class in y_test: {round(ratio_test[1],3)}%')

% of non-fraud class in y: 98.893%
% of fraud class in y: 1.107%

% of non-fraud class in y_train: 98.893%
% of fraud class in y_train: 1.107%

% of non-fraud class in y_test: 98.893%
% of fraud class in y_test: 1.107%


SMOTE

In [13]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42, sampling_strategy = 0.666) #ratio of minority:majority 40:60

Xt_resampled_SMOTE, yt_resampled_SMOTE = smote.fit_resample(X_train, y_train)

ratio_SMOTE = yt_resampled_SMOTE.value_counts() / len(yt_resampled_SMOTE) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_SMOTE[0],3)}%\n% of fraud class in resampled data: {round(ratio_SMOTE[1],3)}%')

% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%


### **Evaluation metric**

In [14]:
metrics_names = ['Ratio of Classes', 'Accuracy', 'Recall','Precision', 'F2 Score', 'F1.5 Score','F1 Score',
                 'TPR','FNR', "PR-AUC", 'Balanced Accuracy', 'Kappa Statistic']
results = pd.DataFrame(index= metrics_names,columns=['Original Dataset', 'SMOTE'])
class_reports = {}
pr_auc_pts = {}

results.loc['Ratio of Classes','Original Dataset'] = str(round(ratio_train,3)[0]) + '% : ' +str(round(ratio_train,3)[1])+'%'
results.loc['Ratio of Classes','SMOTE'] = str(round(ratio_SMOTE,3)[0]) + '% : ' +str(round(ratio_SMOTE,3)[1])+'%'

In [15]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, fbeta_score, f1_score, average_precision_score, precision_recall_curve, confusion_matrix,balanced_accuracy_score, cohen_kappa_score
def evaluate_results(y_test, y_pred):
    score_results = {}
    score_results["accuracy_score"] = accuracy_score(y_test, y_pred)
    score_results["classification_report"] = classification_report(y_test, y_pred)
    score_results["recall_score"] = recall_score(y_test, y_pred)
    score_results["precision_score"] = precision_score(y_test, y_pred)
    score_results["F2-score"] = fbeta_score(y_test, y_pred, beta =2)
    score_results["F1-score"] = f1_score(y_test, y_pred)
    score_results["average_precision_score"] = average_precision_score(y_test, y_pred)
    score_results["PR-AUC"] = precision_recall_curve(y_test, y_pred)
    score_results["balanced_accuracy_score"] = balanced_accuracy_score(y_test, y_pred)
    score_results["Kappa statistics"] = cohen_kappa_score(y_test,y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    TPR = TP/(TP+FN)
    FNR = FN/(TP+FN)
    score_results["TPR"] = TPR
    score_results["FNR"] = FNR
    return score_results

'''
## To use function
xgb_base_pred = xgb_base.predict(X_test_new)

xgb_base_results = evaluate_results(y_test=y_test_new, y_pred=xgb_base_pred)
'''


'\n## To use function\nxgb_base_pred = xgb_base.predict(X_test_new)\n\nxgb_base_results = evaluate_results(y_test=y_test_new, y_pred=xgb_base_pred)\n'

In [16]:
import pickle

def save_model(model, model_filename):
    pickle.dump(model, open(model_filename,"wb"))

# This function is more specific to the results from lazy classifier. feel free to overwrite it
def save_results(results_df, results_filename):
    results_df = results_df["evaluate_results"].reset_index()
    # Convert the 'Metrics' column into separate columns
    df_metrics = pd.json_normalize(results_df['evaluate_results'])

    # Concatenate the two DataFrames
    results_df = pd.concat([results_df['Model'].rename('Model'), df_metrics], axis=1)
    results_df.to_csv(results_filename, index=False)

def save_model_and_results(model, model_filename, results_df, results_filename):
    save_model(model, model_filename)
    save_results(results_df, results_filename)

'''
## Sample usage
save_model_and_results(clf_smote_encoded,"C:/NUS/Fraud-Hackathon/models/baseline_encoded.pkl", models_smote_encoded, "C:/NUS/Fraud-Hackathon/models/baseline_encoded_results.csv")
'''

'\n## Sample usage\nsave_model_and_results(clf_smote_encoded,"C:/NUS/Fraud-Hackathon/models/baseline_encoded.pkl", models_smote_encoded, "C:/NUS/Fraud-Hackathon/models/baseline_encoded_results.csv")\n'

### **Models**

#### **XGBoost**

In [18]:
# Device agnostic code
from numba import cuda

device = 'cuda' if cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier(n_estimators=100,
                    learning_rate=0.01,
                    colsample_bytree=0.8,
                    subsample=0.8,
                    device=device,
                    random_state=42)

smote = SMOTE(sampling_strategy = 0.666, random_state=42) #ratio of minority:majority 40:60
pipeline = make_pipeline(smote, xgb)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_params = {'xgbclassifier__max_depth': [2, 4, 6, 8, 10],
              'xgbclassifier__min_child_weight': [1, 2, 3, 4],
              'xgbclassifier__gamma': [i/10.0 for i in range(0,5)]
             }

search_xgb = HalvingGridSearchCV(estimator=pipeline,
                            param_grid=xgb_params,
                            cv=cv,
                            n_jobs=-1,
                            random_state=42)

search_xgb.fit(X_train, y_train)

In [None]:
search_xgb_pred = search_xgb.predict(X_test)

In [None]:
evaluate_results(y_test, search_xgb_pred)

{'accuracy_score': 0.9828655106128159,
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.99      0.99      0.99    196523\n           1       0.15      0.12      0.13      2199\n\n    accuracy                           0.98    198722\n   macro avg       0.57      0.56      0.56    198722\nweighted avg       0.98      0.98      0.98    198722\n',
 'recall_score': 0.11959981809913597,
 'precision_score': 0.15184757505773672,
 'F2-score': 0.12490501519756839,
 'F1-score': 0.13380819129992366,
 'average_precision_score': 0.027903195352348795,
 'PR-AUC': (array([0.01106571, 0.15184758, 1.        ]),
  array([1.        , 0.11959982, 0.        ]),
  array([0, 1])),
 'balanced_accuracy_score': 0.5560624330289495,
 'Kappa statistics': 0.125278686296608,
 'TPR': 0.11959981809913597,
 'FNR': 0.880400181900864}

##### **Hyperparameter tuning to improve model performance**

In [None]:
train_df = pd.concat([Xt_resampled_SMOTE, yt_resampled_SMOTE], axis=1)
predictors = [x for x in train_df if x not in ['fraud_bool']]

##### Tune max_depth and min_child_weight

In [None]:
from sklearn.model_selection import GridSearchCV
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=1000, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=20),
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(train_df[predictors],train_df['fraud_bool'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_
#{'max_depth': 9, 'min_child_weight': 1}

##### Tune gamma

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=1000, max_depth=9,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
 param_grid = param_test3, scoring='roc_auc',n_jobs=4, cv=5)
gsearch3.fit(train_df[predictors],train_df['fraud_bool'])
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_
#gamma: 0.0

In [None]:
xgb_tuned = XGBClassifier(max_depth=9, min_weight_child = 1, gamma = 0.0,
 seed=20)
evaluate_results(xgb_tuned,"XGBoost",X_test, y_test)