In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler

### **Load Data**

In [96]:
# 'device_fraud_count' is skipped at read time because its value is a constant 0
df = pd.read_csv(
    'Base.csv',
    usecols=lambda column_name: column_name != 'device_fraud_count',
)

### **Handle Missing Values**

In [97]:
# Features whose missing values are encoded as negative numbers according to the documentation
missing_features = ['prev_address_months_count', 'current_address_months_count', 'intended_balcon_amount', 
                    'bank_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w']

# Replace negative values with NaN.
# Series.where keeps the entries for which the condition holds and fills every
# other entry with NaN, so this is a vectorized equivalent of calling
# `x if x >= 0 else np.nan` once per row (existing NaN values stay NaN because
# `NaN >= 0` is False).
for feature in missing_features:
    df[feature] = df[feature].where(df[feature] >= 0)

Drop features that have a high percentage of missing values and a very weak correlation with fraud status.

In [98]:
# Columns removed from the frame; the drop below mutates df in place
features_to_drop = [
    'prev_address_months_count',
    'intended_balcon_amount',
    'bank_months_count',
]

df.drop(columns=features_to_drop, inplace=True)

Drop rows with missing values, since only a very small percentage of the remaining observations contain any.

In [99]:
# Remove, in place, every row that still holds at least one NaN
df.dropna(axis=0, how='any', inplace=True)

### **Handle Categorical Features**

Perform dummy encoding. Very similar to one-hot encoding, but the first encoded column is dropped to reduce correlation between encoded columns.

In [100]:
# Columns stored with the 'object' dtype (strings) are the only ones that get encoded
encoded_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# drop_first=True discards the first dummy column of every encoded feature
df = pd.get_dummies(
    df,
    columns=encoded_features,
    drop_first=True,
    dtype=int,
)

### **Train-Test Split**

In [101]:
# Target column on one side, every remaining column on the other
y = df['fraud_bool']
X = df.drop(columns='fraud_bool')

# 80/20 split with a fixed seed; the target is passed as the stratification labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### **Feature Scaling**

#### Min-Max Scaling (Normalization)

From EDA, numerical features were identified. Min-max scaling is applied as parametric models are sensitive to scale.

In [102]:
numeric_features = [
    'income', 'name_email_similarity', 'current_address_months_count', 'customer_age',
    'days_since_request', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
    'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w',
    'credit_risk_score', 'proposed_credit_limit', 'session_length_in_minutes',
]

scaler = MinMaxScaler()

# The scaling parameters are learned from the training rows only,
# then the same fitted scaler is applied to both splits.
scaler.fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

### **Feature Selection - Backward Stepwise (logistic model)**

In [103]:
import statsmodels.api as sm

def backward_stepwise_selection(X, y, p_threshold=0.05, add_intercept=False):
    """Backward stepwise feature elimination driven by logistic-regression p-values.

    Starting from every column of ``X``, a ``sm.Logit`` model is fitted and the
    feature with the largest p-value is dropped, as long as that p-value is
    above ``p_threshold``. The procedure stops as soon as the largest p-value
    is no longer above the threshold, or when no feature is left.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; column names are used as feature names.
    y : array-like
        Binary target passed unchanged to ``sm.Logit``.
    p_threshold : float, default 0.05
        A feature is removed only while the largest p-value exceeds this value.
    add_intercept : bool, default False
        ``sm.Logit`` does not add an intercept on its own. With the default
        (False) the model is fitted exactly on ``X[features]``, i.e. without an
        intercept. When True, a ``'const'`` column is added to every fit; the
        intercept is never a candidate for removal and is not returned.

    Returns
    -------
    list of str
        Names of the features that were kept, in their original column order.

    Raises
    ------
    ValueError
        If ``add_intercept`` is True and ``X`` already has a column named
        ``'const'`` (the name used for the added intercept).
    """
    if add_intercept and 'const' in X.columns:
        raise ValueError("X already contains a 'const' column; rename it or use add_intercept=False")

    features = X.columns.tolist()

    # Every pass either removes exactly one feature or stops, so the loop runs
    # at most once per original column.
    while features:
        design = X[features]
        if add_intercept:
            design = sm.add_constant(design, has_constant='add')

        model = sm.Logit(y, design).fit()
        p_values = model.pvalues
        if add_intercept:
            # Only real features compete for removal
            p_values = p_values.drop('const')

        max_p_value = p_values.max()
        # Written as `>` with an else-branch on purpose: a NaN maximum compares
        # False and therefore ends the selection instead of removing a feature.
        if max_p_value > p_threshold:
            remove_feature = p_values.idxmax()
            print(f"Removing '{remove_feature}' with p-value: {max_p_value:.4f}")
            features.remove(remove_feature)
        else:
            break

    return features

selected_features = backward_stepwise_selection(X=X_train, y=y_train)
print("Selected Features:", selected_features)
# The recorded run kept these 35 features:
#['income', 'name_email_similarity', 'customer_age', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 
# 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 
# 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
# 'foreign_request', 'session_length_in_minutes', 'keep_alive_session', 'device_distinct_emails_8w', 'month',
# 'payment_type_AC', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE',
# 'employment_status_CF', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF',
# 'source_TELEAPP', 'device_os_macintosh', 'device_os_windows', 'device_os_x11']



         Current function value: 0.049074
         Iterations: 35




Removing 'housing_status_BG' with p-value: 0.9690
Optimization terminated successfully.
         Current function value: 0.049078
         Iterations 10
Removing 'device_os_other' with p-value: 0.8933
Optimization terminated successfully.
         Current function value: 0.049078
         Iterations 10
Removing 'payment_type_AB' with p-value: 0.8154
Optimization terminated successfully.
         Current function value: 0.049078
         Iterations 10
Removing 'employment_status_CG' with p-value: 0.7637
Optimization terminated successfully.
         Current function value: 0.049078
         Iterations 10
Removing 'payment_type_AE' with p-value: 0.3159
Optimization terminated successfully.
         Current function value: 0.049079
         Iterations 10
Removing 'current_address_months_count' with p-value: 0.1137
Optimization terminated successfully.
         Current function value: 0.049081
         Iterations 10
Removing 'days_since_request' with p-value: 0.1227
Optimization terminated

In [105]:
# Hard-coded copy of the 35 feature names listed as the result of the
# backward stepwise selection run.
selected_features = [
    # numeric / count features
    'income', 'name_email_similarity', 'customer_age', 'zip_count_4w',
    'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
    'date_of_birth_distinct_emails_4w', 'credit_risk_score',
    # flags and remaining raw columns
    'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards',
    'proposed_credit_limit', 'foreign_request', 'session_length_in_minutes',
    'keep_alive_session', 'device_distinct_emails_8w', 'month',
    # dummy-encoded columns
    'payment_type_AC',
    'employment_status_CB', 'employment_status_CC', 'employment_status_CD',
    'employment_status_CE', 'employment_status_CF',
    'housing_status_BB', 'housing_status_BC', 'housing_status_BD',
    'housing_status_BE', 'housing_status_BF',
    'source_TELEAPP',
    'device_os_macintosh', 'device_os_windows', 'device_os_x11',
]

# Keep only the selected columns in both splits
X_train, X_test = X_train[selected_features], X_test[selected_features]

### **Resampling**

Fraud class vs. non-fraud class

In [106]:
# Percentage share of each class in the full target and in both splits
ratio = y.value_counts().div(len(y)).mul(100)
ratio_train = y_train.value_counts().div(len(y_train)).mul(100)
ratio_test = y_test.value_counts().div(len(y_test)).mul(100)

print(f'% of non-fraud class in y: {round(ratio[0],3)}%\n% of fraud class in y: {round(ratio[1],3)}%\n')
print(f'% of non-fraud class in y_train: {round(ratio_train[0],3)}%\n% of fraud class in y_train: {round(ratio_train[1],3)}%\n')
print(f'% of non-fraud class in y_test: {round(ratio_test[0],3)}%\n% of fraud class in y_test: {round(ratio_test[1],3)}%')

% of non-fraud class in y: 98.893%
% of fraud class in y: 1.107%

% of non-fraud class in y_train: 98.893%
% of fraud class in y_train: 1.107%

% of non-fraud class in y_test: 98.893%
% of fraud class in y_test: 1.107%


SMOTE

In [107]:
from imblearn.over_sampling import SMOTE
# sampling_strategy is the minority:majority ratio requested after resampling;
# 0.666 gives roughly a 40:60 split
smote = SMOTE(sampling_strategy=0.666, random_state=42)

# Only the training split is resampled
Xt_resampled_SMOTE, yt_resampled_SMOTE = smote.fit_resample(X_train, y_train)

ratio_SMOTE = yt_resampled_SMOTE.value_counts().div(len(yt_resampled_SMOTE)).mul(100)
print(f'% of non-fraud class in resampled data: {round(ratio_SMOTE[0],3)}%\n% of fraud class in resampled data: {round(ratio_SMOTE[1],3)}%')

% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%


### **Evaluation metric**

In [109]:
# Row labels of the comparison table; one column per training-set variant
metrics_names = [
    'Ratio of Classes', 'Accuracy', 'Recall', 'Precision',
    'F2 Score', 'F1.5 Score', 'F1 Score',
    'TPR', 'FNR', "PR-AUC", 'Balanced Accuracy', 'Kappa Statistic',
]
results = pd.DataFrame(index=metrics_names, columns=['Original Dataset', 'SMOTE'])

# Filled per variant by the evaluation step
class_reports = {}
pr_auc_pts = {}

# Record the class balance of each training set as "<class 0>% : <class 1>%"
for column_label, class_ratio in (('Original Dataset', ratio_train), ('SMOTE', ratio_SMOTE)):
    rounded_ratio = round(class_ratio, 3)
    results.loc['Ratio of Classes', column_label] = str(rounded_ratio[0]) + '% : ' + str(rounded_ratio[1]) + '%'

In [110]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, fbeta_score, f1_score, average_precision_score, precision_recall_curve, confusion_matrix,balanced_accuracy_score, cohen_kappa_score
def evaluate_results(model,resampler,x_resampled, y_resampled):
    """Fit ``model`` on the given training data and record its test-set metrics.

    The model is fitted on ``x_resampled`` / ``y_resampled`` and evaluated on
    the notebook-level ``X_test`` / ``y_test``. Every metric is written to the
    ``resampler`` column of the notebook-level ``results`` table, the
    classification report to ``class_reports[resampler]`` and the
    precision-recall curve points to ``pr_auc_pts[resampler]``; all values are
    also printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. ``predict_proba`` or
        ``decision_function`` is used for the PR-AUC when available.
    resampler : str
        Column label in ``results`` and key in ``class_reports`` /
        ``pr_auc_pts`` (e.g. 'Original Dataset', 'SMOTE').
    x_resampled, y_resampled
        Training features and labels passed to ``model.fit``.
    """
    model.fit(x_resampled, y_resampled)

    y_pred_test = model.predict(X_test)

    # PR-AUC and the precision-recall curve are ranking metrics: they need a
    # continuous score per sample. Feeding them hard 0/1 predictions reduces
    # the curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        # Column 1 is taken as the positive (fraud = 1) class, which assumes
        # the usual [0, 1] class ordering of a binary classifier.
        y_score_test = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score_test = model.decision_function(X_test)
    else:
        # No score available: fall back to the predicted labels
        y_score_test = y_pred_test

    results.loc['Accuracy',resampler] = accuracy_score(y_test, y_pred_test)
    class_reports[resampler] = classification_report(y_test, y_pred_test)
    results.loc['Recall',resampler] = recall_score(y_test, y_pred_test)
    results.loc['Precision',resampler] = precision_score(y_test, y_pred_test)
    results.loc['F2 Score',resampler] = fbeta_score(y_test, y_pred_test, beta =2)
    results.loc['F1.5 Score',resampler] = fbeta_score(y_test, y_pred_test, beta =1.5)
    results.loc['F1 Score',resampler] = f1_score(y_test, y_pred_test)
    results.loc['PR-AUC',resampler] = average_precision_score(y_test, y_score_test)
    pr_auc_pts[resampler] = precision_recall_curve(y_test, y_score_test)
    results.loc['Balanced Accuracy',resampler] = balanced_accuracy_score(y_test, y_pred_test)
    results.loc['Kappa Statistic',resampler] = cohen_kappa_score(y_test, y_pred_test)
    
    # labels=[0,1] fixes the matrix layout so ravel() yields TN, FP, FN, TP
    cm = confusion_matrix(y_test, y_pred_test, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    TPR = TP/(TP+FN)
    FNR = FN/(TP+FN)
    results.loc['TPR',resampler] = TPR
    results.loc['FNR',resampler] = FNR

    print(f"{resampler} Model Performance on Test Data:")
    print(f"{resampler} Accuracy:", results.loc['Accuracy',resampler])
    # (printed label, row of `results`) pairs, in reporting order
    printed_metrics = [
        ('Precision', 'Precision'),
        ('Recall', 'Recall'),
        ('F2', 'F2 Score'),
        ('F1.5', 'F1.5 Score'),
        ('F1', 'F1 Score'),
        ('PR-AUC', 'PR-AUC'),
        ('TPR', 'TPR'),
        ('FNR', 'FNR'),
        ('Balanced Accuracy', 'Balanced Accuracy'),
        ('Kappa Statistic', 'Kappa Statistic'),
    ]
    for printed_label, metric_row in printed_metrics:
        print(f"{resampler} {printed_label}: {results.loc[metric_row,resampler]}")
    print(f"{resampler} Classification Report: \n{class_reports[resampler]}")