In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold

In [2]:
base = pd.read_csv("../data/raw/Base.csv")

In [3]:
base.dtypes

fraud_bool                            int64
income                              float64
name_email_similarity               float64
prev_address_months_count             int64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                 

In [4]:
base["payment_type"].unique()

array(['AA', 'AD', 'AB', 'AC', 'AE'], dtype=object)

In [5]:
base.groupby("fraud_bool")["fraud_bool"].count()

fraud_bool
0    988971
1     11029
Name: fraud_bool, dtype: int64

In [6]:
base

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.475100,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0.8,0.124690,-1,143,30,0.051348,-0.826239,AB,530,...,0,1500.0,0,INTERNET,16.967770,other,0,1,0,7
999996,0,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,...,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,...,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0,0.9,0.002480,52,3,30,0.023357,-1.313387,AB,707,...,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7


## Feature Engineering

In [7]:
X = base.loc[ : , base.columns != 'fraud_bool']

In [8]:
y = base[['fraud_bool']]

In [9]:
X.shape, y.shape

((1000000, 31), (1000000, 1))

### Scaling

In [10]:
# categorical columns with numeric dtype (Either binary or month)
binary_cols = [ 'email_is_free', 'phone_home_valid', 'phone_mobile_valid','has_other_cards', 'foreign_request','keep_alive_session','month']
# change dtype to object
X[binary_cols] = X[binary_cols].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[binary_cols] = X[binary_cols].astype(object)


In [11]:
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,13096.035018,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,9223.283431,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,4471.472149,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0.6,0.475100,11,14,30,0.006991,-1.863101,AB,3483,14431.993621,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,7601.511579,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.8,0.124690,-1,143,30,0.051348,-0.826239,AB,530,6732.602414,...,0,1500.0,0,INTERNET,16.967770,other,0,1,0,7
999996,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,1574.293294,...,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,1258.864938,...,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0.9,0.002480,52,3,30,0.023357,-1.313387,AB,707,7048.137128,...,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7


In [12]:
numeric_cols = X.select_dtypes(include=['int', 'float']).columns

In [13]:
#Feature engineering - scaling
scaler = MinMaxScaler()

# Apply Min-Max Scaling to all numeric columns in X
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,AA,0.157934,0.785651,...,0,0.685864,0,INTERNET,0.198216,linux,1,0.666667,0.0,0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,AD,0.247350,0.556307,...,0,0.685864,0,INTERNET,0.050217,other,1,0.666667,0.0,0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,AB,0.163308,0.274904,...,0,0.005236,0,INTERNET,0.273082,windows,0,0.666667,0.0,0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,AB,0.519779,0.864767,...,0,0.005236,0,INTERNET,0.186605,linux,1,0.666667,0.0,0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,AA,0.349007,0.460265,...,0,0.005236,0,INTERNET,0.054581,other,0,0.666667,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,AB,0.078967,0.408808,...,0,0.685864,0,INTERNET,0.206766,other,0,0.666667,0.0,7
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,AC,0.060755,0.103333,...,1,0.424084,0,INTERNET,0.028816,macintosh,0,0.666667,0.0,7
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,AA,0.111658,0.084653,...,0,0.005236,0,INTERNET,0.196418,other,0,0.666667,0.0,7
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,AB,0.105389,0.427494,...,0,0.005236,0,INTERNET,0.027373,linux,1,0.666667,0.0,7


### Pearson Correlation - detect mutlicollinearity

In [14]:
import pandas as pd
import numpy as np

# Calculate the Pearson correlation coefficients
correlation_matrix = X.corr(method='pearson', numeric_only=True)

# Get the absolute values of the correlation coefficients
correlation_matrix_abs = correlation_matrix.abs()

# Select upper triangle of correlation matrix (excluding diagonal)
upper_triangle = correlation_matrix_abs.where(
    np.triu(np.ones(correlation_matrix_abs.shape), k=1).astype(np.bool_))

# Find features with correlation greater than a threshold
threshold = 0.85  # Example threshold
high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

# Print highly correlated features
print("Highly Correlated Features:")
print(high_correlation_features)     #None

Highly Correlated Features:
[]


### Variance Threshold

In [15]:
vt = VarianceThreshold(threshold=0.0)

# Fit the VarianceThreshold to the selected numeric columns in X
vt.fit(X[numeric_cols])

# Get the boolean mask of selected numeric features
selected_numeric_mask = vt.get_support()

# Get the names of the selected numeric features
selected_numeric_features = X[numeric_cols].columns[selected_numeric_mask]

print(f'dropped columns: {set(X[numeric_cols].columns).difference(set(selected_numeric_features))}')

# Filter X to keep only selected numeric features
X_numeric_selected = X[selected_numeric_features]
#device_fraud_count is dropped 

# Get the remaining categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Combine selected numeric features with categorical columns
X_selected = pd.concat([X_numeric_selected, X[categorical_cols]], axis=1)
X = X_selected
X

dropped columns: {'device_fraud_count'}


Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,has_other_cards,foreign_request,source,device_os,keep_alive_session,month
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,0.157934,0.785651,0.798218,...,1,BC,0,1,0,0,INTERNET,linux,1,0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,0.247350,0.556307,0.541631,...,1,BC,1,1,0,0,INTERNET,other,1,0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,0.163308,0.274904,0.508333,...,1,BC,0,1,0,0,INTERNET,windows,0,0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,0.519779,0.864767,0.664714,...,1,BC,0,1,0,0,INTERNET,linux,1,0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,0.349007,0.460265,0.465935,...,0,BC,1,1,0,0,INTERNET,other,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,0.078967,0.408808,0.208338,...,1,BB,1,1,0,0,INTERNET,other,0,7
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,0.060755,0.103333,0.172567,...,0,BA,1,1,1,0,INTERNET,macintosh,0,7
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,0.111658,0.084653,0.280386,...,1,BE,0,1,0,0,INTERNET,other,0,7
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,0.105389,0.427494,0.636207,...,0,BD,0,1,0,0,INTERNET,linux,1,7


### One Hot Encoding

In [16]:
# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Select categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Fit and transform the selected columns with OneHotEncoder
encoded_cols = onehot_encoder.fit_transform(X[categorical_cols]).toarray()

# Get the feature names for the new one-hot encoded columns
# feature_names = onehot_encoder.get_feature_names(categorical_cols)
feature_names = onehot_encoder.get_feature_names_out(categorical_cols)

# Create a DataFrame from the one-hot encoded columns
X_encoded = pd.DataFrame(encoded_cols, columns=feature_names)

# Replace the original columns with the one-hot encoded columns (optional)
X.drop(columns=categorical_cols, inplace=True)
X = pd.concat([X, X_encoded], axis=1)

In [17]:
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,keep_alive_session_0,keep_alive_session_1,month_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,0.157934,0.785651,0.798218,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,0.247350,0.556307,0.541631,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,0.163308,0.274904,0.508333,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,0.519779,0.864767,0.664714,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,0.349007,0.460265,0.465935,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,0.078967,0.408808,0.208338,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,0.060755,0.103333,0.172567,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,0.111658,0.084653,0.280386,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,0.105389,0.427494,0.636207,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Train Test Split

In [18]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)


In [19]:
X_train.shape, X_test.shape

((800000, 64), (200000, 64))

### Feature Selection

In [None]:
#from sklearn.linear_model import Lasso
#import pandas as pd

# Initialize the Lasso model with a chosen alpha
#lasso = Lasso(alpha=0.001)  # Adjust the alpha value as needed

# Fit the Lasso model to the training data
#lasso.fit(X_train, y_train)

# Get the coefficients and corresponding feature names
#feature_names = X_train.columns
#lasso_coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})

# Filter the features with non-zero coefficients
#selected_features = lasso_coefficients[lasso_coefficients['Coefficient'] != 0]['Feature']
#print(selected_features)

#### RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Initialize the estimator (e.g., Logistic Regression)
estimator = LogisticRegression()

# Initialize RFE with the estimator and number of features to select
rfe = RFE(estimator, n_features_to_select=35)  # Adjust number of features as needed

# Fit RFE 
rfe.fit(X_train, y_train)

# Get selected features
selected_features = X_train.columns[rfe.support_]

print(selected_features)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'zip_count_4w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA',
       'payment_type_AB', 'payment_type_AD', 'payment_type_AE',
       'employment_status_CB', 'employment_status_CD', 'employment_status_CE',
       'employment_status_CF', 'email_is_free_0', 'email_is_free_1',
       'housing_status_BA', 'phone_home_valid_1', 'phone_mobile_valid_0',
       'phone_mobile_valid_1', 'has_other_cards_1', 'foreign_request_0',
       'foreign_request_1', 'source_INTERNET', 'source_TELEAPP',
       'device_os_linux', 'device_os_other', 'device_os_windows',
       'keep_alive_session_1', 'month_3'],
      dtype='object')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# #the top 35 features (idk a rough gauge)
# columns_to_keep = ['income', 'name_email_similarity', 'prev_address_months_count',
#        'customer_age', 'days_since_request', 'intended_balcon_amount',
#        'zip_count_4w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
#        'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA',
#        'payment_type_AB', 'payment_type_AD', 'payment_type_AE',
#        'employment_status_CB', 'employment_status_CD', 'employment_status_CE',
#        'employment_status_CF', 'email_is_free_0', 'email_is_free_1',
#        'housing_status_BA', 'phone_home_valid_1', 'phone_mobile_valid_0',
#        'phone_mobile_valid_1', 'has_other_cards_1', 'foreign_request_0',
#        'foreign_request_1', 'source_INTERNET', 'source_TELEAPP',
#        'device_os_linux', 'device_os_other', 'device_os_windows',
#        'keep_alive_session_1', 'month_3']
# X = X[columns_to_keep]

In [None]:
# X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,date_of_birth_distinct_emails_4w,credit_risk_score,proposed_credit_limit,...,has_other_cards_1,foreign_request_0,foreign_request_1,source_INTERNET,source_TELEAPP,device_os_linux,device_os_other,device_os_windows,keep_alive_session_1,month_3
0,0.250,0.986507,0.000000,0.375,0.000086,0.918255,0.157934,0.128205,0.595707,0.685864,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.875,0.617426,0.000000,0.125,0.000129,0.114260,0.247350,0.461538,0.579606,0.685864,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.875,0.996708,0.026042,0.375,0.000157,0.109273,0.163308,0.282051,0.463327,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.625,0.475100,0.031250,0.250,0.000089,0.106372,0.519779,0.333333,0.465116,0.005236,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.000,0.842307,0.000000,0.375,0.073195,0.487853,0.349007,0.153846,0.466905,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.250,0.000654,0.114442,0.078967,0.205128,0.849732,0.685864,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
999996,1.000,0.824545,0.000000,0.250,0.000122,0.120937,0.060755,0.128205,0.724508,0.424084,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
999997,0.875,0.140890,0.000000,0.000,0.000756,0.514763,0.111658,0.076923,0.652952,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
999998,1.000,0.002479,0.138021,0.250,0.000298,0.110650,0.105389,0.205128,0.568873,0.005236,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


#### LASSO with CV

In [None]:
%%time
#takes very very long to run
#optimize a logistic function with a L1 penalty -> use logistic regression estimator with L1 penalty
from sklearn.linear_model import LogisticRegressionCV

# Create L1 regularized logistic regression
l1_model = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5)
l1_model.fit(X_train, y_train)

# Get coefficients and feature names
coef_l1 = l1_model.coef_[0]
coef_df_l1 = pd.DataFrame({
   'Feature': feature_names,
   'Coefficient': coef_l1,
})

# Filter non-zero coefficients
selected_features_l1 = coef_df_l1[coef_df_l1['Coefficient'] != 0]['Feature']

print("Selected Features with L1 Regularization:", selected_features_l1)


#### Forward Stepwise Selection

In [None]:
import statsmodels.api as sm

def forward_selection_logistic(X, y, alpha=0.05):
    selected_features = []
    best_pvalue = 1

    while True:
        if len(selected_features) == len(X.columns):
            break

        candidate_features = [feature for feature in X.columns if feature not in selected_features]
        pvalues = []

        for feature in candidate_features:
            current_features = selected_features + [feature]
            X_subset = sm.add_constant(X[current_features])
            model = sm.Logit(y, X_subset).fit(disp=0)
            pvalue = model.pvalues[feature]
            pvalues.append((feature, pvalue))

        best_candidate, best_candidate_pvalue = min(pvalues, key=lambda x: x[1])

        if best_candidate_pvalue < alpha:
            selected_features.append(best_candidate)
            best_pvalue = best_candidate_pvalue
        else:
            break

    return selected_features

# Assuming X is your feature DataFrame and y is the target Series or DataFrame
selected_features_logistic = forward_selection_logistic(X_train, y_train)
print("Selected Features using Logistic Regression and p-values:", selected_features_logistic) #40
#Selected Features using Logistic Regression and p-values: ['income', 'customer_age', 'credit_risk_score', 'device_distinct_emails_8w', 
# 'housing_status_BA', 'phone_home_valid_0', 'has_other_cards_0', 'device_os_windows', 'keep_alive_session_1', 'name_email_similarity', 
# 'email_is_free_1', 'payment_type_AC', 'prev_address_months_count', 'month_3', 'device_os_macintosh', 'employment_status_CB', 
# 'bank_months_count', 'month_4', 'proposed_credit_limit', 'date_of_birth_distinct_emails_4w', 'zip_count_4w', 'foreign_request_1', 
# 'intended_balcon_amount', 'employment_status_CF', 'phone_mobile_valid_0', 'month_7', 'device_os_linux', 'employment_status_CD', 
# 'employment_status_CE', 'days_since_request', 'source_TELEAPP', 'housing_status_BE', 'month_2', 'employment_status_CA', 'month_1',
# 'velocity_24h', 'bank_branch_count_8w', 'device_os_other', 'housing_status_BD', 'payment_type_AA']


In [None]:
forward_features_selected = ['income', 'customer_age', 'credit_risk_score', 'device_distinct_emails_8w', 
'housing_status_BA', 'phone_home_valid_0', 'has_other_cards_0', 'device_os_windows', 'keep_alive_session_1', 'name_email_similarity', 
'email_is_free_1', 'payment_type_AC', 'prev_address_months_count', 'month_3', 'device_os_macintosh', 'employment_status_CB', 
'bank_months_count', 'month_4', 'proposed_credit_limit', 'date_of_birth_distinct_emails_4w', 'zip_count_4w', 'foreign_request_1', 
'intended_balcon_amount', 'employment_status_CF', 'phone_mobile_valid_0', 'month_7', 'device_os_linux', 'employment_status_CD', 
'employment_status_CE', 'days_since_request', 'source_TELEAPP', 'housing_status_BE', 'month_2', 'employment_status_CA', 'month_1',
'velocity_24h', 'bank_branch_count_8w', 'device_os_other', 'housing_status_BD', 'payment_type_AA']

#### Backward Stepwise Selection

In [21]:
#iteratively fits a Logistic Regression model & removes the feature with the highest p-value (least significant) in each iteration.
#the process continues until the maximum p-value is less than the p_threshold, which is set to 0.05 by default.
import statsmodels.api as sm

def backward_stepwise_selection(X, y, p_threshold=0.05):
    features = X.columns.tolist()
    num_features = len(features)
    
    for i in range(num_features, 0, -1):
        model = sm.Logit(y, X[features]).fit()
        p_values = model.pvalues
        max_p_value = p_values.max()
        if max_p_value > p_threshold:
            remove_feature = p_values.idxmax()
            print(f"Removing '{remove_feature}' with p-value: {max_p_value:.4f}")
            features.remove(remove_feature)
        else:
            break
            
    return features

backward_features_selected = backward_stepwise_selection(X_train, y_train)
print("Selected Features:", backward_features_selected) #46
# ['income', 'name_email_similarity', 'prev_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_24h', 
# 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA', 
# 'payment_type_AB', 'payment_type_AD', 'employment_status_CA', 'employment_status_CC', 'employment_status_CE', 'employment_status_CF', 'email_is_free_0', 'housing_status_BA', 
# 'housing_status_BB', 'housing_status_BC', 'housing_status_BE', 'phone_home_valid_1', 'phone_mobile_valid_0', 'phone_mobile_valid_1', 'has_other_cards_0', 'foreign_request_1', 
# 'source_INTERNET', 'device_os_linux', 'device_os_macintosh', 'device_os_other', 'device_os_x11', 'keep_alive_session_1', 'month_1', 'month_2', 'month_3', 'month_4', 'month_7']


Selected Features: ['income', 'name_email_similarity', 'prev_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_24h', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AB', 'payment_type_AC', 'payment_type_AD', 'employment_status_CA', 'employment_status_CB', 'employment_status_CD', 'employment_status_CE', 'employment_status_CF', 'email_is_free_0', 'housing_status_BA', 'housing_status_BB', 'housing_status_BC', 'housing_status_BE', 'housing_status_BF', 'phone_home_valid_1', 'phone_mobile_valid_1', 'has_other_cards_1', 'foreign_request_0', 'source_INTERNET', 'source_TELEAPP', 'device_os_linux', 'device_os_macintosh', 'device_os_other', 'device_os_x11', 'keep_alive_session_1', 'month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6']


In [22]:
# backward_features_selected = ['income', 'name_email_similarity', 'prev_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_24h', 
# 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA', 
# 'payment_type_AB', 'payment_type_AD', 'employment_status_CA', 'employment_status_CC', 'employment_status_CE', 'employment_status_CF', 'email_is_free_0', 'housing_status_BA', 
# 'housing_status_BB', 'housing_status_BC', 'housing_status_BE', 'phone_home_valid_1', 'phone_mobile_valid_0', 'phone_mobile_valid_1', 'has_other_cards_0', 'foreign_request_1', 
# 'source_INTERNET', 'device_os_linux', 'device_os_macintosh', 'device_os_other', 'device_os_x11', 'keep_alive_session_1', 'month_1', 'month_2', 'month_3', 'month_4', 'month_7']

backward_features_selected = ['income', 'name_email_similarity', 'prev_address_months_count', 'customer_age', 
 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_24h', 
 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 
 'bank_months_count', 'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AB', 
 'payment_type_AC', 'payment_type_AD', 'employment_status_CA', 'employment_status_CB', 'employment_status_CD', 
 'employment_status_CE', 'employment_status_CF', 'email_is_free_0', 'housing_status_BA', 'housing_status_BB', 
 'housing_status_BC', 'housing_status_BE', 'housing_status_BF', 'phone_home_valid_1', 'phone_mobile_valid_1', 
 'has_other_cards_1', 'foreign_request_0', 'source_INTERNET', 'source_TELEAPP', 'device_os_linux', 'device_os_macintosh', 
 'device_os_other', 'device_os_x11', 'keep_alive_session_1', 'month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6']

In [23]:
len(backward_features_selected)

46

Chosen feature selection method: Backward Feature Selection

In [25]:
X_train = X_train[backward_features_selected]
X_test = X_test[backward_features_selected]

## Resampling

In [26]:
ratio = y.fraud_bool.value_counts() / len(y) * 100
print(f'% of non-fraud class in y: {round(ratio[0],3)}%\n% of fraud class in y: {round(ratio[1],3)}%\n')

ratio_train = y_train.fraud_bool.value_counts() / len(y_train) * 100
print(f'% of non-fraud class in y_train: {round(ratio_train[0],3)}%\n% of fraud class in y_train: {round(ratio_train[1],3)}%\n')

ratio_test = y_test.fraud_bool.value_counts() / len(y_test) * 100
print(f'% of non-fraud class in y_test: {round(ratio_test[0],3)}%\n% of fraud class in y_test: {round(ratio_test[1],3)}%')


% of non-fraud class in y: 98.897%
% of fraud class in y: 1.103%

% of non-fraud class in y_train: 98.885%
% of fraud class in y_train: 1.115%

% of non-fraud class in y_test: 98.946%
% of fraud class in y_test: 1.054%


In [27]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 64 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   income                            1000000 non-null  float64
 1   name_email_similarity             1000000 non-null  float64
 2   prev_address_months_count         1000000 non-null  float64
 3   current_address_months_count      1000000 non-null  float64
 4   customer_age                      1000000 non-null  float64
 5   days_since_request                1000000 non-null  float64
 6   intended_balcon_amount            1000000 non-null  float64
 7   zip_count_4w                      1000000 non-null  float64
 8   velocity_6h                       1000000 non-null  float64
 9   velocity_24h                      1000000 non-null  float64
 10  velocity_4w                       1000000 non-null  float64
 11  bank_branch_count_8w              1000

### Individual Resample

#### Random Undersample

In [28]:
%%time
from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import SMOTE

desired_majority_size = int(0.5 * len(X_train))  # 50% of the original majority class size

# Initialize RandomUnderSampler to undersample the majority class to 50%
under_sampler = RandomUnderSampler(sampling_strategy={0: desired_majority_size}, random_state=42)

# Apply RandomUnderSampler
Xt_resampled_under, yt_resampled_under = under_sampler.fit_resample(X_train, y_train)

# Initialize SMOTE to oversample the minority class to match the majority class
# smote = SMOTE(sampling_strategy='auto', random_state=42)

# # Apply SMOTE on the undersampled data
# Xt_resampled, yt_resampled = smote.fit_resample(Xt_resampled_under, yt_resampled_under)

tmp = yt_resampled_under.fraud_bool.value_counts() / len(yt_resampled_under) * 100
print(f'% of non-fraud class in resampled data: {round(tmp[0],3)}%\n% of fraud class in resampled data: {round(tmp[1],3)}%')

% of non-fraud class in resampled data: 97.819%
% of fraud class in resampled data: 2.181%
CPU times: total: 203 ms
Wall time: 904 ms


#### Tomek Links

In [29]:
%%time
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
Xt_resampled_tl, yt_resampled_tl = tl.fit_resample(X_train, y_train)

ratio_tl = yt_resampled_tl.fraud_bool.value_counts() / len(yt_resampled_tl) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_tl[0],3)}%\n% of fraud class in resampled data: {round(ratio_tl[1],3)}%')

% of non-fraud class in resampled data: 98.88%
% of fraud class in resampled data: 1.12%
CPU times: total: 1h 20min 54s
Wall time: 26min 39s


#### SMOTE

In [30]:
%%time
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42, sampling_strategy = 0.666) #ratio of minority:majority 40:60

Xt_resampled_SMOTE, yt_resampled_SMOTE = smote.fit_resample(X_train, y_train)

ratio_SMOTE = yt_resampled_SMOTE.fraud_bool.value_counts() / len(yt_resampled_SMOTE) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_SMOTE[0],3)}%\n% of fraud class in resampled data: {round(ratio_SMOTE[1],3)}%')

% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%
CPU times: total: 2.45 s
Wall time: 2.64 s


In [31]:
X_train.columns

Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'zip_count_4w', 'velocity_24h', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'bank_months_count', 'proposed_credit_limit',
       'device_distinct_emails_8w', 'payment_type_AB', 'payment_type_AC',
       'payment_type_AD', 'employment_status_CA', 'employment_status_CB',
       'employment_status_CD', 'employment_status_CE', 'employment_status_CF',
       'email_is_free_0', 'housing_status_BA', 'housing_status_BB',
       'housing_status_BC', 'housing_status_BE', 'housing_status_BF',
       'phone_home_valid_1', 'phone_mobile_valid_1', 'has_other_cards_1',
       'foreign_request_0', 'source_INTERNET', 'source_TELEAPP',
       'device_os_linux', 'device_os_macintosh', 'device_os_other',
       'device_os_x11', 'keep_alive_session_1', 'month_0', 'month_1',
       'month_2', 'month_3', 'month_

#### Evaluate Individual Resampling Methods

In [32]:
accuracies = {}
f2_scores = {}
f15_scores = {}
f1_scores = {}
recall_scores = {}
precision_scores = {}
class_reports = {}
pr_auc = {}
pr_auc_pts = {}
tpr = {}
fnr = {}

In [33]:
#models 
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# logistic = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
# svm = SVC(kernel='linear', random_state=42)
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, fbeta_score, f1_score, average_precision_score, precision_recall_curve, confusion_matrix
def evaluate_results(model,resampler,x_resampled, y_resampled):

    model.fit(x_resampled, y_resampled)

    y_pred_test = model.predict(X_test)

    accuracies[resampler] = accuracy_score(y_test, y_pred_test)
    class_reports[resampler] = classification_report(y_test, y_pred_test)
    recall_scores[resampler] = recall_score(y_test, y_pred_test)
    precision_scores[resampler] = precision_score(y_test, y_pred_test)
    f2_scores[resampler] = fbeta_score(y_test, y_pred_test, beta =2)
    f15_scores[resampler] = fbeta_score(y_test, y_pred_test, beta =1.5)
    f1_scores[resampler] = f1_score(y_test, y_pred_test)
    pr_auc[resampler] = average_precision_score(y_test, y_pred_test)
    pr_auc_pts[resampler] = precision_recall_curve(y_test, y_pred_test)
    
    cm = confusion_matrix(y_test, y_pred_test, labels=[0,1])
    TN, FP, FN, TP = cm.ravel()
    # TP = np.diag(cm).astype(float)
    # FN = (cm.sum(axis=1) -np.diag(cm)).astype(float)
    TPR = TP/(TP+FN)
    FNR = FN/(TP+FN)
    tpr[resampler] = TPR
    fnr[resampler] = FNR
    # print(cm)
    # print(TP,FN)
    # print(TPR, FNR)

    print(f"{resampler} Model Performance on Test Data:")
    print(f"{resampler} Accuracy:", accuracies[resampler])
    print(f"{resampler} Precision: {precision_scores[resampler]}")
    print(f"{resampler} Recall: {recall_scores[resampler]}")
    print(f"{resampler} F2: {f2_scores[resampler]}")
    print(f"{resampler} F1.5: {f15_scores[resampler]}")
    print(f"{resampler} F1: {f1_scores[resampler]}")
    print(f"{resampler} PR-AUC: {pr_auc[resampler]}")
    print(f"{resampler} Classification Report: \n{class_reports[resampler]}")
    # print(classification_report(y_test, y_pred_test),'\n')

In [35]:
evaluate_results(dt,"Original Dataset",X_train, y_train)
# evaluate_results(dt,"Random Undersample",Xt_resampled_under, yt_resampled_under)
# evaluate_results(dt,"Tomek Links",Xt_resampled_tl, yt_resampled_tl)
evaluate_results(dt,"SMOTE",Xt_resampled_SMOTE, yt_resampled_SMOTE)

Original Dataset Model Performance on Test Data:
Original Dataset Accuracy: 0.97801
Original Dataset Precision: 0.07214953271028038
Original Dataset Recall: 0.09151256519677572
Original Dataset F2: 0.08685086850868509
Original Dataset F1.5: 0.0845321923115798
Original Dataset F1: 0.08068561872909699
Original Dataset PR-AUC: 0.016182588816066434
Original Dataset Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    197891
           1       0.07      0.09      0.08      2109

    accuracy                           0.98    200000
   macro avg       0.53      0.54      0.53    200000
weighted avg       0.98      0.98      0.98    200000

SMOTE Model Performance on Test Data:
SMOTE Accuracy: 0.97473
SMOTE Precision: 0.06473544191545966
SMOTE Recall: 0.10384068278805121
SMOTE F2: 0.09264743210085456
SMOTE F1.5: 0.08756497401039584
SMOTE F1: 0.0797523670793882
SMOTE PR-AUC: 0.01617217248908756
SMOTE Classification Report

### Combined Undersample + Oversample

#### Undersampling methods + SMOTE

In [42]:
def undersample_smote(undersampler, X_undersampled, y_undersampled):
    smote = SMOTE(random_state=21, sampling_strategy = 0.666)

    # Apply SMOTE on the undersampled data
    Xt_resampled, yt_resampled = smote.fit_resample(X_undersampled, y_undersampled)

    tmp = yt_resampled.fraud_bool.value_counts() / len(yt_resampled) * 100
    print(f'{undersampler}:\n% of non-fraud class in resampled data: {round(tmp[0],3)}%\n% of fraud class in resampled data: {round(tmp[1],3)}%')
    
    evaluate_results(dt, undersampler+' + SMOTE',Xt_resampled, yt_resampled)

In [37]:
undersample_smote('Random Undersample',Xt_resampled_under, yt_resampled_under)

Random Undersample:
% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%
Random Undersample + SMOTE Model Performance on Test Data:
Random Undersample + SMOTE Accuracy: 0.96211
Random Undersample + SMOTE Precision: 0.05471421592574499
Random Undersample + SMOTE Recall: 0.1593172119487909
Random Undersample + SMOTE F2: 0.11525005145091582
Random Undersample + SMOTE F1.5: 0.10031002411298656
Random Undersample + SMOTE F1: 0.08145454545454546
Random Undersample + SMOTE PR-AUC: 0.017581916335253828
Random Undersample + SMOTE Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.97      0.98    197891
           1       0.05      0.16      0.08      2109

    accuracy                           0.96    200000
   macro avg       0.52      0.56      0.53    200000
weighted avg       0.98      0.96      0.97    200000



In [38]:
undersample_smote('Tomek Links', Xt_resampled_tl, yt_resampled_tl)

Tomek Links:
% of non-fraud class in resampled data: 60.024%
% of fraud class in resampled data: 39.976%
Tomek Links + SMOTE Model Performance on Test Data:
Tomek Links + SMOTE Accuracy: 0.974065
Tomek Links + SMOTE Precision: 0.06818181818181818
Tomek Links + SMOTE Recall: 0.11522048364153627
Tomek Links + SMOTE F2: 0.10125
Tomek Links + SMOTE F1.5: 0.0950446791226645
Tomek Links + SMOTE F1: 0.08566895822316235
Tomek Links + SMOTE PR-AUC: 0.017185942066468382
Tomek Links + SMOTE Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    197891
           1       0.07      0.12      0.09      2109

    accuracy                           0.97    200000
   macro avg       0.53      0.55      0.54    200000
weighted avg       0.98      0.97      0.98    200000



### All Results

In [39]:
metrics_names = ['Accuracy', 'Recall','Precision', 'F2 Score', 'F1.5 Score','F1 Score', 'TPR','FNR', "PR-AUC"]
results = pd.DataFrame(index= metrics_names,columns=accuracies.keys())
all_results_list = [accuracies, recall_scores, precision_scores, f2_scores,f15_scores,f1_scores, tpr, fnr, pr_auc]
for i in range(len(all_results_list)):
    for k,v in all_results_list[i].items():
        results.loc[metrics_names[i], str(k)] = v
        
results

Unnamed: 0,Original Dataset,SMOTE,Random Undersample + SMOTE,Tomek Links + SMOTE
Accuracy,0.97801,0.97473,0.96211,0.974065
Recall,0.091513,0.103841,0.159317,0.11522
Precision,0.07215,0.064735,0.054714,0.068182
F2 Score,0.086851,0.092647,0.11525,0.10125
F1.5 Score,0.084532,0.087565,0.10031,0.095045
F1 Score,0.080686,0.079752,0.081455,0.085669
TPR,0.091513,0.103841,0.159317,0.11522
FNR,0.908487,0.896159,0.840683,0.88478
PR-AUC,0.016183,0.016172,0.017582,0.017186


In [40]:
for k,v in class_reports.items():
    print(k,':\n', v,'\n')

Original Dataset :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    197891
           1       0.07      0.09      0.08      2109

    accuracy                           0.98    200000
   macro avg       0.53      0.54      0.53    200000
weighted avg       0.98      0.98      0.98    200000
 

SMOTE :
               precision    recall  f1-score   support

           0       0.99      0.98      0.99    197891
           1       0.06      0.10      0.08      2109

    accuracy                           0.97    200000
   macro avg       0.53      0.54      0.53    200000
weighted avg       0.98      0.97      0.98    200000
 

Random Undersample + SMOTE :
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    197891
           1       0.05      0.16      0.08      2109

    accuracy                           0.96    200000
   macro avg       0.52      0.56      0.53    200000
weighted av

In [41]:
# import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(10, 5))

# # iterate over the function list and add a subplot for each function
# for idx, x in enumerate(pr_auc_pts.items(), start=1):  
#     resampler = x[0]
#     v = x[1]
#     ax = fig.add_subplot(3, 3, idx) # plot with 2 rows and 3 columns
#     ax.plot(v[0],v[1])
#     ax.set_title(resampler)
#     ax.set_ylabel('Recall')
#     ax.set_xlabel('Precision')
    

# # add spacing between subplots
# fig.tight_layout()