In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold

In [8]:
base = pd.read_csv("Base.csv")

In [9]:
base.dtypes

fraud_bool                            int64
income                              float64
name_email_similarity               float64
prev_address_months_count             int64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                 

In [10]:
base["payment_type"].unique()

array(['AA', 'AD', 'AB', 'AC', 'AE'], dtype=object)

In [11]:
base.groupby("fraud_bool")["fraud_bool"].count()

fraud_bool
0    988971
1     11029
Name: fraud_bool, dtype: int64

In [12]:
base

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.475100,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0.8,0.124690,-1,143,30,0.051348,-0.826239,AB,530,...,0,1500.0,0,INTERNET,16.967770,other,0,1,0,7
999996,0,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,...,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,...,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0,0.9,0.002480,52,3,30,0.023357,-1.313387,AB,707,...,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7


## Feature Engineering

In [13]:
X = base.loc[ : , base.columns != 'fraud_bool']

In [14]:
y = base[['fraud_bool']]

In [15]:
X.shape, y.shape

((1000000, 31), (1000000, 1))

### Scaling

In [16]:
# categorical columns with numeric dtype (Either binary or month)
binary_cols = [ 'email_is_free', 'phone_home_valid', 'phone_mobile_valid','has_other_cards', 'foreign_request','keep_alive_session','month']
# change dtype to object
X[binary_cols] = X[binary_cols].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[binary_cols] = X[binary_cols].astype(object)


In [17]:
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,13096.035018,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,9223.283431,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,4471.472149,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0.6,0.475100,11,14,30,0.006991,-1.863101,AB,3483,14431.993621,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,7601.511579,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.8,0.124690,-1,143,30,0.051348,-0.826239,AB,530,6732.602414,...,0,1500.0,0,INTERNET,16.967770,other,0,1,0,7
999996,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,1574.293294,...,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,1258.864938,...,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0.9,0.002480,52,3,30,0.023357,-1.313387,AB,707,7048.137128,...,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7


In [18]:
numeric_cols = X.select_dtypes(include=['int', 'float']).columns

In [19]:
#Feature engineering - scaling
scaler = MinMaxScaler()

# Apply Min-Max Scaling to all numeric columns in X
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,AA,0.157934,0.785651,...,0,0.685864,0,INTERNET,0.198216,linux,1,0.666667,0.0,0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,AD,0.247350,0.556307,...,0,0.685864,0,INTERNET,0.050217,other,1,0.666667,0.0,0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,AB,0.163308,0.274904,...,0,0.005236,0,INTERNET,0.273082,windows,0,0.666667,0.0,0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,AB,0.519779,0.864767,...,0,0.005236,0,INTERNET,0.186605,linux,1,0.666667,0.0,0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,AA,0.349007,0.460265,...,0,0.005236,0,INTERNET,0.054581,other,0,0.666667,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,AB,0.078967,0.408808,...,0,0.685864,0,INTERNET,0.206766,other,0,0.666667,0.0,7
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,AC,0.060755,0.103333,...,1,0.424084,0,INTERNET,0.028816,macintosh,0,0.666667,0.0,7
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,AA,0.111658,0.084653,...,0,0.005236,0,INTERNET,0.196418,other,0,0.666667,0.0,7
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,AB,0.105389,0.427494,...,0,0.005236,0,INTERNET,0.027373,linux,1,0.666667,0.0,7


### Pearson Correlation - detect mutlicollinearity

In [20]:
import pandas as pd
import numpy as np

# Calculate the Pearson correlation coefficients
correlation_matrix = X.corr(method='pearson')

# Get the absolute values of the correlation coefficients
correlation_matrix_abs = correlation_matrix.abs()

# Select upper triangle of correlation matrix (excluding diagonal)
upper_triangle = correlation_matrix_abs.where(
    np.triu(np.ones(correlation_matrix_abs.shape), k=1).astype(np.bool))

# Find features with correlation greater than a threshold
threshold = 0.85  # Example threshold
high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

# Print highly correlated features
print("Highly Correlated Features:")
print(high_correlation_features)     #None

  correlation_matrix = X.corr(method='pearson')


Highly Correlated Features:
[]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.triu(np.ones(correlation_matrix_abs.shape), k=1).astype(np.bool))


### Variance Threshold

In [21]:
vt = VarianceThreshold(threshold=0.0)

# Fit the VarianceThreshold to the selected numeric columns in X
vt.fit(X[numeric_cols])

# Get the boolean mask of selected numeric features
selected_numeric_mask = vt.get_support()

# Get the names of the selected numeric features
selected_numeric_features = X[numeric_cols].columns[selected_numeric_mask]

# Filter X to keep only selected numeric features
X_numeric_selected = X[selected_numeric_features]
#device_fraud_count is dropped 

# Get the remaining categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Combine selected numeric features with categorical columns
X_selected = pd.concat([X_numeric_selected, X[categorical_cols]], axis=1)
X = X_selected
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,has_other_cards,foreign_request,source,device_os,keep_alive_session,month
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,0.157934,0.785651,0.798218,...,1,BC,0,1,0,0,INTERNET,linux,1,0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,0.247350,0.556307,0.541631,...,1,BC,1,1,0,0,INTERNET,other,1,0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,0.163308,0.274904,0.508333,...,1,BC,0,1,0,0,INTERNET,windows,0,0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,0.519779,0.864767,0.664714,...,1,BC,0,1,0,0,INTERNET,linux,1,0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,0.349007,0.460265,0.465935,...,0,BC,1,1,0,0,INTERNET,other,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,0.078967,0.408808,0.208338,...,1,BB,1,1,0,0,INTERNET,other,0,7
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,0.060755,0.103333,0.172567,...,0,BA,1,1,1,0,INTERNET,macintosh,0,7
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,0.111658,0.084653,0.280386,...,1,BE,0,1,0,0,INTERNET,other,0,7
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,0.105389,0.427494,0.636207,...,0,BD,0,1,0,0,INTERNET,linux,1,7


### One Hot Encoding

In [22]:
# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Select categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Fit and transform the selected columns with OneHotEncoder
encoded_cols = onehot_encoder.fit_transform(X[categorical_cols]).toarray()

# Get the feature names for the new one-hot encoded columns
# feature_names = onehot_encoder.get_feature_names(categorical_cols)
feature_names = onehot_encoder.get_feature_names_out(categorical_cols)

# Create a DataFrame from the one-hot encoded columns
X_encoded = pd.DataFrame(encoded_cols, columns=feature_names)

# Replace the original columns with the one-hot encoded columns (optional)
X.drop(columns=categorical_cols, inplace=True)
X = pd.concat([X, X_encoded], axis=1)

In [23]:
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,keep_alive_session_0,keep_alive_session_1,month_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7
0,0.250,0.986507,0.000000,0.060606,0.375,0.000086,0.918255,0.157934,0.785651,0.798218,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.875,0.617426,0.000000,0.209790,0.125,0.000129,0.114260,0.247350,0.556307,0.541631,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.875,0.996708,0.026042,0.034965,0.375,0.000157,0.109273,0.163308,0.274904,0.508333,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.625,0.475100,0.031250,0.034965,0.250,0.000089,0.106372,0.519779,0.864767,0.664714,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.000,0.842307,0.000000,0.069930,0.375,0.073195,0.487853,0.349007,0.460265,0.465935,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.335664,0.250,0.000654,0.114442,0.078967,0.408808,0.208338,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999996,1.000,0.824545,0.000000,0.452214,0.250,0.000122,0.120937,0.060755,0.103333,0.172567,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999997,0.875,0.140890,0.000000,0.473193,0.000,0.000756,0.514763,0.111658,0.084653,0.280386,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
999998,1.000,0.002479,0.138021,0.009324,0.250,0.000298,0.110650,0.105389,0.427494,0.636207,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Feature Selection

In [24]:
#from sklearn.linear_model import Lasso
#import pandas as pd

# Initialize the Lasso model with a chosen alpha
#lasso = Lasso(alpha=0.001)  # Adjust the alpha value as needed

# Fit the Lasso model to the training data
#lasso.fit(X_train, y_train)

# Get the coefficients and corresponding feature names
#feature_names = X_train.columns
#lasso_coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso.coef_})

# Filter the features with non-zero coefficients
#selected_features = lasso_coefficients[lasso_coefficients['Coefficient'] != 0]['Feature']
#print(selected_features)

In [25]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Initialize the estimator (e.g., Logistic Regression)
estimator = LogisticRegression()

# Initialize RFE with the estimator and number of features to select
rfe = RFE(estimator, n_features_to_select=35)  # Adjust number of features as needed

# Fit RFE 
rfe.fit(X, y)

# Get selected features
selected_features = X.columns[rfe.support_]

print(selected_features)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

Index(['income', 'name_email_similarity', 'prev_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'zip_count_4w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA',
       'payment_type_AB', 'payment_type_AD', 'payment_type_AE',
       'employment_status_CB', 'employment_status_CD', 'employment_status_CE',
       'employment_status_CF', 'email_is_free_0', 'email_is_free_1',
       'housing_status_BA', 'phone_home_valid_1', 'phone_mobile_valid_0',
       'phone_mobile_valid_1', 'has_other_cards_1', 'foreign_request_0',
       'foreign_request_1', 'source_INTERNET', 'source_TELEAPP',
       'device_os_linux', 'device_os_other', 'device_os_windows',
       'keep_alive_session_1', 'month_3'],
      dtype='object')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
#the top 35 features (idk a rough gauge)
columns_to_keep = ['income', 'name_email_similarity', 'prev_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'zip_count_4w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score',
       'proposed_credit_limit', 'device_distinct_emails_8w', 'payment_type_AA',
       'payment_type_AB', 'payment_type_AD', 'payment_type_AE',
       'employment_status_CB', 'employment_status_CD', 'employment_status_CE',
       'employment_status_CF', 'email_is_free_0', 'email_is_free_1',
       'housing_status_BA', 'phone_home_valid_1', 'phone_mobile_valid_0',
       'phone_mobile_valid_1', 'has_other_cards_1', 'foreign_request_0',
       'foreign_request_1', 'source_INTERNET', 'source_TELEAPP',
       'device_os_linux', 'device_os_other', 'device_os_windows',
       'keep_alive_session_1', 'month_3']
X = X[columns_to_keep]

In [27]:
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,date_of_birth_distinct_emails_4w,credit_risk_score,proposed_credit_limit,...,has_other_cards_1,foreign_request_0,foreign_request_1,source_INTERNET,source_TELEAPP,device_os_linux,device_os_other,device_os_windows,keep_alive_session_1,month_3
0,0.250,0.986507,0.000000,0.375,0.000086,0.918255,0.157934,0.128205,0.595707,0.685864,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.875,0.617426,0.000000,0.125,0.000129,0.114260,0.247350,0.461538,0.579606,0.685864,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.875,0.996708,0.026042,0.375,0.000157,0.109273,0.163308,0.282051,0.463327,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.625,0.475100,0.031250,0.250,0.000089,0.106372,0.519779,0.333333,0.465116,0.005236,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.000,0.842307,0.000000,0.375,0.073195,0.487853,0.349007,0.153846,0.466905,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.875,0.124689,0.000000,0.250,0.000654,0.114442,0.078967,0.205128,0.849732,0.685864,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
999996,1.000,0.824545,0.000000,0.250,0.000122,0.120937,0.060755,0.128205,0.724508,0.424084,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
999997,0.875,0.140890,0.000000,0.000,0.000756,0.514763,0.111658,0.076923,0.652952,0.005236,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
999998,1.000,0.002479,0.138021,0.250,0.000298,0.110650,0.105389,0.205128,0.568873,0.005236,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


### Train Test Split

In [28]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=109) # 70% training and 30% test


In [29]:
X_train.shape

(700000, 35)

## Resampling

In [30]:
ratio = y.fraud_bool.value_counts() / len(y) * 100
print(f'% of non-fraud class in y: {round(ratio[0],3)}%\n% of fraud class in y: {round(ratio[1],3)}%\n')

ratio_train = y_train.fraud_bool.value_counts() / len(y_train) * 100
print(f'% of non-fraud class in y_train: {round(ratio_train[0],3)}%\n% of fraud class in y_train: {round(ratio_train[1],3)}%\n')

ratio_test = y_test.fraud_bool.value_counts() / len(y_test) * 100
print(f'% of non-fraud class in y_test: {round(ratio_test[0],3)}%\n% of fraud class in y_test: {round(ratio_test[1],3)}%')


% of non-fraud class in y: 98.897%
% of fraud class in y: 1.103%

% of non-fraud class in y_train: 98.892%
% of fraud class in y_train: 1.108%

% of non-fraud class in y_test: 98.909%
% of fraud class in y_test: 1.091%


### Individual Resample

#### Random Undersample

In [31]:
%%time
from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import SMOTE

desired_majority_size = int(0.5 * len(X_train))  # 50% of the original majority class size

# Initialize RandomUnderSampler to undersample the majority class to 50%
under_sampler = RandomUnderSampler(sampling_strategy={0: desired_majority_size}, random_state=42)

# Apply RandomUnderSampler
Xt_resampled_under, yt_resampled_under = under_sampler.fit_resample(X_train, y_train)

# Initialize SMOTE to oversample the minority class to match the majority class
# smote = SMOTE(sampling_strategy='auto', random_state=42)

# # Apply SMOTE on the undersampled data
# Xt_resampled, yt_resampled = smote.fit_resample(Xt_resampled_under, yt_resampled_under)

tmp = yt_resampled_under.fraud_bool.value_counts() / len(yt_resampled_under) * 100
print(f'% of non-fraud class in resampled data: {round(tmp[0],3)}%\n% of fraud class in resampled data: {round(tmp[1],3)}%')

% of non-fraud class in resampled data: 97.832%
% of fraud class in resampled data: 2.168%
CPU times: user 693 ms, sys: 275 ms, total: 967 ms
Wall time: 1.05 s


#### Tomek Links

In [32]:
%%time
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
Xt_resampled_tl, yt_resampled_tl = tl.fit_resample(X_train, y_train)

ratio_tl = yt_resampled_tl.fraud_bool.value_counts() / len(yt_resampled_tl) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_tl[0],3)}%\n% of fraud class in resampled data: {round(ratio_tl[1],3)}%')

% of non-fraud class in resampled data: 98.887%
% of fraud class in resampled data: 1.113%
CPU times: user 1h 56min 28s, sys: 29.3 s, total: 1h 56min 58s
Wall time: 17min 36s


#### Cluster Centroid

can tune estimator -- default is KMeans

In [33]:
%%time
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state = 42)
Xt_resampled_cc, yt_resampled_cc = cc.fit_resample(X_train, y_train)

ratio_cc = yt_resampled_cc.fraud_bool.value_counts() / len(yt_resampled_cc) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_cc[0],3)}%\n% of fraud class in resampled data: {round(ratio_cc[1],3)}%')

KeyboardInterrupt: 

#### SMOTE

In [34]:
%%time
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

Xt_resampled_SMOTE, yt_resampled_SMOTE = smote.fit_resample(X_train, y_train)

ratio_SMOTE = yt_resampled_SMOTE.fraud_bool.value_counts() / len(yt_resampled_SMOTE) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_SMOTE[0],3)}%\n% of fraud class in resampled data: {round(ratio_SMOTE[1],3)}%')

% of non-fraud class in resampled data: 50.0%
% of fraud class in resampled data: 50.0%
CPU times: user 3.29 s, sys: 1.04 s, total: 4.33 s
Wall time: 2.63 s


#### ADASYN

In [35]:
%%time
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state=42)
Xt_resampled_adasyn, yt_resampled_adasyn = adasyn.fit_resample(X_train, y_train)

ratio_adasyn = yt_resampled_adasyn.fraud_bool.value_counts() / len(yt_resampled_adasyn) * 100
print(f'% of non-fraud class in resampled data: {round(ratio_adasyn[0],3)}%\n% of fraud class in resampled data: {round(ratio_adasyn[1],3)}%')

% of non-fraud class in resampled data: 50.026%
% of fraud class in resampled data: 49.974%
CPU times: user 1min 6s, sys: 1.39 s, total: 1min 7s
Wall time: 11.9 s


#### Evaluate Individual Resampling Methods

In [36]:
accuracies = {}
class_reports = {}

In [37]:
#models 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

logistic = LogisticRegression(random_state=42)
svm = SVC(kernel='linear', random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [38]:
from sklearn.metrics import classification_report, accuracy_score
def evaluate_results(model,resampler,x_resampled, y_resampled):

    model.fit(x_resampled, y_resampled)

    y_pred_test = model.predict(X_test)

    accuracies[resampler] = accuracy_score(y_test, y_pred_test)
    class_reports[resampler] = classification_report(y_test, y_pred_test)

    print(f"{resampler} Model Performance on Test Data:")
    print(f"{resampler} Accuracy:", accuracy_score(y_test, y_pred_test))
    print(f"{resampler} Classification Report:")
    print(classification_report(y_test, y_pred_test),'\n')

In [39]:
evaluate_results(logistic,"Random Undersample",Xt_resampled_under, yt_resampled_under)
evaluate_results(logistic,"Tomek Links",Xt_resampled_tl, yt_resampled_tl)
#evaluate_results(logistic,"Cluster Centroid",Xt_resampled_cc, yt_resampled_cc)
evaluate_results(logistic,"SMOTE",Xt_resampled_SMOTE, yt_resampled_SMOTE)
evaluate_results(logistic,"ADASYN",Xt_resampled_adasyn, yt_resampled_adasyn)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Undersample Model Performance on Test Data:
Random Undersample Accuracy: 0.9889133333333333
Random Undersample Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.42      0.04      0.07      3274

    accuracy                           0.99    300000
   macro avg       0.70      0.52      0.53    300000
weighted avg       0.98      0.99      0.98    300000
 



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Tomek Links Model Performance on Test Data:
Tomek Links Accuracy: 0.9890466666666666
Tomek Links Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.41      0.01      0.02      3274

    accuracy                           0.99    300000
   macro avg       0.70      0.50      0.51    300000
weighted avg       0.98      0.99      0.98    300000
 



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SMOTE Model Performance on Test Data:
SMOTE Accuracy: 0.80851
SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.89    296726
           1       0.04      0.78      0.08      3274

    accuracy                           0.81    300000
   macro avg       0.52      0.79      0.49    300000
weighted avg       0.99      0.81      0.88    300000
 



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ADASYN Model Performance on Test Data:
ADASYN Accuracy: 0.8012433333333333
ADASYN Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89    296726
           1       0.04      0.78      0.08      3274

    accuracy                           0.80    300000
   macro avg       0.52      0.79      0.48    300000
weighted avg       0.99      0.80      0.88    300000
 



In [None]:
#evaluate_results(svm,"Random Undersample",Xt_resampled_under, yt_resampled_under)
#evaluate_results(svm,"Tomek Links",Xt_resampled_tl, yt_resampled_tl)
#evaluate_results(svm,"Cluster Centroid",Xt_resampled_cc, yt_resampled_cc)
#evaluate_results(svm,"SMOTE",Xt_resampled_SMOTE, yt_resampled_SMOTE)
#evaluate_results(svm,"ADASYN",Xt_resampled_adasyn, yt_resampled_adasyn)

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Undersample Model Performance on Test Data:
Random Undersample Accuracy: 0.9890866666666667
Random Undersample Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.00      0.00      0.00      3274

    accuracy                           0.99    300000
   macro avg       0.49      0.50      0.50    300000
weighted avg       0.98      0.99      0.98    300000
 



  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Tomek Links Model Performance on Test Data:
Tomek Links Accuracy: 0.9890866666666667
Tomek Links Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.00      0.00      0.00      3274

    accuracy                           0.99    300000
   macro avg       0.49      0.50      0.50    300000
weighted avg       0.98      0.99      0.98    300000
 



  y = column_or_1d(y, warn=True)


In [41]:
evaluate_results(rf_classifier,"Random Undersample",Xt_resampled_under, yt_resampled_under)
evaluate_results(rf_classifier,"Tomek Links",Xt_resampled_tl, yt_resampled_tl)
#evaluate_results(rf,"Cluster Centroid",Xt_resampled_cc, yt_resampled_cc)
evaluate_results(rf_classifier,"SMOTE",Xt_resampled_SMOTE, yt_resampled_SMOTE)
evaluate_results(rf_classifier,"ADASYN",Xt_resampled_adasyn, yt_resampled_adasyn)

  model.fit(x_resampled, y_resampled)


Random Undersample Model Performance on Test Data:
Random Undersample Accuracy: 0.9891233333333334
Random Undersample Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.54      0.02      0.05      3274

    accuracy                           0.99    300000
   macro avg       0.76      0.51      0.52    300000
weighted avg       0.98      0.99      0.98    300000
 



  model.fit(x_resampled, y_resampled)


KeyboardInterrupt: 

### Combined Undersample + Oversample

#### Undersampling methods + SMOTE

In [42]:
def undersample_smote(undersampler, X_undersampled, y_undersampled):
    smote = SMOTE(sampling_strategy='auto', random_state=42)

    # Apply SMOTE on the undersampled data
    Xt_resampled, yt_resampled = smote.fit_resample(X_undersampled, y_undersampled)

    tmp = yt_resampled.fraud_bool.value_counts() / len(yt_resampled) * 100
    print(f'{undersampler}:\n% of non-fraud class in resampled data: {round(tmp[0],3)}%\n% of fraud class in resampled data: {round(tmp[1],3)}%')
    
    evaluate_results(logistic, undersampler+' + SMOTE',Xt_resampled, yt_resampled)

In [43]:
undersample_smote('Random Undersample',Xt_resampled_under, yt_resampled_under)

Random Undersample:
% of non-fraud class in resampled data: 50.0%
% of fraud class in resampled data: 50.0%


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Undersample + SMOTE Model Performance on Test Data:
Random Undersample + SMOTE Accuracy: 0.80818
Random Undersample + SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.89    296726
           1       0.04      0.78      0.08      3274

    accuracy                           0.81    300000
   macro avg       0.52      0.79      0.49    300000
weighted avg       0.99      0.81      0.88    300000
 



In [None]:
undersample_smote('Tomek Links', Xt_resampled_tl, yt_resampled_tl)

Tomek Links:
% of non-fraud class in resampled data: 50.0%
% of fraud class in resampled data: 50.0%


  y = column_or_1d(y, warn=True)


Tomek Links + SMOTE Logistic Regression Performance on Test Data:
Tomek Links + SMOTE Accuracy: 0.8151066666666666
Tomek Links + SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90    296726
           1       0.04      0.78      0.08      3274

    accuracy                           0.82    300000
   macro avg       0.52      0.80      0.49    300000
weighted avg       0.99      0.82      0.89    300000
 



In [None]:
#undersample_smote('Cluster Centroid', Xt_resampled_cc, yt_resampled_cc)

Cluster Centroid:
% of non-fraud class in resampled data: 50.0%
% of fraud class in resampled data: 50.0%


  y = column_or_1d(y, warn=True)


Cluster Centroid + SMOTE Logistic Regression Performance on Test Data:
Cluster Centroid + SMOTE Accuracy: 0.7578533333333334
Cluster Centroid + SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.76      0.86    296726
           1       0.04      0.81      0.07      3274

    accuracy                           0.76    300000
   macro avg       0.52      0.78      0.46    300000
weighted avg       0.99      0.76      0.85    300000
 



#### Undersampling methods + ADASYN

In [None]:
def undersample_adasyn(undersampler, X_undersampled, y_undersampled):
    adasyn = ADASYN(random_state=42)

    # Apply SMOTE on the undersampled data
    Xt_resampled, yt_resampled = adasyn.fit_resample(X_undersampled, y_undersampled)

    tmp = yt_resampled.fraud_bool.value_counts() / len(yt_resampled) * 100
    print(f'{undersampler}:\n% of non-fraud class in resampled data: {round(tmp[0],3)}%\n% of fraud class in resampled data: {round(tmp[1],3)}%')
    
    evaluate_results(logistic, undersampler+' + ADASYN',Xt_resampled, yt_resampled)

In [None]:
undersample_adasyn('Random Undersample',Xt_resampled_under, yt_resampled_under)

Random Undersample:
% of non-fraud class in resampled data: 49.903%
% of fraud class in resampled data: 50.097%


  y = column_or_1d(y, warn=True)


Random Undersample + ADASYN Logistic Regression Performance on Test Data:
Random Undersample + ADASYN Accuracy: 0.8046133333333333
Random Undersample + ADASYN Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.80      0.89    296726
           1       0.04      0.79      0.08      3274

    accuracy                           0.80    300000
   macro avg       0.52      0.80      0.49    300000
weighted avg       0.99      0.80      0.88    300000
 



In [None]:
undersample_adasyn('Tomek Links',Xt_resampled_tl, yt_resampled_tl)

Tomek Links:
% of non-fraud class in resampled data: 50.07%
% of fraud class in resampled data: 49.93%


  y = column_or_1d(y, warn=True)


Tomek Links + ADASYN Logistic Regression Performance on Test Data:
Tomek Links + ADASYN Accuracy: 0.8080933333333333
Tomek Links + ADASYN Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.89    296726
           1       0.04      0.78      0.08      3274

    accuracy                           0.81    300000
   macro avg       0.52      0.80      0.49    300000
weighted avg       0.99      0.81      0.88    300000
 



In [None]:
undersample_adasyn('Cluster Centroid',Xt_resampled_cc, yt_resampled_cc)

Cluster Centroid:
% of non-fraud class in resampled data: 50.0%
% of fraud class in resampled data: 50.0%


  y = column_or_1d(y, warn=True)


Cluster Centroid + ADASYN Logistic Regression Performance on Test Data:
Cluster Centroid + ADASYN Accuracy: 0.7578533333333334
Cluster Centroid + ADASYN Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.76      0.86    296726
           1       0.04      0.81      0.07      3274

    accuracy                           0.76    300000
   macro avg       0.52      0.78      0.46    300000
weighted avg       0.99      0.76      0.85    300000
 



### All Results

In [None]:
accuracies

{'Random Undersample': 0.9888933333333333,
 'Tomek Links': 0.98909,
 'Cluster Centroid': 0.7578533333333334,
 'SMOTE': 0.8162366666666667,
 'ADASYN': 0.8094833333333333,
 'Random Undersample + SMOTE': 0.8157533333333333,
 'Tomek Links + SMOTE': 0.8151066666666666,
 'Cluster Centroid + SMOTE': 0.7578533333333334,
 'Random Undersample + ADASYN': 0.8046133333333333,
 'Tomek Links + ADASYN': 0.8080933333333333,
 'Cluster Centroid + ADASYN': 0.7578533333333334}

In [None]:
for k,v in class_reports.items():
    print(k,':\n', v,'\n')

Random Undersample :
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.41      0.04      0.08      3274

    accuracy                           0.99    300000
   macro avg       0.70      0.52      0.53    300000
weighted avg       0.98      0.99      0.98    300000
 

Tomek Links :
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    296726
           1       0.51      0.01      0.02      3274

    accuracy                           0.99    300000
   macro avg       0.75      0.50      0.51    300000
weighted avg       0.98      0.99      0.98    300000
 

Cluster Centroid :
               precision    recall  f1-score   support

           0       1.00      0.76      0.86    296726
           1       0.04      0.81      0.07      3274

    accuracy                           0.76    300000
   macro avg       0.52      0.78      0.46    300000
weighted avg 