# Packages & Helper Functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Display every column when rendering a DataFrame (None removes the column cap).
pd.set_option('display.max_columns', None)

# Function to do all the preprocessing

def _haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between points given in degrees.

    Operates element-wise on scalars, NumPy arrays or pandas Series. A NaN in
    any input yields NaN for that element.
    """
    earth_radius_km = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius_km * c


def _preprocess_single(df):
    """Engineer per-transaction features for one raw transactions frame.

    The input is not modified. The returned frame keeps the original row order
    (sorted by index) and gains these columns:

    - ``trans_day_of_week`` / ``trans_month``: day and month names.
    - ``time_of_day``: minutes since midnight divided by 1440, i.e. in [0, 1).
    - ``age``: whole years between ``dob`` and the transaction date.
    - ``last_purchased_secs``: seconds since the same card's previous
      transaction (NaN for a card's first transaction in this frame).
    - ``distance_last_purchase``: haversine distance in km between this
      merchant location and the card's previous one (NaN for the first).
    - ``same_merchant_with_last_purchase``: True when the merchant equals the
      card's previous merchant (False for the first transaction).

    Identifier, address and raw location columns are dropped;
    ``trans_date_trans_time`` is kept (as datetime) for later filtering.
    """
    out = df.copy()

    out['trans_date_trans_time'] = pd.to_datetime(out['trans_date_trans_time'])
    trans_time = out['trans_date_trans_time']

    # Calendar features of the transaction timestamp.
    out['trans_day_of_week'] = trans_time.dt.day_name()
    out['trans_month'] = trans_time.dt.month_name()
    out['time_of_day'] = (trans_time.dt.hour * 60 + trans_time.dt.minute) / (24 * 60)

    # Age in whole years: subtract one when this year's birthday has not
    # happened yet on the transaction date (the boolean acts as 0/1).
    dob = pd.to_datetime(out['dob'])
    birthday_pending = (
        (trans_time.dt.month < dob.dt.month)
        | ((trans_time.dt.month == dob.dt.month) & (trans_time.dt.day < dob.dt.day))
    )
    out['age'] = trans_time.dt.year - dob.dt.year - birthday_pending

    # Order each card's transactions chronologically so that diff()/shift()
    # refer to that card's immediately preceding transaction.
    out = out.sort_values(by=['cc_num', 'trans_date_trans_time'])
    by_card = out.groupby('cc_num')
    secs_since_prev = by_card['trans_date_trans_time'].diff().dt.total_seconds()
    prev_lat = by_card['merch_lat'].shift()
    prev_long = by_card['merch_long'].shift()
    prev_merchant = by_card['merchant'].shift()

    out['last_purchased_secs'] = secs_since_prev
    out['distance_last_purchase'] = _haversine_km(
        out['merch_lat'], out['merch_long'], prev_lat, prev_long
    )
    # Comparing against the missing "previous merchant" of a first
    # transaction evaluates to False.
    out['same_merchant_with_last_purchase'] = out['merchant'] == prev_merchant

    out = out.drop(columns=[
        'city', 'street', 'last', 'first', 'job', 'cc_num', 'merchant', 'state',
        'zip', 'lat', 'long', 'merch_lat', 'merch_long', 'trans_num',
        'unix_time', 'dob',
    ])

    # Restore the original row order.
    return out.sort_index()


def preprocess_train_test(train_df, test_df):
    """Apply the same feature engineering to the raw train and test frames.

    Each frame is processed independently, so the "previous transaction"
    features never look across the train/test boundary: a card's first
    transaction within each frame has NaN history features.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Raw transaction frames containing at least the columns read or dropped
        by the preprocessing (``trans_date_trans_time``, ``dob``, ``cc_num``,
        ``merchant``, ``merch_lat``, ``merch_long``, ``unix_time``, ...).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df_preprocessed, test_df_preprocessed)``; inputs are left
        untouched.
    """
    train_df_preprocessed = _preprocess_single(train_df)
    test_df_preprocessed = _preprocess_single(test_df)

    return train_df_preprocessed, test_df_preprocessed

def process_fraud_data(train_df_preprocessed, test_df_preprocessed,
                       train_end_frac=0.64, val_end_frac=0.80):
    """Combine preprocessed train/test data and split it into train/val/test.

    Steps, in order:

    1. Concatenate the two frames (train rows first, then test rows).
    2. Drop rows whose ``last_purchased_secs`` is NaN (transactions with no
       earlier transaction on the same card).
    3. Drop every transaction dated January 2019.
    4. Drop the ``trans_date_trans_time`` column.
    5. Split positionally (no shuffling): rows before ``train_end_frac`` of
       the data go to train, rows up to ``val_end_frac`` go to validation and
       the remainder to test.

    Parameters
    ----------
    train_df_preprocessed, test_df_preprocessed : pandas.DataFrame
        Frames containing at least ``last_purchased_secs`` and a datetime
        ``trans_date_trans_time`` column.
    train_end_frac : float, default 0.64
        Cumulative fraction of rows at which the training split ends.
    val_end_frac : float, default 0.80
        Cumulative fraction of rows at which the validation split ends.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``, each an independent copy.

    Raises
    ------
    ValueError
        If the fractions do not satisfy
        ``0 <= train_end_frac <= val_end_frac <= 1``.
    """
    if not 0 <= train_end_frac <= val_end_frac <= 1:
        raise ValueError(
            "Expected 0 <= train_end_frac <= val_end_frac <= 1, got "
            f"train_end_frac={train_end_frac}, val_end_frac={val_end_frac}"
        )

    # Combine the preprocessed data
    combined_df = pd.concat([train_df_preprocessed, test_df_preprocessed])

    # Remove rows where 'last_purchased_secs' is NaN
    combined_df = combined_df.dropna(subset=['last_purchased_secs'])

    # Remove all January 2019 data
    trans_time = combined_df['trans_date_trans_time']
    is_january_2019 = (trans_time.dt.year == 2019) & (trans_time.dt.month == 1)
    combined_df = combined_df[~is_january_2019]

    # The timestamp was only needed for the filter above.
    combined_df = combined_df.drop(columns=['trans_date_trans_time'])

    # Split the combined data into train, validation, and test sets by order
    total_samples = len(combined_df)
    train_end = int(train_end_frac * total_samples)
    val_end = int(val_end_frac * total_samples)

    # Copy so later column assignments on a split never write through to
    # (or warn about) the shared combined frame.
    train_data = combined_df.iloc[:train_end].copy()
    val_data = combined_df.iloc[train_end:val_end].copy()
    test_data = combined_df.iloc[val_end:].copy()

    return train_data, val_data, test_data

In [None]:
# Mount Google Drive at /content/drive so the cells below can read the CSVs
# and write the pickled models there (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load & Preprocess Data



In [None]:
# Load the raw transaction CSVs; the first CSV column is used as the row index.
train_df = pd.read_csv("/content/drive/Shared drives/Machine Learning/Project/fraudTrain.csv", index_col=0)
test_df = pd.read_csv("/content/drive/Shared drives/Machine Learning/Project/fraudTest.csv", index_col=0)

# Engineer features on each file independently.
train_preprocessed, test_preprocessed = preprocess_train_test(train_df, test_df)

# Combine train + test and re-split into train/val/test.
# The second argument must be the *test* frame: passing `train_preprocessed`
# twice would duplicate every training row, and the positional split would
# then put copies of training rows into val/test (label leakage).
train, val, test = process_fraud_data(train_preprocessed, test_preprocessed)
train.head()

Unnamed: 0,category,amt,gender,city_pop,is_fraud,trans_day_of_week,trans_month,time_of_day,age,last_purchased_secs,distance_last_purchase,same_merchant_with_last_purchase
52525,grocery_net,19.46,F,1423,0,Friday,February,0.001389,20,16027.0,155.644412,False
52526,misc_net,13.01,M,471,0,Friday,February,0.002083,51,163101.0,121.187723,False
52527,gas_transport,50.02,M,471,0,Friday,February,0.003472,77,55381.0,28.230127,False
52528,entertainment,6.11,M,192805,0,Friday,February,0.004167,55,91351.0,132.898691,False
52529,grocery_net,32.14,M,18408,0,Friday,February,0.005556,46,971.0,81.694148,False


In [None]:
categorical_features = ['category', 'gender', 'trans_day_of_week', 'trans_month', 'same_merchant_with_last_purchase']
continuous_features = ['amt', "city_pop", "time_of_day", "age", 'last_purchased_secs', 'distance_last_purchase']

# One-hot encode the training set, dropping the first level of each feature
# as the baseline category.
train = pd.get_dummies(train, columns=categorical_features, drop_first=True)

# Encode val & test with ALL levels and then align them to the training
# columns. Using drop_first=True here would drop the first level *present in
# that split*, which can differ from the training baseline (e.g. a split that
# only covers some months) and would silently zero out a real category.
# Reindexing removes the training baseline column and any level unseen in
# training, and adds levels missing from the split filled with False.
val = pd.get_dummies(val, columns=categorical_features)
val = val.reindex(columns=train.columns, fill_value=False)
test = pd.get_dummies(test, columns=categorical_features)
test = test.reindex(columns=train.columns, fill_value=False)
train.head()

Unnamed: 0,amt,city_pop,is_fraud,time_of_day,age,last_purchased_secs,distance_last_purchase,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September,same_merchant_with_last_purchase_True
52525,19.46,1423,0,0.001389,20,16027.0,155.644412,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
52526,13.01,471,0,0.002083,51,163101.0,121.187723,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
52527,50.02,471,0,0.003472,77,55381.0,28.230127,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
52528,6.11,192805,0,0.004167,55,91351.0,132.898691,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
52529,32.14,18408,0,0.005556,46,971.0,81.694148,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [None]:
# Separate the `is_fraud` target from the feature columns for the training
# and validation splits.
y_train = train['is_fraud']
x_train = train.drop(columns='is_fraud')
y_val = val['is_fraud']
x_val = val.drop(columns='is_fraud')

In [None]:
# Class balance of the training target: number of rows per `is_fraud` value.
y_train.value_counts()

Unnamed: 0_level_0,count
is_fraud,Unnamed: 1_level_1
0,1583366
1,9056


In [None]:
# Class balance of the validation target: number of rows per `is_fraud` value.
y_val.value_counts()

Unnamed: 0_level_0,count
is_fraud,Unnamed: 1_level_1
0,396104
1,2002


# Final Model Selection



# Random Forest Without Sampling

In [None]:
from sklearn.ensemble import RandomForestClassifier
# classification_report / roc_auc_score are used at the end of this cell;
# import them here so the cell runs on a fresh kernel.
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 20],
    'min_samples_split': [2, 6],
    'min_samples_leaf': [1, 3]
}

# Stratified folds keep the fraud ratio the same in every fold
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Perform Grid Search with Stratified K-Fold CV
grid_search = GridSearchCV(estimator=rf_classifier,
              param_grid=param_grid,
              cv=stratified_kfold,  # Use stratified KFold
              scoring='roc_auc',
              n_jobs=-1,verbose=2)

# Fit the grid search to the  training data
grid_search.fit(x_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best ROC AUC score: {best_score}")

# GridSearchCV (refit=True by default) has already refit the best
# configuration on the full training set, with the same random_state, so
# reuse that estimator instead of training an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# Make predictions on the validation set
y_pred_rf = best_rf_model.predict(x_val)
y_prob_rf = best_rf_model.predict_proba(x_val)[:, 1]


# Evaluate the model
print(classification_report(y_val, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_val, y_prob_rf))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


KeyboardInterrupt: 

# XGBoost Model Without Sampling

In [None]:
# Tune XGBoost with a stratified 3-fold grid search on the training set, then
# evaluate the best configuration on the validation set.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.3],
    'subsample': [1],
    'colsample_bytree': [1]
}

grid_search_xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, tree_method='hist'),  # 'hist' is the CPU tree method
    param_grid=param_grid_xgb,
    cv=stratified_kfold,
    scoring='roc_auc',
    n_jobs=-1
)

# Fit the grid search to the training data
grid_search_xgb.fit(x_train, y_train)

# Get the best parameters and best score
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

print(f"Best parameters for XGBoost: {best_params_xgb}")
print(f"Best ROC AUC score for XGBoost: {best_score_xgb}")

# GridSearchCV (refit=True by default) has already refit the best
# configuration on the full training set with the same random_state and
# tree_method, so reuse it rather than fitting an identical model again.
best_xgb_model = grid_search_xgb.best_estimator_

# Make predictions on the validation set
y_pred_xgb = best_xgb_model.predict(x_val)
y_prob_xgb = best_xgb_model.predict_proba(x_val)[:, 1]

# Evaluate the model
print(classification_report(y_val, y_pred_xgb))
print("ROC AUC Score for XGBoost:", roc_auc_score(y_val, y_prob_xgb))



Best parameters for XGBoost: {'colsample_bytree': 1, 'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 200, 'subsample': 1}
Best ROC AUC score for XGBoost: 0.9993454457933123
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    396104
           1       1.00      0.95      0.97      2002

    accuracy                           1.00    398106
   macro avg       1.00      0.98      0.99    398106
weighted avg       1.00      1.00      1.00    398106

ROC AUC Score for XGBoost: 0.9999817667134836


In [None]:
import pickle

# Persist the fitted XGBoost model to the shared drive.
xgb_model_path = "/content/drive/Shared drives/Machine Learning/Project/xgb_model_notbalanced.pkl"
with open(xgb_model_path, "wb") as model_file:
    pickle.dump(best_xgb_model, model_file)

# Logistic Regression Without Sampling

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Add an intercept term for statsmodels.
# has_constant='add' always adds the `const` column. The default ('skip')
# omits it when the data already contains a constant column, which could give
# the train and validation matrices different columns.
# NOTE: despite the "smote" in its name, x_train_smote_const holds the
# original (unsampled) training features; the name is kept unchanged in case
# other cells refer to it.
x_train_smote_const = sm.add_constant(x_train, has_constant='add')
x_val_const = sm.add_constant(x_val, has_constant='add')


# Fit the logistic regression model using statsmodels
# (cast to float because the one-hot columns are boolean)
logit_model = sm.Logit(np.asarray(y_train), x_train_smote_const.astype(float)).fit()
print(logit_model.summary())

# Make predictions on the validation set
y_prob_lr = logit_model.predict(x_val_const.astype(float))  # Probability predictions
y_pred_lr = (y_prob_lr >= 0.5).astype(int)  # Convert to binary predictions

# Evaluate the model
print(classification_report(y_val, y_pred_lr))
print("ROC AUC Score for Logistic Regression:", roc_auc_score(y_val, y_prob_lr))

Optimization terminated successfully.
         Current function value: 0.027726
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:              1592422
Model:                          Logit   Df Residuals:                  1592383
Method:                           MLE   Df Model:                           38
Date:                Mon, 10 Mar 2025   Pseudo R-squ.:                  0.2094
Time:                        00:44:37   Log-Likelihood:                -44152.
converged:                       True   LL-Null:                       -55846.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
const                                    -9.2206     

In [None]:
import pickle

# Persist the fitted statsmodels logit results to the shared drive.
logit_model_path = "/content/drive/Shared drives/Machine Learning/Project/logi_model_notbalanced.pkl"
with open(logit_model_path, "wb") as model_file:
    pickle.dump(logit_model, model_file)

# SVM Without Sampling

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the CV splitter here (same settings as the other model cells) so
# this cell does not depend on an earlier cell having been run.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# A linear SVM is sensitive to feature scale, and the raw features span very
# different ranges (e.g. city_pop and last_purchased_secs vs. 0/1 dummies).
# Standardize inside a pipeline so the scaler is fit only on the training
# part of each CV fold and travels with the model.
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', LinearSVC(random_state=42, max_iter=10000)),
])

# Define the parameter grid for Linear SVM
param_grid_svm = {
    'svm__C': [0.1, 1, 10]  # Regularization parameter of the LinearSVC step
}

# Initialize GridSearchCV for the scaled LinearSVC
grid_search_svm = GridSearchCV(estimator=svm_pipeline,
                                param_grid=param_grid_svm,
                                cv=stratified_kfold,
                                scoring='roc_auc',
                                n_jobs=-1,
                                verbose=2)

# Fit the grid search to the training data
grid_search_svm.fit(x_train, y_train)

# Get the best parameters and best score.
# best_params_svm keeps the plain LinearSVC keyword form ({'C': ...}).
best_params_svm = {'C': grid_search_svm.best_params_['svm__C']}
best_score_svm = grid_search_svm.best_score_

print(f"Best parameters for Linear SVM: {best_params_svm}")
print(f"Best ROC AUC score for Linear SVM: {best_score_svm}")

# GridSearchCV (refit=True by default) has already refit the best pipeline
# on the full training set. best_svm_model is therefore the fitted
# scaler + LinearSVC pipeline and accepts the unscaled feature matrix.
best_svm_model = grid_search_svm.best_estimator_

# LinearSVC has no predict_proba; wrap it in a calibrator, which refits
# clones of the pipeline with its own internal cross-validation.
calibrated_svm = CalibratedClassifierCV(best_svm_model)
calibrated_svm.fit(x_train, y_train)

# Make predictions on the validation set
y_pred_svm = calibrated_svm.predict(x_val)
y_prob_svm = calibrated_svm.predict_proba(x_val)[:, 1]

# Evaluate the model
print(classification_report(y_val, y_pred_svm))
print("ROC AUC Score for Linear SVM:", roc_auc_score(y_val, y_prob_svm))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[LibLinear]Best parameters for Linear SVM: {'C': 10}
Best ROC AUC score for Linear SVM: 0.6210537363139086
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    396104
           1       0.00      0.00      0.00      2002

    accuracy                           0.99    398106
   macro avg       0.50      0.50      0.50    398106
weighted avg       0.99      0.99      0.99    398106

ROC AUC Score for Linear SVM: 0.5686801749237372


In [None]:
import pickle

# Persist the fitted SVM model (`best_svm_model`, not the calibrated wrapper)
# to the shared drive.
svm_model_path = "/content/drive/Shared drives/Machine Learning/Project/svm_model_notbalanced.pkl"
with open(svm_model_path, "wb") as model_file:
    pickle.dump(best_svm_model, model_file)