In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score

import joblib

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline

from scipy.stats import uniform

In [2]:
# Read the raw table from disk.
df = pd.read_csv('../data/Base.csv')

# Split the frame into the feature matrix (X) and the fraud label vector (y).
X = df.drop(columns='fraud_bool')
y = df['fraud_bool']


In [3]:
# Show each feature column's dtype and non-null count before any encoding.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 31 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   income                            1000000 non-null  float64
 1   name_email_similarity             1000000 non-null  float64
 2   prev_address_months_count         1000000 non-null  int64  
 3   current_address_months_count      1000000 non-null  int64  
 4   customer_age                      1000000 non-null  int64  
 5   days_since_request                1000000 non-null  float64
 6   intended_balcon_amount            1000000 non-null  float64
 7   payment_type                      1000000 non-null  object 
 8   zip_count_4w                      1000000 non-null  int64  
 9   velocity_6h                       1000000 non-null  float64
 10  velocity_24h                      1000000 non-null  float64
 11  velocity_4w                       1000

In [4]:
# Names of the string-typed (object dtype) feature columns; these are the
# ones that get one-hot encoded in the next step.
cat_columns = X.select_dtypes(include='object').columns
print(cat_columns)

Index(['payment_type', 'employment_status', 'housing_status', 'source',
       'device_os'],
      dtype='object')


In [5]:
# One-hot encode every categorical column: each is replaced by one indicator
# column per category level.
X = pd.get_dummies(data=X, columns=cat_columns)

# Re-inspect the frame to confirm the new column count and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 52 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   income                            1000000 non-null  float64
 1   name_email_similarity             1000000 non-null  float64
 2   prev_address_months_count         1000000 non-null  int64  
 3   current_address_months_count      1000000 non-null  int64  
 4   customer_age                      1000000 non-null  int64  
 5   days_since_request                1000000 non-null  float64
 6   intended_balcon_amount            1000000 non-null  float64
 7   zip_count_4w                      1000000 non-null  int64  
 8   velocity_6h                       1000000 non-null  float64
 9   velocity_24h                      1000000 non-null  float64
 10  velocity_4w                       1000000 non-null  float64
 11  bank_branch_count_8w              1000

In [6]:
# Number of distinct values per feature column; used to spot constant columns.
X.nunique()

income                                   9
name_email_similarity               998861
prev_address_months_count              374
current_address_months_count           423
customer_age                             9
days_since_request                  989330
intended_balcon_amount              994971
zip_count_4w                          6306
velocity_6h                         998687
velocity_24h                        998940
velocity_4w                         998318
bank_branch_count_8w                  2326
date_of_birth_distinct_emails_4w        40
credit_risk_score                      551
email_is_free                            2
phone_home_valid                         2
phone_mobile_valid                       2
bank_months_count                       33
has_other_cards                          2
proposed_credit_limit                   12
foreign_request                          2
session_length_in_minutes           994887
keep_alive_session                       2
device_dist

In [7]:
# device_fraud_count takes a single value across the whole dataset, so it
# carries no information and is removed from the feature matrix.
X = X.drop(columns='device_fraud_count')


In [8]:
# Shuffle and split the data, holding out 20% as a test set. Stratifying on
# the label keeps the (rare) fraud rate the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_test.nunique())

# Class balance of the held-out labels, computed with vectorised comparisons
# rather than a Python-level loop over every row.
not_fraud = int((y_test == 0).sum())
fraud = int((y_test != 0).sum())

print(not_fraud)
print(fraud)

(800000, 51)
(200000, 51)
(800000,)
(200000,)
2
197794
2206


In [9]:
# Standardise the features to improve performance on some models.
# The scaler's statistics are learned from the training partition only and
# then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [10]:
# # Vanilla SVM Model with default params, C=1, gamma=1, no CV, to get idea of initial accuracy
# svm_classifier_vanilla = SVC(kernel='rbf', verbose=True, shrinking=False)
# svm_classifier_vanilla.fit(X_train, y_train)

In [11]:
# preds = svm_classifier_vanilla.predict(X_test)
# accuracy = accuracy_score(y_test, preds)
# print(f"Accuracy: {accuracy}")

In [12]:
# unique_values, counts = np.unique(preds, return_counts=True)
# print(unique_values)
# print(preds)
# y_unique_values, y_counts = np.unique(y_test, return_counts=True)
# print(y_unique_values)
# print(y_counts)


In [13]:
# fraud = 0
# not_fraud = 0
# for p in preds:
#     if p == 0:
#         not_fraud += 1
#     else:
#         fraud += 1
# print(fraud)
# print(not_fraud)

In [14]:
# print(classification_report(y_test, preds))

In [15]:
# Baseline: cross-validated logistic regression with class weights balanced
# to offset the rarity of the fraud class. fit() returns the fitted estimator.
lr_model = LogisticRegressionCV(class_weight='balanced').fit(X_train, y_train)

# Hard 0/1 predictions on the held-out set.
predictions = lr_model.predict(X_test)

In [16]:
# Compare how many rows were predicted per class against the true class counts.
print(predictions.shape)
for position, (tag, labels) in enumerate([('Prediction Results:', predictions), ('True Results:', y_test)]):
    if position:
        print()  # blank line separating the two summaries
    unique_values, counts = np.unique(labels, return_counts=True)
    print(tag, unique_values)
    print(tag, counts)

(200000,)
Prediction Results: [0 1]
Prediction Results: [159939  40061]

True Results: [0 1]
True Results: [197794   2206]


In [17]:
# Per-class precision, recall and F1 of the current predictions on the held-out set.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      0.81      0.89    197794
           1       0.04      0.79      0.08      2206

    accuracy                           0.81    200000
   macro avg       0.52      0.80      0.49    200000
weighted avg       0.99      0.81      0.88    200000



In [20]:
# Cross-validated logistic regression using the newton-cholesky solver, with
# the regularisation strength selected by ROC AUC and classes re-weighted to
# offset the imbalance. verbose=1 prints the solver's convergence trace.
logCV = LogisticRegressionCV(
    solver='newton-cholesky',
    max_iter=500,
    scoring='roc_auc',
    random_state=42,
    class_weight='balanced',
    verbose=1,
)
logCV_model = logCV.fit(X_train, y_train)

Newton iter=1
  Check Convergence
    1. max |gradient| 0.05107325695573011 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.0087508780106928 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.00041578966272695986 <= 0.0001
Newton iter=4
  Check Convergence
    1. max |gradient| 1.029321581536187e-06 <= 0.0001
    2. Newton decrement 1.4852904833182895e-06 <= 0.0001
  Solver did converge at loss = 0.4559054287710034.
Newton iter=1
  Check Convergence
    1. max |gradient| 0.00040427142840490703 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 2.0172824880143082e-06 <= 0.0001
    2. Newton decrement 3.836978486295411e-06 <= 0.0001
  Solver did converge at loss = 0.4454595026347563.
Newton iter=1
  Check Convergence
    1. max |gradient| 1.0006205272233997e-05 <= 0.0001
    2. Newton decrement 2.279832301864572e-05 <= 0.0001
  Solver did converge at loss = 0.44394112365704136.
Newton iter=1
  Check Convergence
    1. max |gradient| 2.0

In [21]:
# Hard 0/1 predictions from the newton-cholesky model on the held-out set.
predictions = logCV_model.predict(X_test)

# Compare how many rows were predicted per class against the true class counts.
print(predictions.shape)
for position, (tag, labels) in enumerate([('Prediction Results:', predictions), ('True Results:', y_test)]):
    if position:
        print()  # blank line separating the two summaries
    unique_values, counts = np.unique(labels, return_counts=True)
    print(tag, unique_values)
    print(tag, counts)

(200000,)
Prediction Results: [0 1]
Prediction Results: [159694  40306]

True Results: [0 1]
True Results: [197794   2206]


In [None]:
# Randomised hyper-parameter search over the regularisation strength C and
# the penalty type for a class-weighted logistic regression (saga supports
# both l1 and l2).
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=42,  class_weight='balanced')
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])

# Rank candidates by ROC AUC, matching the other searches in this notebook.
# The default scorer is accuracy, which is uninformative here: roughly 1% of
# rows are fraud, so a model that never predicts fraud already scores ~99%.
clf = RandomizedSearchCV(logistic, distributions, scoring='roc_auc', random_state=42)
search = clf.fit(X_train, y_train)
search.best_params_

In [None]:
# Predict held-out labels using the fitted randomised search object.
log_cv_predictions = search.predict(X_test)

In [None]:
# Compare how many rows were predicted per class against the true class counts.
print(log_cv_predictions.shape)
for position, (tag, labels) in enumerate([('Prediction Results:', log_cv_predictions), ('True Results:', y_test)]):
    if position:
        print()  # blank line separating the two summaries
    unique_values, counts = np.unique(labels, return_counts=True)
    print(tag, unique_values)
    print(tag, counts)

In [None]:
# SMOTENC must be told WHICH columns are categorical (indices or a boolean
# mask), not handed the data itself. X_train is a plain array after scaling,
# so the columns are identified by position: X_train keeps the column order
# of X, and every column of X that is absent from the raw frame was created
# by the one-hot encoding step.
# NOTE(review): each indicator column is treated as its own binary category,
# so a synthetic row is not guaranteed to have exactly one level set per
# original feature -- confirm this is acceptable.
dummy_column_indices = [
    position for position, column in enumerate(X.columns) if column not in df.columns
]
smote_nc = SMOTENC(categorical_features=dummy_column_indices, sampling_strategy='minority', random_state=42)

In [None]:
# Stratified folds keep the fraud rate comparable across CV splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Oversample inside the pipeline so SMOTENC is applied to each training fold
# only; the validation folds stay untouched and the scores are not inflated
# by synthetic rows. Uses the smote_nc sampler defined in the previous cell.
# NOTE(review): the original line was syntactically broken (unterminated
# string, a DataFrame passed as the estimator); this is a reconstruction of
# the apparent intent -- confirm the estimator and the search space.
smote_logistic_pipeline = make_pipeline(
    smote_nc,
    LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=42),
)

# make_pipeline names each step after its lowercased class name, hence the
# 'logisticregression__' prefix on the tunable parameters.
smote_param_distributions = {
    'logisticregression__C': uniform(loc=0, scale=4),
    'logisticregression__penalty': ['l2', 'l1'],
}

logistic_reg_CV_model = RandomizedSearchCV(
    estimator=smote_logistic_pipeline,
    param_distributions=smote_param_distributions,
    n_iter=20,
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    random_state=42,
)

In [None]:
# Run the search on the training partition, then predict held-out labels
# with the fitted search object (fit() returns the object itself).
predictions = logistic_reg_CV_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Select every numeric column. 'number' is used instead of ['float', 'int']
# because the width of the 'int' alias depends on the platform, which can
# silently exclude int64 columns.
numerical_cols = df.select_dtypes(include='number').columns

# One histogram per numeric column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[col], kde=False, color='blue', bins=20, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Draw one box plot per numeric column to expose spread and outliers.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=df[col], color='green', ax=ax)
    ax.set_title(f'Box Plot of {col}')
    ax.set_xlabel(col)
    plt.show()


In [None]:
# Draw one count plot per categorical (object dtype) column of the raw frame.
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x=col, palette='Set2', ax=ax)
    ax.set_title(f'Count Plot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Tilt the category labels so longer names do not overlap.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# DataFrame.info() writes its summary directly and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
df.info()

In [None]:
# Number of distinct values in each column of the raw frame (label included).
print(df.nunique())

In [None]:
print(df.shape)

# Restrict to the confirmed-fraud rows; the principal axes are learned from
# these rows only.
fraud_df = df.loc[df['fraud_bool'] == 1]
print(fraud_df.nunique())

# Numeric feature columns only. The label is excluded: it is constant within
# fraud_df and is the target, not a feature.
feature_cols = df.select_dtypes(include=['number']).columns.drop('fraud_bool')
num_fraud_df = fraud_df[feature_cols]

print(num_fraud_df.isnull().values.any())

# Standardize the features. A dedicated name is used so the scaler fitted on
# the modelling data earlier in the notebook is not overwritten.
pca_scaler = StandardScaler()
scaled_df = pca_scaler.fit_transform(num_fraud_df)

# Apply PCA to the *standardized* data; on unscaled data the components are
# dominated by whichever columns have the largest raw variance.
pca = PCA(n_components=2)
pca.fit(scaled_df)

# Names must come from the frame PCA was fitted on so that each loading is
# paired with the right column.
original_column_names = num_fraud_df.columns

# Get the principal components
principal_components = pca.components_

# Print original column names corresponding to PC1 and PC2
print("Original column names corresponding to PC1 and PC2:")
for i, component in enumerate(principal_components):
    print(f"PC{i+1}:")
    for name, weight in zip(original_column_names, component):
        print(f"   {name}: {weight}")

# Project every row of the dataset onto the two fraud-derived components.
# Keeping df's index means the class masks below select the correct rows.
pca_df = pd.DataFrame(
    data=pca.transform(pca_scaler.transform(df[feature_cols])),
    columns=['PC1', 'PC2'],
    index=df.index,
)
is_fraud = df['fraud_bool'] == 1

plt.figure(figsize=(10, 6))

# Plot non-fraudulent rows in blue
plt.scatter(pca_df.loc[~is_fraud, 'PC1'],
            pca_df.loc[~is_fraud, 'PC2'],
            color='blue', label='Not Fraudulent')

# Plot fraudulent rows in red (drawn second so they sit on top)
plt.scatter(pca_df.loc[is_fraud, 'PC1'],
            pca_df.loc[is_fraud, 'PC2'],
            color='red', label='Fraudulent')

plt.title('PCA Plot')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()