In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.cluster import DBSCAN
from xgboost import XGBClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the datasets
train_df = pd.read_csv("../datasets/fraudTrain.csv")
test_df = pd.read_csv("../datasets/fraudTest.csv")

# Drop ID column if it exists
for df in [train_df, test_df]:
    if 'id' in df.columns:
        df.drop(columns=['id'], inplace=True)

'''
# Separate features and labels
X_train = train_df.drop(columns=['is_fraud'])
y_train = train_df['is_fraud']

X_test = test_df.drop(columns=['is_fraud'])
y_test = test_df['is_fraud']

# Display shapes for confirmation
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
'''


'\n# Separate features and labels\nX_train = train_df.drop(columns=[\'is_fraud\'])\ny_train = train_df[\'is_fraud\']\n\nX_test = test_df.drop(columns=[\'is_fraud\'])\ny_test = test_df[\'is_fraud\']\n\n# Display shapes for confirmation\nprint("X_train shape:", X_train.shape)\nprint("y_train shape:", y_train.shape)\nprint("X_test shape:", X_test.shape)\nprint("y_test shape:", y_test.shape)\n'

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Drop identifier and personal columns
drop_cols = ['id', 'cc_num', 'first', 'last', 'trans_num', 'street', 'unix_time']
train_df = train_df.drop(columns=[col for col in drop_cols if col in train_df.columns])
test_df = test_df.drop(columns=[col for col in drop_cols if col in test_df.columns])

# Convert datetime to time features
for df in [train_df, test_df]:
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

    # Compute age in years at time of transaction
    df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25
    df['age'] = df['age'].fillna(df['age'].median())  # Handle any NaT values
    
    df['trans_hour'] = df['trans_date_trans_time'].dt.hour
    df['trans_day'] = df['trans_date_trans_time'].dt.day
    df['trans_weekday'] = df['trans_date_trans_time'].dt.weekday
    df.drop(columns=['trans_date_trans_time', 'dob'], inplace=True)

# Target variable
y_train = train_df['is_fraud']
y_test = test_df['is_fraud']

# Drop target from features
X_train = train_df.drop(columns=['is_fraud'])
X_test = test_df.drop(columns=['is_fraud'])

# Identify categorical and numeric columns
categorical_cols = ['merchant', 'category', 'gender', 'job', 'city', 'state']
numerical_cols = [col for col in X_train.columns if col not in categorical_cols]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Apply preprocessing
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Final shape confirmation
print("X_train shape:", X_train_transformed.shape)
print("X_test shape:", X_test_transformed.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1296675, 2159)
X_test shape: (555719, 2159)
y_train shape: (1296675,)
y_test shape: (555719,)


In [None]:


# --------------------------
# Supervised Models
# --------------------------
models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    y_prob = model.predict_proba(X_test_transformed)[:, 1]

    print(f"\n{name.replace('_', ' ').title()}")
    print("-" * 30)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("ROC AUC:", round(roc_auc_score(y_test, y_prob), 4))
    
    # Save the model
    joblib.dump(model, f"models/{name}.joblib")


Logistic Regression
------------------------------
Confusion Matrix:
[[553322    252]
 [  2116     29]]
ROC AUC: 0.6622

Decision Tree
------------------------------
Confusion Matrix:
[[553066    508]
 [   461   1684]]
ROC AUC: 0.8921

Random Forest
------------------------------
Confusion Matrix:
[[553541     33]
 [   914   1231]]
ROC AUC: 0.9504


Parameters: { "use_label_encoder" } are not used.




Xgboost
------------------------------
Confusion Matrix:
[[553416    158]
 [   537   1608]]
ROC AUC: 0.9958


In [19]:
for name, model in models.items():
    y_pred = model.predict(X_test_transformed)
    y_prob = model.predict_proba(X_test_transformed)[:, 1]

    print(f"\n{name.replace('_', ' ').title()}")
    print("-" * 30)
    print("Report")
    print(classification_report(y_test, y_pred))


Logistic Regression
------------------------------
Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.10      0.01      0.02      2145

    accuracy                           1.00    555719
   macro avg       0.55      0.51      0.51    555719
weighted avg       0.99      1.00      0.99    555719


Decision Tree
------------------------------
Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.77      0.79      0.78      2145

    accuracy                           1.00    555719
   macro avg       0.88      0.89      0.89    555719
weighted avg       1.00      1.00      1.00    555719


Random Forest
------------------------------
Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.97      0.57      0.72      2145

    accuracy     

In [4]:

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN


# --------------------------
# Apply PCA
# --------------------------
pca = PCA(n_components=99, random_state=42)
X_pca = pca.fit_transform(X_train_transformed)
print("Explained Variance Ratio (PCA):", pca.explained_variance_ratio_.sum())

# Save PCA
joblib.dump(pca, "models/pca_model.joblib")

Explained Variance Ratio (PCA): 0.840798136415197


['models/pca_model.joblib']

In [5]:
X_test_PCA = pca.fit_transform(X_test_transformed)

In [6]:
def evaluate_model(name, true, pred, score_source=None):
    acc = accuracy_score(true, pred)
    auc = roc_auc_score(true, score_source if score_source is not None else pred)
    print(f"\n{name} Evaluation")
    print("=" * len(name))
    print("Accuracy:", round(acc, 4))
    print("ROC AUC:", round(auc, 4))
    print("Confusion Matrix:\n", confusion_matrix(true, pred))
    print("Classification Report:\n", classification_report(true, pred))

In [None]:

# --------------------------
# 1. Autoencoder on PCA Data
# --------------------------
print("\n[Autoencoder]")
input_dim = X_pca.shape[1]

input_layer = Input(shape=(input_dim,))
encoder = Dense(6, activation="relu")(input_layer)
encoder = Dense(3, activation="relu")(encoder)
decoder = Dense(6, activation="relu")(encoder)
decoder = Dense(input_dim, activation="sigmoid")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

autoencoder.fit(X_pca, X_pca, epochs=20, batch_size=256, validation_split=0.2, shuffle=True, verbose=0)

# Save Autoencoder
autoencoder.save("models/autoencoder_pca_model.keras")

# Predictions and evaluation
reconstructions = autoencoder.predict(X_test_PCA)


[Autoencoder]
[1m17367/17367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 408us/step


ValueError: operands could not be broadcast together with shapes (1296675,99) (555719,99) 

In [8]:

mse = np.mean(np.power(X_test_PCA - reconstructions, 2), axis=1)
threshold = np.percentile(mse[y_test == 0], 95)
auto_preds = (mse > threshold).astype(int)

evaluate_model("Autoencoder", y_test, auto_preds, mse)


Autoencoder Evaluation
Accuracy: 0.9483
ROC AUC: 0.8052
Confusion Matrix:
 [[525895  27679]
 [  1070   1075]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.04      0.50      0.07      2145

    accuracy                           0.95    555719
   macro avg       0.52      0.73      0.52    555719
weighted avg       0.99      0.95      0.97    555719



In [None]:
# --------------------------
# 2. DBSCAN on PCA Data
# --------------------------
print("\n[DBSCAN]")
dbscan = DBSCAN(eps=1.5, min_samples=5)
db_labels = dbscan.fit_predict(X_pca)
binary_dbscan_labels = (db_labels == -1).astype(int)

evaluate_model("DBSCAN", y_train, binary_dbscan_labels)

# Save DBSCAN model
joblib.dump(dbscan, "models/dbscan_pca_model.joblib")


[DBSCAN]


In [10]:
# --------------------------
# 3. KMeans on PCA Data
# --------------------------
print("\n[KMeans]")
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans_labels = kmeans.fit_predict(X_pca)

# Flip label if needed
if np.mean(y_train[kmeans_labels == 0]) > np.mean(y_train[kmeans_labels == 1]):
    kmeans_labels = 1 - kmeans_labels

evaluate_model("KMeans", y_train, kmeans_labels)

# Save KMeans model
joblib.dump(kmeans, "models/kmeans_pca_model.joblib")


[KMeans]

KMeans Evaluation
Accuracy: 0.3661
ROC AUC: 0.5078
Confusion Matrix:
 [[469846 819323]
 [  2619   4887]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.36      0.53   1289169
           1       0.01      0.65      0.01      7506

    accuracy                           0.37   1296675
   macro avg       0.50      0.51      0.27   1296675
weighted avg       0.99      0.37      0.53   1296675



['models/kmeans_pca_model.joblib']

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Example input dimension
input_dim = X_train.shape[1]  # e.g., 99

# Define model
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Training
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32)

# Predicting
reconstructions = autoencoder.predict(X_test)