In [2]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Load dataset
df = pd.read_csv("dataset.csv")

# --- Handle categorical features ---
categorical_cols = ['merchant', 'category', 'gender', 'job', 'first', 'last', 'street']
encoder = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str)  # ensure all strings
    df[col] = le.fit_transform(df[col])
    encoder[col] = le

# Save encoder
joblib.dump(encoder, 'label_encoder.jb')

# --- Handle datetime columns ---
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
df['age'] = df['age'].fillna(0)
df = df.drop(columns=['dob', 'trans_date_trans_time'])

# --- Hash credit card number ---
df['cc_num'] = df['cc_num'].apply(lambda x: hash(str(x)) % (10**4))

# --- Target column ---
y = df['is_fraud']
X = df.drop(columns=['is_fraud', 'Unnamed: 0', 'trans_num', 'unix_time', 'city', 'state', 'zip'])

# --- Optional features ---
X['amt_log'] = np.log1p(X['amt'])
X['distance_km'] = np.sqrt((X['lat'] - X['merch_lat'])**2 + (X['long'] - X['merch_long'])**2)

# --- Train/Test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train LightGBM Model ---
model = lgb.LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Save model
joblib.dump(model, 'fraud_detection_model.jb')

# --- Evaluate ---
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)


[LightGBM] [Info] Number of positive: 5986, number of negative: 1031354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3619
[LightGBM] [Info] Number of data points in the train set: 1037340, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005771 -> initscore=-5.149204
[LightGBM] [Info] Start training from score -5.149204
Test Accuracy: 0.9952879480208996
