In [None]:
import pandas as pd
import joblib
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the raw transaction table and list its columns as a quick schema check.
df = pd.read_csv(filepath_or_buffer='dataset.csv', encoding='utf-8')
list(df.columns)

In [None]:
# Build two helper columns from the raw date/time strings:
#   - hour:      hour of day, used below to bucket rows into time periods
#   - timestamp: full transaction datetime, used later by the rolling velocity features
time_of_day = pd.to_datetime(df['Transaction_Time'], format='%H:%M:%S')
df['hour'] = time_of_day.dt.hour

if 'Transaction_Date' in df.columns:
    # Parsing the time alone pins every row to the same placeholder date (1900-01-01),
    # so time-based rolling windows ('6h', '24h') would ignore the calendar day entirely.
    # Combine the calendar date with the time-of-day offset instead.
    # NOTE(review): the date layout is inferred by pandas here - pass an explicit
    # format= (or dayfirst=) once the format of Transaction_Date is confirmed.
    transaction_day = pd.to_datetime(df['Transaction_Date']).dt.normalize()
    df['timestamp'] = transaction_day + (time_of_day - time_of_day.dt.normalize())
else:
    # No date column available: fall back to the time-only timestamp.
    df['timestamp'] = time_of_day

def categorize_time(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    hour : number
        Hour of the day.

    Returns
    -------
    str
        'Morning' for 5 <= hour < 12, 'Noon' for 12 <= hour < 17,
        'Evening' for 17 <= hour < 21, and 'Night' for anything else
        (including values that match none of the ranges).
    """
    # (inclusive start, exclusive end, label) for every non-night bucket
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Noon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'

# Label every row with its time-of-day bucket.
df['time_period'] = df['hour'].map(categorize_time)

# One-hot encode the bucket so the model receives numeric time_period_* columns.
df = pd.get_dummies(data=df, columns=['time_period'])

In [None]:
# Identifier, contact, free-text and demographic columns that are not used as features.
# 'Customer_ID' is intentionally NOT listed here: the per-customer features
# computed further down group on it.
unused_columns = [
    'Customer_Name',
    'Transaction_ID',
    'Merchant_ID',
    'Customer_Contact',
    'Customer_Email',
    'Transaction_Description',
    'Transaction_Date',
    'Transaction_Time',
    'Bank_Name',
    'City',
    'Gender',
]

# errors='ignore' tolerates columns that are absent from the loaded file.
df_clean = df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Sanity check: list the columns that remain after the drop above.
df_clean.columns.to_list()

In [None]:
def bankrupt_ratio(df):
    """Add 'amount_to_balance_ratio': transaction amount relative to account balance.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Transaction_Amount' and 'Account_Balance'.

    Returns
    -------
    pandas.DataFrame
        The same object, with the new ratio column.
    """
    # Pad the balance by 0.01 so a zero balance does not divide by zero.
    padded_balance = df['Account_Balance'] + 0.01
    df['amount_to_balance_ratio'] = df['Transaction_Amount'].div(padded_balance)
    return df

def high_amount_night(df, quantile=0.90, night_col='time_period_Night'):
    """Add 'high_amount_night': 1 for unusually large transactions made at night.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Transaction_Amount'. `night_col` is optional (see below).
    quantile : float, default 0.90
        Amounts strictly above this quantile of 'Transaction_Amount' (computed
        over the whole frame) count as "high".
    night_col : str, default 'time_period_Night'
        One-hot column flagging night-time rows. If the column is missing
        (one-hot encoding creates no column for a category that never occurs),
        no row is treated as a night transaction.

    Returns
    -------
    pandas.DataFrame
        The same object, with the new 0/1 column.
    """
    # 1. Night flag; fall back to "never night" when the dummy column does not exist.
    if night_col in df.columns:
        is_night = df[night_col].eq(True)
    else:
        is_night = pd.Series(False, index=df.index)

    # 2. "High amount" threshold, derived from the data rather than fixed.
    high_threshold = df['Transaction_Amount'].quantile(quantile)
    is_high_amount = df['Transaction_Amount'] > high_threshold

    # 3. Flag rows where both conditions hold.
    df['high_amount_night'] = (is_night & is_high_amount).astype(int)
    return df

def velocity_check(df, windows=('24h', '6h')):
    """Add per-customer transaction-count ("velocity") features over trailing time windows.

    For every window `w` a column 'velocity_<w>' is added. It holds, for each row,
    the number of that customer's transactions (rows with a non-null
    'Transaction_Amount') whose timestamp falls in the trailing window ending
    at the row's own timestamp.

    Windows are measured on the 'timestamp' column, so it has to carry the real
    calendar date for windows that are meant to span days to be meaningful.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Customer_ID', 'timestamp' and 'Transaction_Amount'.
        The input frame is not modified.
    windows : iterable of str, default ('24h', '6h')
        Offset strings accepted by pandas time-based rolling windows.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ['Customer_ID', 'timestamp'] (original index
        labels are kept) with one 'velocity_<w>' column per window.

    Raises
    ------
    ValueError
        If 'Customer_ID' contains missing values: groupby would silently drop
        those rows and the counts could no longer be aligned with the frame.
    """
    if df['Customer_ID'].isna().any():
        raise ValueError("velocity_check: 'Customer_ID' contains missing values")

    # Coerce the timestamp on a copy so the caller's frame stays untouched.
    ordered = df.assign(timestamp=pd.to_datetime(df['timestamp']))

    # Sorting is mandatory: time-based rolling needs monotonic timestamps per
    # customer, and it makes the grouped result line up row-for-row with `ordered`.
    ordered = ordered.sort_values(['Customer_ID', 'timestamp'])

    # Built once and reused for every window. The timestamp index tells
    # rolling() what the window is measured against.
    amounts_by_customer = (
        ordered.set_index('timestamp')
               .groupby('Customer_ID')['Transaction_Amount']
    )

    for window in windows:
        # Groups come out in sorted Customer_ID order with rows in their sorted
        # order, i.e. exactly the row order of `ordered`, so the raw values can
        # be assigned positionally.
        ordered[f'velocity_{window}'] = (
            amounts_by_customer.rolling(window).count().to_numpy()
        )

    return ordered

def savings_attack(df):
    """Add 'is_savings_attack': 1 for Savings-account rows with a burst of activity.

    A row is flagged when 'Account_Type' equals 'Savings' and 'velocity_6h'
    is at least 2. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Account_Type' and 'velocity_6h'.

    Returns
    -------
    pandas.DataFrame
        The same object, with the new 0/1 column.
    """
    on_savings = df['Account_Type'].eq('Savings')
    # Threshold of 2 transactions within the 6-hour window.
    is_bursty = df['velocity_6h'].ge(2)
    df['is_savings_attack'] = (on_savings & is_bursty).astype(int)
    return df

def is_outlier(df):
    """Add a per-customer z-score of the amount and a high-side outlier flag.

    Adds 'user_z_score' (amount minus the customer's mean amount, divided by
    the customer's standard deviation plus 0.001) and 'is_outlier' (1 when the
    z-score exceeds 1.5). The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Customer_ID' and 'Transaction_Amount'.

    Returns
    -------
    pandas.DataFrame
        The same object, with the two new columns.
    """
    per_customer = df.groupby("Customer_ID")["Transaction_Amount"]
    baseline = per_customer.transform("mean")

    # The standard deviation is NaN for customers with a single transaction;
    # treat that as zero spread.
    spread = per_customer.transform("std").fillna(0)

    # The small epsilon keeps the division defined when the spread is zero.
    deviation = df["Transaction_Amount"] - baseline
    df["user_z_score"] = deviation / (spread + 0.001)

    # Only amounts above the customer's usual level are flagged.
    df["is_outlier"] = df["user_z_score"].gt(1.5).astype(int)
    return df


# Run the feature-engineering steps in order. savings_attack must come after
# velocity_check because it reads the 'velocity_6h' column created there.
df_clean = (
    df_clean
    .pipe(bankrupt_ratio)
    .pipe(high_amount_night)
    .pipe(velocity_check)
    .pipe(savings_attack)
    .pipe(is_outlier)
)

In [None]:
# Inspect dtypes and non-null counts after feature engineering.
df_clean.info()

In [None]:
# Final clean
df_clean = df_clean.drop(
    columns=[
        'Customer_ID',
        'timestamp'
    ], 
    errors="ignore"
)

# MOVE 'Is_Fraud' TO THE END
# Create a list of all columns except 'Is_Fraud'
cols = [col for col in df_clean.columns if col != 'Is_Fraud']

# Append 'Is_Fraud' to the end of that list
cols.append('Is_Fraud')

# Reorder the dataframe
df_clean = df_clean[cols]

df_clean.columns.to_list()

In [None]:
# Keep only the model inputs, with the target 'Is_Fraud' as the last column.
keep_cols = [
    'Transaction_Amount',
    'Account_Balance',
    'amount_to_balance_ratio',
    'velocity_6h',
    'velocity_24h',
    'is_savings_attack',
    'is_outlier',
    'Is_Fraud',
]

df_clean = df_clean.loc[:, keep_cols]

df_clean.columns.to_list()


In [None]:
# How many rows were flagged as per-customer amount outliers (1) versus not (0).
df_clean['is_outlier'].value_counts()

In [None]:
# Separate the features from the target (the target is the last column).
X = df_clean.iloc[:, :-1]
y = df_clean.iloc[:, -1]

# One seed for the split, the oversampler and the forest so a fresh run reproduces the results.
RANDOM_STATE = 36

# Stratify on the target so the rare fraud class keeps the same share in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Oversample the minority class on the training split only; the test split stays untouched.
smote = SMOTE(k_neighbors=3, random_state=RANDOM_STATE)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# SMOTE interpolates between neighbours, so count/flag features can come back
# fractional (e.g. 1.5). Snap ONLY those back to integers. Rounding the continuous
# features (amount, balance, ratio) would train the model on values that differ
# from the unrounded ones it sees at prediction time.
discrete_cols = [
    col
    for col in ('velocity_6h', 'velocity_24h', 'is_savings_attack', 'is_outlier')
    if col in X_resampled.columns
]
X_resampled[discrete_cols] = X_resampled[discrete_cols].round().astype(int)

rf = RandomForestClassifier(
    n_estimators=200,            # number of trees
    class_weight={0: 1, 1: 25},  # manually weight the fraud class higher
    max_depth=None,              # no depth limit on the trees
    min_samples_leaf=2,          # every leaf must hold at least 2 samples
    n_jobs=-1,                   # use all available cores
    random_state=RANDOM_STATE,
)
rf.fit(X_resampled, y_resampled)

# Persist the fitted model for later use on real data.
joblib.dump(rf, "rfFeaturePruning.joblib")

In [None]:
# Evaluate on the held-out test split.
y_pred = rf.predict(X_test)
score = rf.score(X_test, y_test)
print(score)
print(classification_report(y_test, y_pred, digits=3))

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Maximum number of features to show in the chart.
TOP_N = 15

# 1. Pair every feature name with the importance learned by the trained forest.
importances = rf.feature_importances_
feature_names = X.columns

# 2. Build a frame sorted from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# 3. Plot at most TOP_N features. The title reports how many bars are actually
#    drawn, so it stays correct when the model has fewer features than TOP_N.
top_features = feature_importance_df.head(TOP_N)

plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis')

plt.title(f'Top {len(top_features)} Features for Detecting Fraud')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()

In [None]:
# How many rows were flagged as a savings attack (1) versus not (0).
df_clean["is_savings_attack"].value_counts()