In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import warnings
warnings.filterwarnings('ignore')

# Read the raw transaction records from the Kaggle input directory.
# Adjust the path (or the reader) if the data lives elsewhere or is not CSV.
df = pd.read_csv('/kaggle/input/transactions/transactions.csv')

# Parse every date-like column to datetime; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
date_columns = ['transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange']
df = df.assign(**{
    col: pd.to_datetime(df[col], errors='coerce')
    for col in date_columns
})

In [2]:
# Self-assignment: rebinds `df` to the same object, so this cell has no effect.
df = df

In [3]:
# -------------- Feature Engineering --------------

# 1. Velocity of Transactions for Each User
print("Engineering transaction velocity features...")

# Order rows chronologically within each customer so that "previous row in the
# group" means "previous transaction of that customer".
df = df.sort_values(by=['customerId', 'transactionDateTime'])

# Timestamp of the customer's preceding transaction (missing for their first one).
previous_time_by_customer = df.groupby('customerId')['transactionDateTime'].shift(1)
df['prevTransactionTime'] = previous_time_by_customer

# Gap to the preceding transaction, converted from seconds to hours.
elapsed_since_previous = df['transactionDateTime'] - df['prevTransactionTime']
df['timeDelta'] = elapsed_since_previous.dt.total_seconds() / 3600
# Count transactions in the last 24 hours, 7 days
def count_transactions_in_timeframe(group, hours):
    """Count, for every row, the group's transactions in the preceding window.

    For a row with timestamp ``t`` the count covers the rows of ``group`` whose
    ``transactionDateTime`` lies strictly inside ``(t - hours, t)``; the row
    itself and any row stamped exactly ``t`` are excluded.

    Args:
        group: DataFrame with a datetime ``transactionDateTime`` column. Rows
            may be in any order.
        hours: Length of the look-back window, in hours.

    Returns:
        list[int]: One count per row of ``group``, in row order. Rows whose
        timestamp is NaT get 0 and are never counted for other rows.
    """
    times = group['transactionDateTime']
    if isinstance(times.dtype, pd.DatetimeTZDtype):
        # Compare on naive UTC values so NumPy can sort/search them directly.
        times = times.dt.tz_convert(None)

    stamps = times.to_numpy()
    has_time = times.notna().to_numpy()
    ordered = np.sort(stamps[has_time])
    window = pd.Timedelta(hours=hours).to_timedelta64()

    # Two binary searches on the sorted timestamps replace a full scan of the
    # group for every row (O(n log n) instead of O(n^2)).
    # Number of timestamps strictly earlier than each row's own timestamp.
    earlier_than_row = np.searchsorted(ordered, stamps, side='left')
    # Number of timestamps at or before the (exclusive) window start.
    at_or_before_start = np.searchsorted(ordered, stamps - window, side='right')

    # Clip at zero so a non-positive window yields 0 rather than a negative count.
    in_window = np.maximum(earlier_than_row - at_or_before_start, 0)
    return np.where(has_time, in_window, 0).tolist()

# Apply the counting function to each customer group.
# groupby.apply returns one list per customer; explode() flattens those lists
# into one value per transaction. The positional `.values` assignment relies on
# `df` already being ordered by customerId so the flattened order matches the rows.
customer_groups = df.groupby('customerId')
for count_column, window_hours in (('txn_count_24h', 24), ('txn_count_7d', 168)):  # 7*24=168
    counts_per_customer = customer_groups.apply(
        lambda g, h=window_hours: count_transactions_in_timeframe(g, h)
    )
    # explode() yields an object-dtype Series; cast so the feature is a real
    # integer column instead of a column of Python objects.
    df[count_column] = counts_per_customer.explode().astype('int64').values

Engineering transaction velocity features...


In [4]:
# 2. Unusual Spending Spikes
print("Engineering spending pattern features...")

# Per-customer mean and standard deviation of the transaction amount,
# broadcast back onto every row of that customer.
amount_by_customer = df.groupby('customerId')['transactionAmount']
customer_avg_amount = amount_by_customer.transform('mean')
customer_std_amount = amount_by_customer.transform('std')

# z-score of the amount; a zero standard deviation is swapped for 1 so the
# division is always defined.
nonzero_std = customer_std_amount.replace(0, 1)
amount_deviation = df['transactionAmount'] - customer_avg_amount
df['amount_zscore'] = amount_deviation / nonzero_std

# Amount relative to the customer's average (zero averages swapped for 1).
nonzero_avg = customer_avg_amount.replace(0, 1)
df['amount_to_avg_ratio'] = df['transactionAmount'] / nonzero_avg

# Calculate cumulative amount spent in last 24 hours
def sum_amount_in_timeframe(group, hours):
    """Sum, for every row, the group's transaction amounts in the preceding window.

    For a row with timestamp ``t`` the sum covers ``transactionAmount`` of the
    rows of ``group`` whose ``transactionDateTime`` lies strictly inside
    ``(t - hours, t)``; the row itself is excluded. Missing amounts are
    ignored, and an empty window sums to 0.

    Args:
        group: DataFrame with a datetime ``transactionDateTime`` column and a
            numeric ``transactionAmount`` column. Rows may be in any order.
        hours: Length of the look-back window, in hours.

    Returns:
        list: One sum per row of ``group``, in row order. Rows whose timestamp
        is NaT get 0 and never contribute to other rows.
    """
    times = group['transactionDateTime']
    if isinstance(times.dtype, pd.DatetimeTZDtype):
        # Compare on naive UTC values so NumPy can sort/search them directly.
        times = times.dt.tz_convert(None)

    stamps = times.to_numpy()
    amounts = group['transactionAmount'].to_numpy()
    has_time = times.notna().to_numpy()

    # Sort timestamps once (carrying the amounts along) so each row's window is
    # a contiguous slice located by binary search instead of a full-group scan.
    order = np.argsort(stamps[has_time], kind='stable')
    ordered_times = stamps[has_time][order]
    ordered_amounts = amounts[has_time][order]
    window = pd.Timedelta(hours=hours).to_timedelta64()

    # Slice end: first position whose time is >= the row's time (exclusive bound).
    stops = np.searchsorted(ordered_times, stamps, side='left')
    # Slice start: first position whose time is > the window start.
    starts = np.searchsorted(ordered_times, stamps - window, side='right')

    # An empty or inverted slice sums to zero; nansum skips missing amounts.
    return [
        np.nansum(ordered_amounts[start:stop]) if valid else 0
        for valid, start, stop in zip(has_time, starts, stops)
    ]

# Amount each customer spent in the 24 hours before every transaction.
# groupby.apply returns one list per customer and explode() flattens them to
# one value per row; explode() leaves an object-dtype Series, so cast to float
# to store a proper numeric feature column.
amount_24h_per_customer = customer_groups.apply(lambda g: sum_amount_in_timeframe(g, 24))
df['amount_24h'] = amount_24h_per_customer.explode().astype('float64').values

Engineering spending pattern features...


In [5]:
import requests
import time
from urllib.parse import quote

def geocode_merchant(merchant_name, merchant_city, merchant_state, merchant_zip, timeout=10):
    """
    Uses the Nominatim search API to geocode a merchant location.

    The free-text query is built from whichever of the four fields are present;
    missing values (None, NaN/NA, empty or whitespace-only strings, other falsy
    values) are left out. If nothing usable remains, no request is sent.

    Args:
        merchant_name (str): Name of the merchant/business
        merchant_city (str): City where the merchant is located
        merchant_state (str): State where the merchant is located
        merchant_zip (str): ZIP/Postal code of the merchant
        timeout (float, optional): Seconds to wait for the HTTP response
            before giving up. Defaults to 10.

    Returns:
        dict: ``{'latitude': float, 'longitude': float}`` for the first match,
              or None if the query is empty, nothing was found, the server
              answered with a non-200 status, or the request failed.
    """
    # Nominatim requires a valid user agent
    headers = {
        'User-Agent': 'MerchantGeocoder/1.0 (your-email@example.com)'
    }

    # Handle missing data: NaN is truthy, so a plain truthiness check would put
    # the literal text "nan" into the query. Check for missing values explicitly.
    query_parts = []
    for value in (merchant_name, merchant_city, merchant_state, merchant_zip):
        if pd.isna(value) or not value:
            continue
        text = str(value).strip()
        if text:
            query_parts.append(text)

    # Nothing to search for: skip the network round-trip entirely.
    if not query_parts:
        return None

    # Format and URL-encode the search query
    query = ", ".join(query_parts)
    encoded_query = quote(query)

    # Nominatim API endpoint
    url = f"https://nominatim.openstreetmap.org/search?q={encoded_query}&format=json&limit=1"

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return None

        results = response.json()
        if not results:
            return None

        result = results[0]
        print(result['display_name'])
        return {
            'latitude': float(result['lat']),
            'longitude': float(result['lon']),
        }
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller carry on.
        print(f"Error: {str(e)}")
        return None
    finally:
        # Respect Nominatim's usage policy (max 1 request per second),
        # including after failed requests.
        time.sleep(1)

In [None]:
# 3. Geolocation Analysis for Each User
print("Engineering geolocation features...")

# Lookup table of merchant coordinates, filled in by get_merchant_location so
# that each distinct merchant is resolved only once.
# Format: {merchantName_merchantCity_merchantState_merchantZip: (lat, lon)}
merchant_locations = dict()

# Module-level counter, initialised to zero.
counter = 0
def get_merchant_location(row):
    """Return ``(lat, lon)`` for the merchant in ``row``, geocoding on a cache miss.

    The merchant is identified by its name, city, state and ZIP joined with
    underscores. The result of ``geocode_merchant`` is stored in the
    module-level ``merchant_locations`` dict under that key, so each merchant
    is looked up at most once; a failed lookup is stored as ``(None, None)``.

    Args:
        row: Mapping-like object with ``merchantName``, ``merchantCity``,
            ``merchantState`` and ``merchantZip`` entries.

    Returns:
        tuple: ``(latitude, longitude)``, or ``(None, None)`` when geocoding
        returned nothing.
    """
    name, city, state, zip_code = (
        row[field]
        for field in ('merchantName', 'merchantCity', 'merchantState', 'merchantZip')
    )
    merchant_key = f"{name}_{city}_{state}_{zip_code}"

    # Cache hit: reuse the stored coordinates without calling the geocoder.
    if merchant_key in merchant_locations:
        return merchant_locations[merchant_key]

    geocoded = geocode_merchant(name, city, state, zip_code)
    coordinates = (geocoded['latitude'], geocoded['longitude']) if geocoded else (None, None)
    merchant_locations[merchant_key] = coordinates
    return coordinates

# Resolve every row's merchant to a (lat, lon) pair, then unpack the pair into
# separate latitude and longitude columns.
df['merchant_loc'] = df.apply(get_merchant_location, axis=1)
for pair_position, coordinate_column in enumerate(('merchant_lat', 'merchant_lon')):
    df[coordinate_column] = df['merchant_loc'].apply(lambda loc, i=pair_position: loc[i])


Engineering geolocation features...


In [None]:
# Location of each customer's previous transaction.
# Step 1: forward-fill missing coordinates within each customer, so a merchant
#         that could not be geocoded falls back to the last known location.
# Step 2: shift by one row *within each customer*. The shift must be grouped
#         too: an ungrouped shift would hand the first transaction of every
#         customer the last location of the preceding customer in the frame.
filled_lat = df.groupby('customerId')['merchant_lat'].ffill()
filled_lon = df.groupby('customerId')['merchant_lon'].ffill()
df['prev_lat'] = filled_lat.groupby(df['customerId']).shift(1)
df['prev_lon'] = filled_lon.groupby(df['customerId']).shift(1)

In [None]:
# Calculate distance in kilometers
def calculate_distance(row):
    """Geodesic distance in kilometres from the previous to the current merchant.

    Args:
        row: Mapping-like object with ``prev_lat``, ``prev_lon``,
            ``merchant_lat`` and ``merchant_lon`` entries.

    Returns:
        0 if any of the four coordinates is missing, otherwise the
        ``kilometers`` value of ``geodesic(previous, current)``.
    """
    coordinate_fields = ('prev_lat', 'prev_lon', 'merchant_lat', 'merchant_lon')
    # Without both endpoints no distance can be computed; report zero.
    if any(pd.isna(row[field]) for field in coordinate_fields):
        return 0

    previous_point = (row['prev_lat'], row['prev_lon'])
    current_point = (row['merchant_lat'], row['merchant_lon'])
    return geodesic(previous_point, current_point).kilometers

# Distance (km) between each transaction's merchant and the previous location.
df['distance_from_prev_txn'] = df.apply(calculate_distance, axis=1)

# Implied travel speed (km/h): distance over elapsed hours, and 0 wherever the
# elapsed time is not positive (including missing values).
has_elapsed_time = df['timeDelta'] > 0
implied_speed = df['distance_from_prev_txn'] / df['timeDelta']
df['speed_kmph'] = np.where(has_elapsed_time, implied_speed, 0)

# Flag transactions whose merchant country differs from the customer's
# previous transaction.
previous_country = df.groupby('customerId')['merchantCountryCode'].shift(1)
df['prev_country'] = previous_country
country_changed = df['merchantCountryCode'].ne(df['prev_country'])
df['different_country'] = country_changed.astype(int)

In [None]:
# 4. Additional Features
print("Engineering additional features...")

# Binary flags (stored as 0/1 integers)
cvv_entered_correctly = df['cardCVV'].eq(df['enteredCVV'])
df['cvv_match'] = cvv_entered_correctly.astype(int)
df['exp_date_match'] = df['expirationDateKeyInMatch'].astype(int)
acquirer_differs_from_merchant = df['acqCountry'].ne(df['merchantCountryCode'])
df['is_foreign_transaction'] = acquirer_differs_from_merchant.astype(int)

# Transaction amount relative to the credit limit (zero limits swapped for 1
# so the division is always defined)
nonzero_credit_limit = df['creditLimit'].replace(0, 1)
df['amount_to_limit_ratio'] = df['transactionAmount'] / nonzero_credit_limit

# Transaction amount relative to the available money (zeros swapped for 1)
nonzero_available_money = df['availableMoney'].replace(0, 1)
df['amount_to_available_ratio'] = df['transactionAmount'] / nonzero_available_money

# Whole days between the transaction and, respectively, the account opening
# and the last address change
transaction_time = df['transactionDateTime']
df['account_age_days'] = (transaction_time - df['accountOpenDate']).dt.days
df['days_since_address_change'] = (transaction_time - df['dateOfLastAddressChange']).dt.days

# 1 when the merchant category is 'online_retail', else 0
is_online_retail = df['merchantCategoryCode'] == 'online_retail'
df['isOnline'] = np.where(is_online_retail, 1, 0)

# One-hot encode categorical variables (first level of each dropped)
categorical_cols = ['posEntryMode', 'posConditionCode', 'merchantCategoryCode', 'transactionType']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:


# -------------- Data Preparation --------------
print("Preparing data for modeling...")
# Select features for the model
feature_cols = [
    # Transaction velocity features
    'txn_count_24h', 'txn_count_7d', 'timeDelta',

    # Spending pattern features
    'amount_zscore', 'amount_to_avg_ratio', 'amount_24h',

    # Geolocation features
    'distance_from_prev_txn', 'speed_kmph', 'different_country', 'isOnline',

    # Additional features
    'cvv_match', 'exp_date_match', 'is_foreign_transaction',
    'amount_to_limit_ratio', 'amount_to_available_ratio',
    'account_age_days', 'days_since_address_change'
]

# Add the one-hot encoded columns (those named after a categorical column)
categorical_prefixes = tuple(categorical_cols)
feature_cols.extend(col for col in df_encoded.columns if col.startswith(categorical_prefixes))

# Remove rows with NaN values in feature columns
df_clean = df_encoded.dropna(subset=feature_cols)

# Convert target variable to binary (0 for normal, 1 for fraud)
y = df_clean['isFraud'].astype(int)

# Select features
X = df_clean[feature_cols]

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), making the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize the features: statistics come from the training split only and are
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept so the `X_scaled` name defined
# by this cell remains available.
X_scaled = scaler.transform(X)



In [None]:
# Keyword arguments passed to IsolationForest when the model is built.
isof_params = dict(
    n_estimators=400,        # number of trees in the forest
    max_samples='auto',      # per-tree sample size; 'auto' defers to scikit-learn's default rule
    contamination=0.015,     # expected share of anomalies; adjust to the data
    max_features=0.7,        # fraction of features drawn for each tree
    random_state=42,         # seed for reproducible results
    n_jobs=-1,               # use all available cores
)

In [None]:
# -------------- Model Training --------------
print("Training Isolation Forest model...")
# For Isolation Forest, we'll train it on normal transactions only (non-fraud)
is_normal_train = (y_train == 0).to_numpy()
X_train_normal = X_train[is_normal_train]

# Initialize and train the model
# Contamination is set to a small value since fraud is rare
model = IsolationForest(**isof_params)
model.fit(X_train_normal)

# Predict anomalies
# Isolation Forest returns -1 for anomalies and 1 for normal samples
# We convert to 0 for normal and 1 for fraud to match our target
y_pred_train = (model.predict(X_train) == -1).astype(int)
y_pred_test = (model.predict(X_test) == -1).astype(int)

# Calculate anomaly scores
anomaly_scores_test = model.decision_function(X_test)

# Convert scores to a [0, 1] fraud score (lower anomaly score = higher value)
# by inverting a min-max scaling over the test scores. This is a ranking score,
# not a calibrated probability. ndarray.min()/max() are used instead of the
# Python builtins, which would iterate the array element by element.
score_min = anomaly_scores_test.min()
score_range = anomaly_scores_test.max() - score_min
if score_range > 0:
    prob_fraud = 1 - (anomaly_scores_test - score_min) / score_range
else:
    # All scores identical: nothing stands out, and 0/0 would produce NaN.
    prob_fraud = np.zeros_like(anomaly_scores_test, dtype=float)

In [None]:
# -------------- Model Evaluation --------------
print("Evaluating model performance...")
# Calculate precision, recall, F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

# Print confusion matrix
# labels=[0, 1] pins the matrix to 2x2 even if a class is absent from the
# predictions or the test labels, so the four-way unpacking below cannot fail.
cm = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

# Calculate additional metrics for fraud detection
# (guarded so an empty class gives 0.0 instead of a division by zero)
tn, fp, fn, tp = cm.ravel()
fraud_detection_rate = tp / (tp + fn) if (tp + fn) else 0.0
false_positive_rate = fp / (fp + tn) if (fp + tn) else 0.0

print(f"\nFraud Detection Rate (Recall): {fraud_detection_rate:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"Precision (proportion of flagged transactions that are actually fraud): {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

# -------------- Visualization --------------
# 1. Confusion Matrix Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# 2. Precision-Recall Curve
from sklearn.metrics import auc, precision_recall_curve
precision_curve, recall_curve, thresholds = precision_recall_curve(y_test, prob_fraud)

# Area under the precision-recall curve. The product of the single-threshold
# precision and recall is not an area and must not be reported as "AUC".
pr_auc = auc(recall_curve, precision_curve)

plt.figure(figsize=(8, 6))
plt.plot(recall_curve, precision_curve, marker='.', label=f'Isolation Forest (AUC={pr_auc:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.savefig('precision_recall_curve.png')
plt.close()

# 3. Feature Importance
# Isolation Forest doesn't provide feature importance directly, so we estimate it
# by permuting one feature at a time and measuring the mean change in the anomaly score
def calculate_feature_importance(model, X, random_state=42):
    """Estimate per-feature importance by permuting one column at a time.

    For each feature, its column is randomly permuted, the samples are
    re-scored with ``model.score_samples`` and the importance is the drop in
    the mean score relative to the unpermuted data.

    Args:
        model: Fitted estimator exposing ``score_samples(X)``.
        X: 2-D array-like of shape (n_samples, n_features). It is not modified.
        random_state: Seed (or ``None`` for a fresh random state) for the
            permutations, so repeated calls give the same result by default.

    Returns:
        numpy.ndarray: One value per feature,
        ``mean(original scores) - mean(scores with that feature permuted)``.
    """
    X = np.asarray(X)
    rng = np.random.default_rng(random_state)
    n_features = X.shape[1]
    feature_importances = np.zeros(n_features)

    # The baseline does not depend on which feature is permuted: score it once
    # instead of once per feature.
    baseline_mean = np.mean(model.score_samples(X))

    # A single working copy is reused; each column is restored after scoring,
    # which avoids copying the whole matrix for every feature.
    X_permuted = X.copy()
    for i in range(n_features):
        original_column = X_permuted[:, i].copy()
        X_permuted[:, i] = rng.permutation(original_column)

        # Importance is the mean decrease in score when permuting the feature.
        permuted_mean = np.mean(model.score_samples(X_permuted))
        feature_importances[i] = baseline_mean - permuted_mean

        X_permuted[:, i] = original_column

    return feature_importances

# Score every feature on the held-out data.
feature_importance = calculate_feature_importance(model, X_test)

# Rank features from least to most important so the largest bar ends up on top.
sorted_idx = np.argsort(feature_importance)
feature_names = np.array(feature_cols)

# Horizontal bar chart, written to disk and then closed.
bar_positions = range(len(sorted_idx))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, feature_importance[sorted_idx])
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance for Fraud Detection')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

In [None]:
# -------------- Save the model --------------
from joblib import dump

# Persist the fitted estimator and the fitted scaler next to the notebook.
for artifact, filename in (
    (model, 'isolation_forest_fraud_model.joblib'),
    (scaler, 'feature_scaler.joblib'),
):
    dump(artifact, filename)

print("\nModel training and evaluation complete.")
print("Model saved as 'isolation_forest_fraud_model.joblib'")
print("Scaler saved as 'feature_scaler.joblib'")

# -------------- Example Function for Prediction --------------
def predict_fraud(transaction_data, model, scaler, feature_cols):
    """
    Predict if a transaction is fraudulent

    Parameters:
    transaction_data: DataFrame row with transaction information
    model: Trained Isolation Forest model
    scaler: Fitted StandardScaler
    feature_cols: List of feature column names

    Returns:
    is_fraud: Boolean indicating fraud prediction
    fraud_probability: Estimated probability of fraud
    """
    # Extract features
    features = transaction_data[feature_cols].values.reshape(1, -1)

    # Scale features
    scaled_features = scaler.transform(features)

    # Get anomaly score
    anomaly_score = model.decision_function(scaled_features)[0]

    # Convert score to probability (lower score = higher probability of fraud)
    # This is a simple conversion for demonstration purposes
    fraud_probability = 1 - (anomaly_score + 0.5)  # Adjust range to [0,1]
    fraud_probability = max(0, min(1, fraud_probability))  # Clip to [0,1]

    # Predict class
    is_fraud = model.predict(scaled_features)[0] == -1