# Updated Backend Modules for Fraud Detection (Colab Version)

This notebook contains the updated backend modules that load the click dataset, preprocess it, train the models, and run inference. The code expects a dataset with the following columns:

- click_id
- timestamp
- user_id
- ip_address
- device_type
- browser
- operating_system
- referrer_url
- page_url
- click_duration
- scroll_depth
- mouse_movement
- keystrokes_detected
- ad_position
- click_frequency
- time_since_last_click
- device_ip_reputation
- VPN_usage
- proxy_usage
- bot_likelihood_score
- is_fraudulent

All file paths are resolved relative to the current working directory (`os.getcwd()`), because `__file__` is not defined inside a Colab notebook.

In [None]:
import pandas as pd
import numpy as np
import os

def load_raw_data(file_path='data/click_dataset.csv'):
    """Read the click dataset and keep only rows with a parseable timestamp.

    ``file_path`` is joined onto the current working directory (``__file__``
    is not defined in a Colab notebook).  The ``timestamp`` column is parsed
    with the fixed ``%Y-%m-%d %H:%M:%S`` format; values that do not match
    become ``NaT`` and their rows are discarded.

    Returns:
        The loaded DataFrame with ``timestamp`` converted to datetimes and
        the original row index preserved for the surviving rows.
    """
    csv_path = os.path.join(os.getcwd(), file_path)
    clicks = pd.read_csv(csv_path)

    # Unparseable timestamps are coerced to NaT so they can be filtered out.
    clicks['timestamp'] = pd.to_datetime(
        clicks['timestamp'],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )

    return clicks.dropna(subset=['timestamp'])

def feature_engineering(df):
    """Add temporal features and coerce model inputs to numeric types.

    The frame is modified in place and also returned.

    Steps:
      * derive ``hour``, ``day_of_week`` and ``is_weekend`` from the
        datetime ``timestamp`` column;
      * map ``device_ip_reputation`` ("Good" -> 0, "Suspicious" -> 1) when
        that column is present; any other value becomes NaN;
      * coerce ``VPN_usage`` and ``proxy_usage`` to numbers (unparseable
        values become NaN and are left as such);
      * coerce the behavioural numeric columns to numbers and replace
        missing values with the column median.

    Args:
        df: DataFrame whose ``timestamp`` column already has a datetime dtype.

    Returns:
        The same DataFrame object, with the columns above added/converted.
    """
    # Derive temporal features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Convert device_ip_reputation: map "Good" to 0 and "Suspicious" to 1
    if 'device_ip_reputation' in df.columns:
        df['device_ip_reputation'] = df['device_ip_reputation'].map({'Good': 0, 'Suspicious': 1})

    # Ensure VPN_usage and proxy_usage are numeric
    df['VPN_usage'] = pd.to_numeric(df['VPN_usage'], errors='coerce')
    df['proxy_usage'] = pd.to_numeric(df['proxy_usage'], errors='coerce')

    # For numeric features, convert to numbers and fill missing values with the median
    numeric_cols = ['click_duration', 'scroll_depth', 'mouse_movement', 'keystrokes_detected',
                    'click_frequency', 'time_since_last_click', 'bot_likelihood_score']
    for col in numeric_cols:
        values = pd.to_numeric(df[col], errors='coerce')
        # Assign the filled Series back rather than calling
        # ``df[col].fillna(..., inplace=True)``: that chained in-place call
        # operates on a temporary object, is deprecated in pandas 2.x and
        # does not update ``df`` when Copy-on-Write is enabled.
        df[col] = values.fillna(values.median())

    return df

def preprocess_data():
    """Build the training feature matrix and target vector.

    Loads the raw CSV via ``load_raw_data``, applies ``feature_engineering``,
    removes identifier/URL/timestamp columns that are not used for training,
    splits off the ``is_fraudulent`` target and one-hot encodes the
    categorical columns (dropping the first level of each).

    Returns:
        A ``(X, y)`` tuple: the encoded feature DataFrame and the integer
        ``is_fraudulent`` Series.
    """
    data = feature_engineering(load_raw_data())

    # Identifier-like columns are discarded only when they are present.
    unused = ('click_id', 'user_id', 'ip_address', 'timestamp', 'referrer_url', 'page_url')
    present_unused = [name for name in unused if name in data.columns]
    data = data.drop(columns=present_unused)

    # Split features from the target label.
    features = data.drop(columns=['is_fraudulent'])
    target = data['is_fraudulent'].astype(int)

    # One-hot encode the categorical columns.
    encoded = pd.get_dummies(
        features,
        columns=['device_type', 'browser', 'operating_system', 'ad_position'],
        drop_first=True,
    )

    return encoded, target

# Entry point for the preprocessing cell: when __name__ is '__main__', build
# the feature matrix / target and report the feature matrix shape.  X and y
# are bound at module (notebook) level.
if __name__ == '__main__':
    X, y = preprocess_data()
    print('Processed data shape:', X.shape)

## Model Training Module

In [None]:
import os
import json
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Using the preprocess_data function from the previous cell
def load_data():
    """Return a stratified 80/20 train/test split of the preprocessed data.

    Relies on ``preprocess_data`` (defined in the preprocessing cell) for the
    features and labels.  The split is seeded with ``random_state=42`` and
    stratified on the labels.

    Returns:
        The list produced by ``train_test_split``:
        ``X_train, X_test, y_train, y_test``.
    """
    features, labels = preprocess_data()
    split = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )
    return split

def evaluate_model(model, X_test, y_test, model_type='', threshold=0.5):
    """Score a fitted model on the test set and print a summary.

    Positive-class scores are obtained from ``model.predict`` when
    ``model_type`` is ``'Neural Network'`` and from
    ``model.predict_proba(...)[:, 1]`` otherwise.  Scores strictly above
    ``threshold`` are labelled as class 1.

    Args:
        model: Fitted estimator (Keras-style for ``'Neural Network'``,
            otherwise anything exposing ``predict_proba``).
        X_test: Test features.
        y_test: True test labels.
        model_type: Display name; also selects the scoring branch.
        threshold: Decision threshold applied to the positive-class score.

    Returns:
        ``(accuracy, report)`` where ``report`` is the dict form of
        ``classification_report``.
    """
    if model_type == 'Neural Network':
        # One-hot columns may be boolean (pandas >= 2 default), which makes
        # the frame convert to an object array; cast so Keras receives
        # floats.  The (n_samples, 1) output is flattened so it lines up
        # with y_test.
        scores = model.predict(X_test.astype('float32')).ravel()
    else:
        scores = model.predict_proba(X_test)[:, 1]
    y_pred = (scores > threshold).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Every branch now has probability scores, so ROC AUC is reported for
    # the neural network as well.
    auc = roc_auc_score(y_test, scores)
    print(f'{model_type} Accuracy: {accuracy:.4f}, ROC AUC: {auc:.4f}')
    print(classification_report(y_test, y_pred))
    return accuracy, report

def train_random_forest(X_train, y_train):
    """Fit a random forest on a SMOTE-resampled copy of the training data.

    The training set is resampled with ``SMOTE(random_state=42)`` before a
    300-tree, depth-20 ``RandomForestClassifier`` is fitted on it using all
    available cores.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)

    forest = RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_balanced, y_balanced)
    return forest

def train_xgboost(X_train, y_train):
    """Fit an XGBoost classifier on a randomly under-sampled training set.

    The training data is resampled with
    ``RandomUnderSampler(random_state=42)`` and a 300-round classifier
    (learning rate 0.05, depth 6, AUC eval metric) is fitted on the result.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    balancer = RandomUnderSampler(random_state=42)
    X_balanced, y_balanced = balancer.fit_resample(X_train, y_train)

    params = dict(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        use_label_encoder=False,
        eval_metric='auc',
    )
    booster = xgb.XGBClassifier(**params)
    booster.fit(X_balanced, y_balanced)
    return booster

def train_neural_network(X_train, y_train):
    """Train a dense binary classifier on SMOTE-resampled training data.

    The network is three ReLU layers (256, 128, 64 units), each followed by
    batch normalisation and dropout (0.4, 0.3, 0.2), with a single sigmoid
    output.  It is compiled with Adam (learning rate 0.001) and binary
    cross-entropy, then trained for 30 epochs with batch size 64 and a 20%
    validation split.

    Args:
        X_train: Training features (all columns must be numeric or boolean).
        y_train: Binary training labels.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    # One-hot columns may be boolean (the pandas >= 2 get_dummies default).
    # A frame mixing bool and float columns converts to an object array,
    # which TensorFlow cannot turn into a tensor, so cast to float32 first.
    X_res = X_res.astype('float32')

    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_res, y_res, epochs=30, batch_size=64, validation_split=0.2, verbose=1)
    return model

def save_model(model, filename):
    """Persist ``model`` under the current working directory.

    ``filename`` is joined onto ``os.getcwd()`` (``__file__`` is not
    available in Colab) and the parent directory is created when missing.
    Objects exposing a ``save`` method are written through that method;
    everything else is pickled.

    Args:
        model: The object to persist.
        filename: Destination path, relative to the working directory (an
            absolute path is used as-is by ``os.path.join``).
    """
    full_filename = os.path.join(os.getcwd(), filename)
    dir_name = os.path.dirname(full_filename)
    if dir_name:
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(dir_name, exist_ok=True)

    if hasattr(model, 'save'):
        model.save(full_filename)
    else:
        with open(full_filename, 'wb') as f:
            pickle.dump(model, f)
    print(f'Model saved to {full_filename}')

def train_and_save_models():
    """Train, evaluate and persist every model, then write their scores.

    For the random forest, XGBoost and neural network (in that order) the
    model is trained on the training split, evaluated on the test split and
    saved under ``models/``.  The accuracy and classification report of each
    model are collected and dumped to ``models/model_scores.json``.
    """
    X_train, X_test, y_train, y_test = load_data()

    models_dir = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # (score key, display name, training function, output file name)
    pipeline = (
        ('random_forest', 'Random Forest', train_random_forest, 'random_forest.pkl'),
        ('xgboost', 'XGBoost', train_xgboost, 'xgboost.pkl'),
        ('neural_network', 'Neural Network', train_neural_network, 'neural_network.h5'),
    )

    scores = {}
    for key, label, trainer, out_name in pipeline:
        fitted = trainer(X_train, y_train)
        accuracy, report = evaluate_model(fitted, X_test, y_test, label)
        save_model(fitted, os.path.join('models', out_name))
        scores[key] = {'accuracy': accuracy, 'report': report}

    scores_path = os.path.join(models_dir, 'model_scores.json')
    with open(scores_path, 'w') as f:
        json.dump(scores, f)
    print('Model scores saved.')

# Entry point for the training cell: when __name__ is '__main__', train,
# evaluate and save all models.
if __name__ == '__main__':
    train_and_save_models()

## Inference Module

In [None]:
import os
import pickle
import tensorflow as tf
import pandas as pd

def load_models():
    """Load the three persisted models from ``<cwd>/models``.

    The random forest and XGBoost models are unpickled from
    ``random_forest.pkl`` and ``xgboost.pkl``; the neural network is loaded
    from ``neural_network.h5`` with ``tf.keras.models.load_model``.

    Returns:
        A dict keyed by ``'random_forest'``, ``'xgboost'`` and
        ``'neural_network'``.
    """
    # In Colab, use os.getcwd() to set the base directory
    models_dir = os.path.join(os.getcwd(), 'models')

    models = {}
    # NOTE(security): unpickling executes arbitrary code; only load model
    # files that this project produced itself.
    for name in ('random_forest', 'xgboost'):
        # A context manager closes the file handle, which the previous
        # ``pickle.load(open(...))`` form left open.
        with open(os.path.join(models_dir, f'{name}.pkl'), 'rb') as f:
            models[name] = pickle.load(f)

    models['neural_network'] = tf.keras.models.load_model(
        os.path.join(models_dir, 'neural_network.h5')
    )
    return models

def preprocess_input(data, feature_columns=None):
    """Turn one raw click record into a single-row feature DataFrame.

    Mirrors the training-time preprocessing: temporal features are derived
    from ``timestamp``, identifier/URL columns are dropped,
    ``device_ip_reputation`` is mapped to 0/1, numeric columns are coerced
    to numbers and the categorical columns are one-hot encoded.

    Unlike training, the one-hot encoding keeps every level
    (``drop_first=False``): with a single row each categorical column has
    exactly one level, so dropping the first level would remove every dummy
    column.  Pass ``feature_columns`` to obtain exactly the training layout.

    Args:
        data: Mapping of raw column name to value for one click.
        feature_columns: Optional ordered column names used at training
            time.  When given, the result is reindexed to these columns;
            missing ones are filled with 0 and extra ones are discarded.

    Returns:
        A one-row DataFrame of model features.
    """
    df = pd.DataFrame([data])

    if 'timestamp' in df.columns:
        stamp = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
        df['hour'] = stamp.dt.hour
        df['day_of_week'] = stamp.dt.dayofweek
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Same non-feature columns that training drops, plus the label in case
    # a labelled record is passed in.
    non_features = ['click_id', 'user_id', 'ip_address', 'timestamp',
                    'referrer_url', 'page_url', 'is_fraudulent']
    df = df.drop(columns=[col for col in non_features if col in df.columns])

    if 'device_ip_reputation' in df.columns:
        df['device_ip_reputation'] = df['device_ip_reputation'].map({'Good': 0, 'Suspicious': 1})

    numeric_cols = ['VPN_usage', 'proxy_usage', 'click_duration', 'scroll_depth',
                    'mouse_movement', 'keystrokes_detected', 'click_frequency',
                    'time_since_last_click', 'bot_likelihood_score']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    categorical_features = [
        col for col in ['device_type', 'browser', 'operating_system', 'ad_position']
        if col in df.columns
    ]
    if categorical_features:
        df = pd.get_dummies(df, columns=categorical_features, drop_first=False, dtype=int)

    if feature_columns is not None:
        df = df.reindex(columns=list(feature_columns), fill_value=0)
    return df

def predict(input_data, models=None):
    """Run every model on one raw click record.

    Args:
        input_data: Mapping of raw column name to value for a single click.
        models: Optional dict of already-loaded models keyed like the result
            of ``load_models``.  When omitted the models are loaded from
            disk, as before.

    Returns:
        A dict mapping each model key to its integer prediction, or to
        ``None`` when that model failed (the failure is logged).
    """
    import logging

    logger = logging.getLogger(__name__)

    if models is None:
        models = load_models()
    processed_data = preprocess_input(input_data)

    # Models fitted on a DataFrame may expose the training column layout as
    # ``feature_names_in_``.  The first such layout found is the fallback
    # for models that do not record one.
    reference_columns = None
    for candidate in models.values():
        names = getattr(candidate, 'feature_names_in_', None)
        if names is not None:
            reference_columns = list(names)
            break

    predictions = {}
    for key, model in models.items():
        try:
            names = getattr(model, 'feature_names_in_', None)
            columns = list(names) if names is not None else reference_columns

            features = processed_data
            if columns is not None:
                # Align to the training layout: missing columns become 0 and
                # unknown columns are discarded.
                features = processed_data.reindex(columns=columns, fill_value=0)

            if key == 'neural_network':
                # Cast so a frame containing boolean/int columns is passed
                # to Keras as a float array.
                pred = (model.predict(features.astype('float32')) > 0.5).astype('int32')[0][0]
            else:
                pred = model.predict(features)[0]
            predictions[key] = int(pred)
        except Exception:
            # Best-effort per model: keep returning None for a failed model,
            # but record why instead of silently discarding the error.
            logger.exception('Prediction failed for model %r', key)
            predictions[key] = None
    return predictions

# Entry point for the inference cell: when __name__ is '__main__', run every
# model on one hand-written click record and print the predictions.
if __name__ == '__main__':
    # One raw record using the dataset's column names; it carries no
    # 'is_fraudulent' label.
    sample_data = {
        'click_id': 'sample',
        'timestamp': '2024-08-23 02:47:39',
        'user_id': 'sample_user',
        'ip_address': '141.36.49.37',
        'device_type': 'Tablet',
        'browser': 'Safari',
        'operating_system': 'Android',
        'referrer_url': 'https://evans-ford.com/',
        'page_url': 'http://www.turner-stewart.com/',
        'click_duration': 0.29,
        'scroll_depth': 60,
        'mouse_movement': 111,
        'keystrokes_detected': 8,
        'ad_position': 'Bottom',
        'click_frequency': 7,
        'time_since_last_click': 72,
        'device_ip_reputation': 'Good',
        'VPN_usage': 0,
        'proxy_usage': 1,
        'bot_likelihood_score': 0.29
    }
    # predict() returns a dict of model key -> prediction (None on failure).
    result = predict(sample_data)
    print('Inference results:', result)