In [22]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # Added this import
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score, f1_score, 
                            classification_report, confusion_matrix)
import joblib
import os
import warnings
warnings.filterwarnings('ignore')
# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [None]:
# 2. Load and preprocess data
def load_and_preprocess_data(file_path):
    # Load the data
    df = pd.read_csv(file_path)
    
    # Print initial info
    print("Initial data shape:", df.shape)
    print("\nColumns in the dataset:")
    print(df.columns.tolist())
    
    # Check for missing values
    print("\nMissing values per column:")
    print(df.isnull().sum())
    
    # Check for target variable
    target_col = None
    for col in ['class', 'Class']:
        if col in df.columns:
            target_col = col
            break
    
    if not target_col:
        raise ValueError("No target column ('class' or 'Class') found in the dataset")
    
    print(f"\nUsing '{target_col}' as the target variable")
    
    # Convert target to numeric, coerce errors to NaN
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
    
    # Drop rows where target is NaN
    initial_rows = len(df)
    df = df.dropna(subset=[target_col])
    dropped_rows = initial_rows - len(df)
    print(f"\nDropped {dropped_rows} rows with NaN in target column")
    
    # Convert target to int (0 and 1)
    df[target_col] = df[target_col].astype(int)
    
    # Check class distribution
    print("\nClass distribution after cleaning:")
    print(df[target_col].value_counts(normalize=True).mul(100).round(2))
    
    # Convert datetime columns if they exist
    datetime_cols = ['signup_time', 'purchase_time']
    for col in datetime_cols:
        if col in df.columns:
            print(f"\nProcessing datetime column: {col}")
            df[col] = pd.to_datetime(df[col], errors='coerce')
            # Extract useful datetime features
            df[f'{col}_hour'] = df[col].dt.hour
            df[f'{col}_dayofweek'] = df[col].dt.dayofweek
            df[f'{col}_dayofmonth'] = df[col].dt.day
            df[f'{col}_month'] = df[col].dt.month
    
    # Drop original datetime columns and any other non-numeric columns that might cause issues
    cols_to_drop = datetime_cols + [col for col in df.columns if df[col].dtype == 'object' and col != target_col]
    df = df.drop(columns=cols_to_drop, errors='ignore')
    
    # Check for any remaining non-numeric columns
    non_numeric_cols = [col for col in df.columns if df[col].dtype == 'object' and col != target_col]
    if non_numeric_cols:
        print(f"\nWarning: Found non-numeric columns that will be dropped: {non_numeric_cols}")
        df = df.drop(columns=non_numeric_cols)
    
    # Fill any remaining NaN values with median for numeric columns
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.difference([target_col])
    if df[numeric_cols].isnull().any().any():
        print("\nFilling remaining NaN values with median...")
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    
    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    
    return X, y

In [None]:

# 3. Load the data
data_paths = [
    '../data/processed/fraud_processed.csv',
    '../data/rw/Fraud_Data.csv',
    '../data/raw/creditcard.csv'
]
X, y = None, None
for path in data_paths:
    try:
        print(f"\nTrying to load data from: {path}")
        X, y = load_and_preprocess_data(path)
        print(f"Successfully loaded data from {path}")
        break
    except Exception as e:
        print(f"Error loading {path}: {str(e)}")
if X is None or y is None:
    raise FileNotFoundError("Could not load any of the data files. Please check the file paths.")

In [18]:

# 4. Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")
# 5. Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Numeric columns: ['user_id', 'purchase_value', 'age', 'lower_bound_ip_address', 'upper_bound_ip_address', 'time_since_signup', 'hour_of_day', 'day_of_week', 'user_transaction_count', 'time_since_last_txn', 'txn_within_1hr']
Categorical columns: ['source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_M', 'country_Albania', 'country_Algeria', 'country_Angola', 'country_Antigua and Barbuda', 'country_Argentina', 'country_Armenia', 'country_Australia', 'country_Austria', 'country_Azerbaijan', 'country_Bahamas', 'country_Bahrain', 'country_Bangladesh', 'country_Barbados', 'country_Belarus', 'country_Belgium', 'country_Belize', 'country_Benin', 'country_Bermuda', 'country_Bhutan', 'country_Bolivia', 'country_Bonaire; Sint Eustatius; Saba', 'country_Bosnia and Herzegowina', 'country_Botswana', 'country_Brazil', 'country_British Indian Ocean Territory', 'country_Brunei Darussalam', 'country_Bulgaria', 'country_Burkina Faso', 'country_Burundi', '

In [23]:
# 6. Create preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [31]:
# Create a mask for non-NaN values
mask = ~y_train.isna()

# Apply the mask to both X and y
X_train_clean = X_train[mask]
y_train_clean = y_train[mask].astype(int)  # Convert to int for classification

# Now run cross-validation on the cleaned data
cv_scores = cross_val_score(
    lr_pipeline, 
    X_train_clean, 
    y_train_clean, 
    cv=cv, 
    scoring='average_precision',
    n_jobs=-1
)

print(f"Cross-validated PR-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

Cross-validated PR-AUC: 0.0957 (+/- 0.0027)


In [1]:
# 7. Data Splitting
print("\n=== Data Splitting ===")
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )
    print("Data split successfully")
    print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
    print(f"Fraud rate in training: {y_train.mean():.4f}")
    print(f"Fraud rate in test: {y_test.mean():.4f}")
except Exception as e:
    print(f"Error during train-test split: {e}")
    print("\nTrying without stratification...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )
    print("Data split successfully (without stratification)")
    print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
    print(f"Fraud rate in training: {y_train.mean():.4f}")
    print(f"Fraud rate in test: {y_test.mean():.4f}")


=== Data Splitting ===
Error during train-test split: name 'train_test_split' is not defined

Trying without stratification...


NameError: name 'train_test_split' is not defined

In [26]:
# 8. Model Pipeline
def create_model_pipeline(estimator):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator)
    ])

In [30]:
print("NaN values in y_train:", y_train.isna().sum())

NaN values in y_train: 2062


In [29]:
print("\n=== Training Logistic Regression ===")
lr_pipeline = create_model_pipeline(
    LogisticRegression(
        class_weight='balanced',
        random_state=RANDOM_STATE,
        max_iter=1000
    )
)
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(
    lr_pipeline, X_train, y_train, 
    cv=cv, 
    scoring='average_precision',
    n_jobs=-1
)
print(f"Cross-validated PR-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")



=== Training Logistic Regression ===


ValueError: Input y contains NaN.