In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import joblib
import logging
from datetime import datetime

# Root logging configuration: every record at INFO level or above is written
# to the file 'training.log' and also to a StreamHandler (default stream),
# both using the same timestamped format.
_log_handlers = [logging.FileHandler('training.log'), logging.StreamHandler()]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)
# Logger named after this module; used by the rest of the script.
logger = logging.getLogger(__name__)

# --- Load the raw transactions from the CSV in the working directory ---
logger.info("Loading dataset...")
data = pd.read_csv('creditcard.csv')

# --- Clean: drop rows with any missing value, then drop exact duplicate rows ---
logger.info("Cleaning data...")
data = data.dropna().drop_duplicates()
logger.info(f"Dataset shape after cleaning: {data.shape}")


# --- Feature engineering: three columns derived from 'Amount' and 'Time' ---
logger.info("Performing feature engineering...")
data = data.assign(
    # log(1 + Amount)
    Log_Amount=np.log1p(data['Amount']),
    # NOTE(review): presumably 'Time' is in seconds, making this an
    # hour-of-day bucket in [0, 24) -- confirm against the dataset docs.
    Hour=(data['Time'] // 3600) % 24,
    # Difference in 'Time' from the previous row; the first row gets 0.
    Time_Diff=data['Time'].diff().fillna(0),
)

# Model inputs: columns V1..V28 plus the three engineered columns.
# The target is the 'Class' column.
v_columns = [f'V{i}' for i in range(1, 29)]
features = [*v_columns, 'Log_Amount', 'Hour', 'Time_Diff']
X = data[features]
y = data['Class']
logger.info(f"Features used: {features}")

# Build the train/test sets without leaking test information into training.
#
# Order matters here:
#   1. Split first, so the held-out rows are real, untouched observations.
#   2. Fit the scaler on the training rows only, then apply it to the test rows.
#   3. Oversample with SMOTE on the training rows only, so no synthetic sample
#      (interpolated from training neighbours) ends up in the test set.
#
# Scaling/oversampling the full dataset before splitting makes the reported
# test metrics optimistic.
#
# NOTE(review): the cross-validation folds used later are still cut from the
# already-oversampled training set; wrapping SMOTE + estimator in an
# imblearn Pipeline would remove that remaining optimism in the CV scores.
logger.info("Splitting data into train/test sets...")
# stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

logger.info("Standardizing features...")
scaler = StandardScaler()
# X_scaled now holds the scaled TRAINING rows only (pre-oversampling).
X_scaled = scaler.fit_transform(X_train_raw)
# Test rows are transformed with statistics learned from the training rows.
X_test = scaler.transform(X_test_raw)
logger.info(f"Feature matrix shape after scaling: {X_scaled.shape}")

logger.info("Applying SMOTE for class imbalance...")
# A float sampling_strategy is the target minority/majority ratio after
# resampling (see the imbalanced-learn SMOTE documentation).
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
logger.info(f"Resampled dataset shape: {X_resampled.shape}")

# The oversampled training rows are what the models are fitted on.
X_train, y_train = X_resampled, y_resampled
logger.info(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# Candidate classifiers, keyed by display name. Each entry holds an unfitted
# estimator under 'model' and, under 'params', the hyper-parameter values the
# randomized search is allowed to sample for it.
models = {}

models['Logistic Regression'] = dict(
    model=LogisticRegression(random_state=42),
    params=dict(
        C=[0.1, 1, 10],
        class_weight=['balanced', None],
    ),
)

models['Random Forest'] = dict(
    model=RandomForestClassifier(random_state=42),
    params=dict(
        n_estimators=[50, 100],
        max_depth=[10, 20],
        min_samples_split=[2, 5],
    ),
)

models['XGBoost'] = dict(
    model=XGBClassifier(random_state=42, eval_metric='logloss'),
    params=dict(
        n_estimators=[50, 100],
        max_depth=[3, 6],
        learning_rate=[0.05, 0.1],
    ),
)

models['LightGBM'] = dict(
    model=LGBMClassifier(random_state=42, verbose=-1),
    params=dict(
        n_estimators=[50, 100],
        max_depth=[3, 6],
        learning_rate=[0.05, 0.1],
    ),
)

# Tune each candidate with a randomized search, evaluate the tuned estimator
# on the held-out test set, record its metrics in `results`, and remember the
# estimator with the highest test F1 in `best_model` / `best_model_name`.
best_model = None
best_f1 = 0
best_model_name = ''
results = []

for name, config in models.items():
    logger.info(f"Training {name}...")
    start_time = datetime.now()

    # Up to 5 sampled parameter combinations, 3-fold CV, selected by F1.
    search = RandomizedSearchCV(
        config['model'],
        config['params'],
        n_iter=5,
        cv=3,
        scoring='f1',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train)

    # Best configuration, refitted on the whole training set by the search.
    model = search.best_estimator_

    logger.info(f"Evaluating {name} on test set...")
    y_pred = model.predict(X_test)
    # ROC-AUC is a ranking metric and must be computed from continuous scores;
    # feeding it hard 0/1 predictions collapses the curve to a single point.
    # Column 1 of predict_proba is the probability of the second class.
    # NOTE(review): assumes labels are {0, 1} with 1 as the positive class,
    # the same assumption precision/recall/F1 already make by default.
    y_score = model.predict_proba(X_test)[:, 1]

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    results.append({
        'Model': name,
        'Best Parameters': search.best_params_,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    })

    # Always accept the first evaluated model so `best_model` cannot stay None
    # (and later be pickled as None) when every candidate scores F1 == 0.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = name

    end_time = datetime.now()
    logger.info(f"{name} completed in {(end_time - start_time).total_seconds()} seconds")

# Log a per-model summary of the collected test-set metrics.
logger.info("\nModel Comparison:")
for entry in results:
    logger.info(f"\n{entry['Model']}:")
    logger.info(f"Best Parameters: {entry['Best Parameters']}")
    # Each metric is logged as "<name>: <value to 4 decimals>".
    for metric in ('Precision', 'Recall', 'F1-Score', 'ROC-AUC'):
        logger.info(f"{metric}: {entry[metric]:.4f}")

logger.info(f"\nBest Model: {best_model_name} with F1-Score: {best_f1:.4f}")

# Persist the selected estimator together with the fitted scaler; both files
# are written to the current working directory.
logger.info("Saving best model and scaler...")
artifacts = (
    (best_model, 'best_fraud_detection_model.pkl'),
    (scaler, 'scaler.pkl'),
)
for obj, path in artifacts:
    joblib.dump(obj, path)
logger.info("Training completed successfully!")