In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [None]:

# Load dataset
file_path = "../data/credit_scoring_data.csv"
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
print(df.head())


In [None]:
# Define target variable (FraudResult: 1 - Fraud, 0 - No Fraud)
target = "FraudResult"

# Drop unnecessary columns
X = df.drop(columns=["TransactionId", "CustomerId", "TransactionStartTime", target])
y = df[target]

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Drop high-cardinality columns (like unique IDs) if they are not predictive
high_cardinality_cols = ["TransactionId", "BatchId", "AccountId", "SubscriptionId", "CustomerId"]
X_train = X_train.drop(columns=high_cardinality_cols, errors="ignore")
X_test = X_test.drop(columns=high_cardinality_cols, errors="ignore")

# Apply Label Encoding to categorical columns
label_encoders = {}
for col in categorical_cols:
    if col in X_train.columns:  # Ensure the column was not dropped
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        label_encoders[col] = le  # Store encoders for future use

# Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

# Train models and store results
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    trained_models[name] = model


In [6]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best Parameters
print(f"Best parameters for Random Forest: {grid_search.best_params_}")


Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [7]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }

# Evaluate models
evaluation_results = {name: evaluate_model(model, X_test, y_test) for name, model in trained_models.items()}

# Display results
pd.DataFrame(evaluation_results).T


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC-AUC
Logistic Regression,0.998432,0.8,0.222222,0.347826,0.991332
Decision Tree,0.999843,1.0,0.916667,0.956522,0.98579
Random Forest,0.999791,0.970588,0.916667,0.942857,0.999996
Gradient Boosting,0.999321,0.780488,0.888889,0.831169,0.916423
