In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the dataset
#
# The directory holding the two CSV files is defined once. It defaults to the
# original local path, and can be redirected on another machine by setting the
# FRAUD_DATA_DIR environment variable before starting the kernel.
DATA_DIR = Path(
    os.environ.get(
        "FRAUD_DATA_DIR",
        "/Users/ankelovset/Documents/Master/2. Semester/INFO381 - Artificial Intelligence/Project381/archive (1)",
    )
)

# Kept as plain strings so any later cell that uses these names sees the same type as before.
training = str(DATA_DIR / "fraudTrain.csv")
testing = str(DATA_DIR / "fraudTest.csv")

df_train = pd.read_csv(training)
df_test = pd.read_csv(testing)


In [None]:
# Selecting relevant features
# Note: the list also carries the target column "is_fraud"; it is split off
# into y_train / y_test further down.
features = ["trans_date_trans_time", "category", "amt", "gender", "zip", "city_pop", "job", "dob", "merch_lat", "merch_long", "is_fraud"]

# .copy() makes the selections independent frames. Without it, the column
# assignments in the following cells write into a slice of the loaded data and
# pandas emits SettingWithCopyWarning.
df_train = df_train[features].copy()
df_test = df_test[features].copy()

In [None]:
# Parse the two timestamp-like columns into datetime64 in both frames,
# so they can be subtracted from each other when deriving the age feature.
for frame in (df_train, df_test):
    for date_col in ("trans_date_trans_time", "dob"):
        frame[date_col] = pd.to_datetime(frame[date_col])

In [None]:
# Create age feature
def _age_in_years(event_time, birth_date):
    """Return the age in completed calendar years at ``event_time``.

    Both arguments are datetime64 Series aligned on the same index. The age is
    the difference in calendar years, minus one when the birthday has not yet
    occurred in the event's year. This replaces ``days // 365``, which ignores
    leap days and therefore reports an age one year too high during the days
    just before a birthday. Rows where either date is missing yield NaN.
    """
    birthday_still_ahead = (event_time.dt.month < birth_date.dt.month) | (
        (event_time.dt.month == birth_date.dt.month)
        & (event_time.dt.day < birth_date.dt.day)
    )
    return event_time.dt.year - birth_date.dt.year - birthday_still_ahead.astype(int)


df_train["age"] = _age_in_years(df_train["trans_date_trans_time"], df_train["dob"])
df_test["age"] = _age_in_years(df_test["trans_date_trans_time"], df_test["dob"])

# The raw date columns are only needed to derive the age; drop them afterwards.
# Reassignment instead of inplace=True keeps the step explicit about what it produces.
df_train = df_train.drop(columns=["dob", "trans_date_trans_time"])
df_test = df_test.drop(columns=["dob", "trans_date_trans_time"])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns. Each encoder is fitted on the
# training data only and kept in `label_encoders` for later reuse.
label_encoders = {}
categorical_cols = ["category", "gender", "job"]

for col in categorical_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])

    # LabelEncoder assigns each label its position in le.classes_, so a dict
    # built once per column reproduces le.transform with a single hash lookup
    # per row (instead of calling le.transform and scanning le.classes_ for
    # every row of the test set).
    code_by_label = {label: code for code, label in enumerate(le.classes_)}

    # Labels not seen during training map to NaN and are then set to -1.
    df_test[col] = df_test[col].map(code_by_label).fillna(-1).astype("int64")

    label_encoders[col] = le

In [None]:
# Handle missing values
# Medians are computed on the training data only and reused for the test data,
# so the test set is imputed with the same statistics the model is trained on
# (previously the test set was filled with its own medians).
train_medians = df_train.median(numeric_only=True)
df_train = df_train.fillna(train_medians)
df_test = df_test.fillna(train_medians)

In [None]:
# Separate the predictors (X) from the label column (y) for both data sets.
target_col = "is_fraud"
X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]

In [None]:
# Standardize the numerical columns. The scaler learns its statistics from
# the training set only and the same transformation is applied to both sets.
numerical_cols = ["amt", "zip", "city_pop", "merch_lat", "merch_long", "age"]
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [None]:
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain together.
model = LogisticRegression(random_state=42, max_iter=500).fit(X_train, y_train)

# Hard class predictions for the held-out test set.
y_pred = model.predict(X_test)

In [None]:
# Score the logistic-regression predictions on the test set:
# overall accuracy first, then the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

In [None]:
# Bar chart of the absolute logistic-regression coefficients, one bar per
# input column, used here as a proxy for feature importance.
coef_magnitudes = np.abs(model.coef_).ravel()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=coef_magnitudes, y=X_train.columns, ax=ax)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("Feature Importance in Fraud Detection (Logistic Regression)")
plt.show()

In [None]:
from skrules import SkopeRules
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Step 1: Split the columns of X_train by dtype.
# NOTE(review): when the cells above have run in order, "category", "gender"
# and "job" were already replaced by integer codes, so the object-dtype
# selection is presumably empty here - confirm whether one-hot encoding of
# those columns was intended.
categorical_cols = X_train.select_dtypes(include="object").columns
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

In [None]:
# Step 2: Build the preprocessing transformer.
# Columns listed in `categorical_cols` are one-hot encoded (categories unseen
# at fit time are ignored at transform time); every other column is passed
# through unchanged.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', one_hot, categorical_cols)],
    remainder='passthrough',
)

In [None]:
# Learn the preprocessing from the training data only, then apply the fitted
# transformer to both the training and the test features.
preprocessor.fit(X_train)
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [None]:
# Step 3: Configure and fit the SkopeRules model.
# Feature names are taken from the fitted preprocessor so they line up with
# the columns of the transformed matrices.
skope_params = {
    "feature_names": preprocessor.get_feature_names_out(),
    "precision_min": 0.5,
    "recall_min": 0.01,
    "n_estimators": 30,
    "random_state": 42,
}
skope = SkopeRules(**skope_params)

skope.fit(X_train_transformed, y_train)

In [None]:
# Step 4: Predict on the transformed test set and print the per-class metrics.
y_pred_skope = skope.predict(X_test_transformed)

skope_report = classification_report(y_test, y_pred_skope)
print("SkopeRules Classification Report:", skope_report, sep="\n")

In [None]:
# Confusion matrix of the SkopeRules predictions, rendered as a heat map.
cm = confusion_matrix(y_test, y_pred_skope)
class_names = ["Not Fraud", "Fraud"]
disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
disp.plot(cmap="Oranges")