In [1]:
import sklearn, numpy, pandas as pd
# Report the library versions this notebook was executed with, one per line.
for label, module in (("sklearn:", sklearn), ("numpy  :", numpy)):
    print(label, module.__version__)


sklearn: 1.4.2
numpy  : 1.26.4


In [2]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Location of the merged dataset.
# The original workstation path stays as the default so existing runs behave
# the same; set the FRAUD_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DEFAULT_DATA_PATH = r"C:\Users\anton\OneDrive\Pulpit\Mastercard\Master\merged.csv"
data_path = Path(os.environ.get("FRAUD_DATA_PATH", DEFAULT_DATA_PATH))

df = pd.read_csv(data_path)

# Binary label column; every other column is used as a feature.
target = "is_fraud"
y = df[target]
X = df.drop(columns=[target])   # all remaining columns as features

# 80/20 hold-out split, stratified on the label so that both parts keep the
# same fraud rate; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train:", X_train.shape, "| Test:", X_test.shape)
# Note: the values printed below are the mean of the label, i.e. a fraction
# in [0, 1] (0.085 means 8.5 %), not a percentage.
print("% fraud train:", y_train.mean().round(3), "| test:", y_test.mean().round(3))


Train: (400000, 30) | Test: (100000, 30)
% fraud train: 0.085 | test: 0.085


In [3]:
# %% ----------- HistGradientBoosting baseline --------------------------
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

# 1. Column definitions: split the features by dtype.
num_cols = X_train.select_dtypes(["int64", "float64"]).columns.tolist()
cat_cols = X_train.select_dtypes(["object"]).columns.tolist()

# The ColumnTransformer below uses remainder="drop", so any column that
# matches neither dtype filter (e.g. bool or narrower integer dtypes) would
# vanish from the model without notice. Surface that explicitly.
selected_cols = set(num_cols) | set(cat_cols)
dropped_cols = [col for col in X_train.columns if col not in selected_cols]
if dropped_cols:
    warnings.warn(
        f"{len(dropped_cols)} column(s) match neither the numeric nor the object "
        f"dtype filter and will be dropped before modelling: {dropped_cols}"
    )

# 2. Preprocessing: standard scaling for numeric columns, ordinal codes for
#    object columns; categories unseen during fit are encoded as -1.
preprocess = ColumnTransformer(
    [("num", StandardScaler(), num_cols),
     ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols)],
    remainder="drop",
)

# 3. Model
# NOTE(review): the ordinal codes reach the model as ordinary ordered numbers;
# HistGradientBoostingClassifier has a `categorical_features` option that may
# suit them better if the column cardinalities allow it - confirm.
hgb = HistGradientBoostingClassifier(
    learning_rate=0.15,
    max_depth=6,
    max_iter=200,
    class_weight={0:1, 1:10},      # mild up-weighting of the fraud class
    random_state=42
)

# Fit preprocessing and model together so the transformers are learned on the
# training split only.
pipe = make_pipeline(preprocess, hgb)
pipe.fit(X_train, y_train)

# 4. Metrics on the hold-out split, computed from the predicted probability
#    of class 1 (second column of predict_proba).
proba_test = pipe.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba_test)
ap  = average_precision_score(y_test, proba_test)

print(f"AUC = {auc:.3f}")
print(f"AP  = {ap:.3f}")


AUC = 0.567
AP  = 0.103


In [4]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test split after thresholding the predicted
# probabilities at 0.5 (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test,
        y_pred=(proba_test >= 0.5).astype(int),
    )
)


[[55541 35977]
 [ 4349  4133]]
