In [3]:
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [None]:
import pandas as pd
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,transaction_id,transaction_amount,daily_transaction_count,avg_transaction_amount_7d,failed_transaction_count_7d,transaction_distance,risk_score,fraud_label,account_balance_category,transaction_type,...,population_category,state,authentication_method,authentication_method_category,auth_date_from,auth_date_to,merchant_category,merchant_type,ip_address_flag,previous_fraudulent_activity
0,TXN_0,95.97,14,36.25,0,4610.43,0.3773,0,Above Average,ATM Withdrawal,...,Mega City,Japan,Pin,Insecure,2025-05-15 11:14:21,,Restaurants,Luxury,0,0
1,TXN_1,55.41,7,435.42,1,2284.15,0.5871,0,Above Average,ATM Withdrawal,...,Mega City,United Kingdom,Pin,Insecure,2025-05-15 11:14:21,,Groceries,Everyday,0,0
2,TXN_10,283.26,1,363.07,0,4273.45,0.6237,0,Above Average,POS,...,Mega City,India,Otp,Secure,2025-05-15 11:14:21,,Travel,Luxury,0,1
3,TXN_100,57.24,14,89.36,0,3446.99,0.0785,0,Above Average,ATM Withdrawal,...,Large City,India,Biometric,Secure,2025-05-15 11:14:21,,Clothing,Consumable,0,0
4,TXN_1000,58.43,3,21.28,0,2968.75,0.1635,0,Above Average,ATM Withdrawal,...,Large City,United States,Otp,Secure,2025-05-15 11:14:21,,Electronics,Consumable,0,0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Osnovne informacije
print(df.info(), '\n')
print(df.describe(include='all'), '\n')

# Provjera nedostajućih vrijednosti
print(df.isnull().sum(), '\n')

# Distribucija target varijable
print(df['fraud_label'].value_counts(normalize=True), '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   transaction_id                  50000 non-null  object 
 1   transaction_amount              50000 non-null  float64
 2   daily_transaction_count         50000 non-null  int64  
 3   avg_transaction_amount_7d       50000 non-null  float64
 4   failed_transaction_count_7d     50000 non-null  int64  
 5   transaction_distance            50000 non-null  float64
 6   risk_score                      50000 non-null  float64
 7   fraud_label                     50000 non-null  int64  
 8   account_balance_category        50000 non-null  object 
 9   transaction_type                50000 non-null  object 
 10  year                            50000 non-null  int64  
 11  month                           50000 non-null  int64  
 12  day                             

In [None]:
!pip install --upgrade xgboost



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score,
    classification_report, average_precision_score
)
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier

# 1. Učitavanje podataka
df = pd.read_csv('dataset.csv')

# 2. Pretvaranje kategorijskih podataka u numeričke
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# 3. Definiranje X i y
X = df.drop(columns=[
    'fraud_label', 'transaction_id', 'user_id',
    'kartica_date_from', 'kartica_date_to',
    'auth_date_from', 'auth_date_to', 'risk_score'
])
y = df['fraud_label']

# 4. Podjela na train/val/test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# 5. Izračun scale_pos_weight na originalnom train skupu
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# 6. Definiranje SMOTE tehnika
smote_variants = {
    'SMOTE': SMOTE(random_state=42),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42),
    'SVMSMOTE': SVMSMOTE(random_state=42)
}

# 7. Hiperparametri za pretragu
param_grid = {
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [5, 7],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__subsample': [0.7, 1.0],
    'xgb__colsample_bytree': [0.7, 1.0],
    'xgb__gamma': [0, 1],
    'xgb__min_child_weight': [3, 5]
}

best_score = 0
best_model = None
best_config = {}

# 8. Evaluacija prema svim SMOTE tehnikama
for smote_name, smote_instance in smote_variants.items():
    print(f"\nEvaluacija za {smote_name}")

    pipeline = Pipeline([
        ('smote', smote_instance),
        ('under', RandomUnderSampler(random_state=42)),
        ('xgb', XGBClassifier(scale_pos_weight=scale_pos_weight, eval_metric='logloss', random_state=42))
    ])

    scoring = {
        'roc_auc': 'roc_auc',
        'f1': 'f1',
        'recall': 'recall',
        'precision': 'precision'
    }

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=scoring,
        refit='recall',  # fokus na recall
        cv=3,
        verbose=1,
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    best_rec = grid.best_score_
    if best_rec > best_score:
        best_score = best_rec
        best_model = grid.best_estimator_
        best_config = {
            'smote': smote_name,
            **grid.best_params_,
            'recall': best_rec
        }

# 9. Evaluacija na validation skupu
print("\nNajbolja kombinacija hiperparametara:")
for key, value in best_config.items():
    print(f"{key}: {value}")

# 10. Evaluacija na test skupu
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred = best_model.predict(X_test)

roc_auc_test = roc_auc_score(y_test, y_test_proba)
pr_auc_test = average_precision_score(y_test, y_test_proba)
f1_test = f1_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)

print("\nEvaluacija na TEST skupu")
print(f"ROC AUC:   {roc_auc_test:.4f}")
print(f"PR AUC:    {pr_auc_test:.4f}")
print(f"Recall:    {recall_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"F1 score:  {f1_test:.4f}")
print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred))



Evaluacija za SMOTE
Fitting 3 folds for each of 128 candidates, totalling 384 fits

Evaluacija za BorderlineSMOTE
Fitting 3 folds for each of 128 candidates, totalling 384 fits

Evaluacija za SVMSMOTE
Fitting 3 folds for each of 128 candidates, totalling 384 fits

Najbolja kombinacija hiperparametara:
smote: SMOTE
xgb__colsample_bytree: 0.7
xgb__gamma: 1
xgb__learning_rate: 0.05
xgb__max_depth: 5
xgb__min_child_weight: 5
xgb__n_estimators: 50
xgb__subsample: 0.7
recall: 0.7277249464860042

Evaluacija na TEST skupu
ROC AUC:   0.8138
PR AUC:    0.8124
Recall:    0.7354
Precision: 0.5560
F1 score:  0.6333

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.72      0.78      6787
           1       0.56      0.74      0.63      3213

    accuracy                           0.73     10000
   macro avg       0.70      0.73      0.71     10000
weighted avg       0.76      0.73      0.73     10000



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Ciljna varijabla
y = df['fraud_label']

# Uklanjanje nepotrebnih atributa
X = df.drop(columns=['fraud_label', 'transaction_id', 'user_id', 'kartica_date_from', 'kartica_date_to', 'auth_date_from', 'auth_date_to', 'risk_score'])

# Pretvaranje kategorijskih podataka u numeričke
X = pd.get_dummies(X)

# Podjela na trening (80%) i test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicijalizacija XGBoost klasifikatora
model = XGBClassifier(eval_metric='logloss', random_state=42)

# Treniranje modela
model.fit(X_train, y_train)

# Predikcija vjerojatnosti za test skup
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Predikcija klasa za test skup
y_pred = model.predict(X_test)

# Evaluacija modela
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC score: {auc:.4f}")

print(classification_report(y_test, y_pred))

ROC AUC score: 0.8083
Izvještaj klasifikacije:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6769
           1       0.99      0.63      0.77      3231

    accuracy                           0.88     10000
   macro avg       0.92      0.81      0.84     10000
weighted avg       0.89      0.88      0.87     10000

