In [None]:
import pandas as pd
import numpy as np
import joblib 
import optuna
import optuna_dashboard
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,classification_report,confusion_matrix

In [3]:
data=pd.read_csv("processdata.csv")

In [4]:
data.value_counts()

TRANSACTION_ID  CUSTOMER_ID  TX_YEAR  TX_MONTH  TX_DAY  TERMINAL_ID  TX_AMOUNT  ABOVE220  FRAUD_SCORE  FRAUD
1754154         3542         2018     9         30      9849         23.59      0         4            0        1
0               596          2018     4         1       3156         57.16      0         17           0        1
1               4961         2018     4         1       3412         81.51      0         1            0        1
2               2            2018     4         1       1365         146.00     0         1            0        1
3               4128         2018     4         1       8737         64.49      0         0            0        1
                                                                                                               ..
15              3842         2018     4         1       1693         26.23      0         2            0        1
14              2989         2018     4         1       4111         28.42      0         0  

In [5]:
x=data.iloc[:,:-1]
y=data['FRAUD']
ru=RandomOverSampler()
x,y=ru.fit_resample(x,y)
y.value_counts()

FRAUD
0    1739474
1    1739474
Name: count, dtype: int64

In [6]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=42)

In [7]:
model=XGBClassifier(n_estimators= 500,
 max_depth= 10,
 learning_rate =0.29098703077029825,
 subsample= 0.7509206786221526,
 colsample_bytree= 0.7530342131605683)
model.fit(xtrain,ytrain)

In [8]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = XGBClassifier(**params, tree_method='hist', use_label_encoder=False, eval_metric='logloss')
    model.fit(xtrain, ytrain)
    preds = model.predict(xtest)
    return accuracy_score(ytest, preds)

study = optuna.create_study(direction="maximize",study_name="dtr_study",storage="sqlite:///dtr_study.db",load_if_exists=True)
study.optimize(objective, n_trials=50)

[I 2025-04-21 13:16:28,247] Using an existing study with name 'dtr_study' instead of creating a new one.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-04-21 13:16:52,827] Trial 1 finished with value: 0.9911228004618252 and parameters: {'n_estimators': 154, 'max_depth': 8, 'learning_rate': 0.2898581379099978, 'subsample': 0.8894118211854558, 'colsample_bytree': 0.5794819524788524}. Best is trial 1 with value: 0.9911228004618252.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[W 2025-04-21 13:16:58,800] Trial 2 failed with parameters: {'n_estimators': 482, 'max_depth': 7, 'learning_rate': 0.15621106135731688, 'subsample': 0.8717271367138897, 'colsample_bytree': 0.6027560152284559} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Avijit\Desktop\fraud_detection\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_t

KeyboardInterrupt: 

In [139]:
optuna_dashboard.run_server("sqlite:///dtr_study.db")

Bottle v0.13.2 server starting up (using WSGIRefServer())...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.

127.0.0.1 - - [20/Apr/2025 16:21:50] "GET / HTTP/1.1" 302 0
127.0.0.1 - - [20/Apr/2025 16:21:50] "GET /dashboard HTTP/1.1" 200 4145
127.0.0.1 - - [20/Apr/2025 16:21:51] "GET /static/bundle.js HTTP/1.1" 200 4140872
127.0.0.1 - - [20/Apr/2025 16:21:52] "GET /api/studies HTTP/1.1" 200 137
127.0.0.1 - - [20/Apr/2025 16:21:52] "GET /favicon.ico HTTP/1.1" 200 7670
  study, target=target, evaluator=PedAnovaImportanceEvaluator()
127.0.0.1 - - [20/Apr/2025 16:21:53] "GET /api/studies/1/param_importances?evaluator=ped_anova HTTP/1.1" 200 510
127.0.0.1 - - [20/Apr/2025 16:21:53] "GET /api/studies/1?after=0 HTTP/1.1" 200 81536
127.0.0.1 - - [20/Apr/2025 16:21:53] "GET /api/meta HTTP/1.1" 200 64
127.0.0.1 - - [20/Apr/2025 16:21:54] "GET /api/studies/1/param_importances?evaluator=ped_anova HTTP/1.1" 200 510
127.0.0.1 - - [20/Apr/2025 16:22:04] "GET /api/studies/1?after=50 HTTP/1.1" 2

In [17]:
ypred=model.predict(xtest)
print("ACCURACY SCORE: ",accuracy_score(ytest,ypred)*100)
print("PRECISION SCORE: ",precision_score(ytest,ypred)*100)
print("F1 SCORE: ",f1_score(ytest,ypred)*100)
print("RECALL SCORE: ",recall_score(ytest,ypred)*100)
print("confusion_matrix: \n",confusion_matrix(ytest,ypred))
print("CLASSIFICATION REPORT:\n",classification_report(ytest,ypred))

ACCURACY SCORE:  99.88856791081601
PRECISION SCORE:  99.7776532959123
F1 SCORE:  99.88870291525632
RECALL SCORE:  100.0
confusion_matrix: 
 [[520628   1163]
 [     0 521894]]
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    521791
           1       1.00      1.00      1.00    521894

    accuracy                           1.00   1043685
   macro avg       1.00      1.00      1.00   1043685
weighted avg       1.00      1.00      1.00   1043685



In [18]:
joblib.dump(model,r"C:\Users\Avijit\Desktop\fraud_detection\src\test\model.pkl")

['C:\\Users\\Avijit\\Desktop\\fraud_detection\\src\\test\\model.pkl']