In [25]:
import pandas as pd

import warnings
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ignore warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('DatosPrueba.csv', delimiter=';')
df = df.drop(columns=['Unnamed: 0.1'], axis=1)
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,450,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,462306,-1121138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47034331,-112561071,0
1,2019-01-01 00:06:23,4642894980163,fraud_Rutherford-Mertz,grocery_pos,2474,Eddie,Mendez,M,1831 Faith View Suite 653,Clarinda,...,407491,-95038,7297,IT trainer,1990-07-13,d71c95ab6b7356dd74389d41df429c87,1325376383,40275891,-96011548,0
2,2019-01-01 00:21:32,4334230547694630,fraud_Bruen-Yost,misc_pos,685,Scott,Martin,M,7483 Navarro Flats,Freedom,...,430172,-1110292,471,"Education officer, museum",1967-08-02,f3c43d336e92a44fc2fb67058d5949e3,1325377292,43753735,-111454923,0
3,2019-01-01 00:22:44,630412733309,fraud_Torphy-Goyette,shopping_pos,6621,Heather,Stanton,F,445 Jerry Lights Apt. 081,Republic,...,46368,-879938,1038,Armed forces training and education officer,1964-04-22,20f048d3907dbb9978e23bee7b7578ce,1325377364,46412038,-88516663,0
4,2019-01-01 00:23:58,374125201044065,"fraud_Bahringer, Schoen and Corkery",shopping_pos,903,Christopher,Gilbert,M,20937 Reed Lakes Apt. 271,Washington,...,389757,-770282,601723,"Optician, dispensing",1970-07-20,c733711c521c41c578f4a964d8350df0,1325377438,38880898,-7644111600000002,0


# Feature Engineering

In [3]:
# Ubah amt menjadi float
df['amt'] = df['amt'].str.replace(',', '.').astype(float)

In [4]:
# Memisahkan kolom numerik dengan kategorik
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

Numerical columns: Index(['cc_num', 'amt', 'zip', 'city_pop', 'unix_time', 'is_fraud'], dtype='object')
Categorical columns: Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'lat', 'long', 'job', 'dob',
       'trans_num', 'merch_lat', 'merch_long'],
      dtype='object')


In [5]:
# Filtering numerical columns

num_cols = num_cols[num_cols == 'amt']
print("Updated Numerical columns:", num_cols)

Updated Numerical columns: Index(['amt'], dtype='object')


In [6]:
# Menghapus kolom yang tidak diperlukan dari cat_cols
cat_cols = cat_cols.drop(['trans_date_trans_time', 'first', 'last', 'merch_lat', 'merch_long', 'lat', 'long', 'state', 'trans_num', 'gender'])
print("Updated Categorical columns:", cat_cols)

Updated Categorical columns: Index(['merchant', 'category', 'street', 'city', 'job', 'dob'], dtype='object')


In [7]:
# Cek keseimbangan data label
df['is_fraud'].value_counts()

is_fraud
0    193375
1      7506
Name: count, dtype: int64

In [8]:
# train test split
x = df.drop(columns=['is_fraud'])
y = df['is_fraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Menerapkan undersampling
rus = RandomUnderSampler(random_state=42)
x_train_under, y_train_under = rus.fit_resample(x_train, y_train)

# Modeling

In [10]:
from category_encoders import TargetEncoder

# Membuat transformer untuk numerik dan kategorikal
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), num_cols),  # Scaling untuk fitur numerik
        ('cat', TargetEncoder(cols=cat_cols), cat_cols)  # Encoding untuk fitur kategorikal
    ]
)

# Membuat pipeline
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # Model classifier
])

# Pipeline siap digunakan
pipeline_rf

In [11]:
# Membuat pipeline
pipeline_lg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))  # Model classifier
])

# Pipeline siap digunakan
pipeline_lg

# Training & Eval

In [12]:
# Melatih model Random Forest
pipeline_rf.fit(x_train_under, y_train_under)
y_train_pred_rf = pipeline_rf.predict(x_train_under)
y_test_pred_rf = pipeline_rf.predict(x_test)

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation untuk Random Forest
cv_scores_rf = cross_val_score(pipeline_rf, x_train_under, y_train_under, cv=5, scoring='recall')
print("Cross-validation recall scores for Random Forest:", cv_scores_rf)
print("Mean recall score for Random Forest:", cv_scores_rf.mean())
print("STD recall score for Random Forest:", cv_scores_rf.std())

Cross-validation recall scores for Random Forest: [0.9254717  0.92830189 0.91509434 0.92735849 0.92641509]
Mean recall score for Random Forest: 0.9245283018867925
STD recall score for Random Forest: 0.004810395767540354


In [13]:
# Evaluasi model Random Forest
print("Random Forest - Train Classification Report:")
print(classification_report(y_train_under, y_train_pred_rf))
print("\nRandom Forest - Test Classification Report:")
print(classification_report(y_test, y_test_pred_rf))

Random Forest - Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5300
           1       1.00      1.00      1.00      5300

    accuracy                           1.00     10600
   macro avg       1.00      1.00      1.00     10600
weighted avg       1.00      1.00      1.00     10600


Random Forest - Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97     58059
           1       0.39      0.93      0.55      2206

    accuracy                           0.94     60265
   macro avg       0.69      0.94      0.76     60265
weighted avg       0.97      0.94      0.95     60265



In [14]:
# Melatih model Logistic Regression
pipeline_lg.fit(x_train_under, y_train_under)
y_train_pred_lg = pipeline_lg.predict(x_train_under)
y_test_pred_lg = pipeline_lg.predict(x_test)

In [22]:
# Cross-validation untuk Logistic Regression
cv_scores_lg = cross_val_score(pipeline_lg, x_train_under, y_train_under, cv=5, scoring='recall')
print("Cross-validation recall scores for Logistic Regression:", cv_scores_lg)
print("Mean recall score for Logistic Regression:", cv_scores_lg.mean())
print("STD recall score for Logistic Regression:", cv_scores_lg.std())

Cross-validation recall scores for Logistic Regression: [0.76132075 0.79339623 0.7754717  0.79528302 0.78490566]
Mean recall score for Logistic Regression: 0.7820754716981133
STD recall score for Logistic Regression: 0.012529779408214328


In [15]:
# Evaluasi model Logistic Regression
print("\nLogistic Regression - Train Classification Report:")
print(classification_report(y_train_under, y_train_pred_lg))
print("\nLogistic Regression - Test Classification Report:")
print(classification_report(y_test, y_test_pred_lg))


Logistic Regression - Train Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.87      5300
           1       0.90      0.80      0.85      5300

    accuracy                           0.86     10600
   macro avg       0.86      0.86      0.86     10600
weighted avg       0.86      0.86      0.86     10600


Logistic Regression - Test Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     58059
           1       0.18      0.78      0.29      2206

    accuracy                           0.86     60265
   macro avg       0.58      0.82      0.61     60265
weighted avg       0.96      0.86      0.90     60265



Pakai recall karena dalam kasus fraud lebih berbahaya memprediksi sebuah transaksi terjadi tidak fraud tetapi sebenarnya fraud yang menghasilkan kerugian pada bisnis. Dari hasil evaluasi, Random Forest memiliki nilai performa terbaik tetapi tidak sestabil Logistic Regression yang hanya memiliki perbedaan 2% antara train dengan test.

In [27]:
# Membuat dataframe untuk komparasi
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Mean Recall': [cv_scores_lg.mean(), cv_scores_rf.mean()],
    'STD Recall': [cv_scores_lg.std(), cv_scores_rf.std()]
})

# Menampilkan dataframe
comparison_df

Unnamed: 0,Model,Mean Recall,STD Recall
0,Logistic Regression,0.782075,0.01253
1,Random Forest,0.924528,0.00481


Random Forest menghasilkan nilai performa tertinggi dengan stabilitas uang sangat tinggi.

# Hyperparam Tuning

In [16]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning untuk Logistic Regression
param_grid_lg = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],  # 'saga' mendukung l1, l2, dan elasticnet
    'classifier__l1_ratio': [0.1, 0.5, 0.9]  # Hanya digunakan untuk elasticnet
}

grid_search_lg = GridSearchCV(pipeline_lg, param_grid_lg, scoring='recall', cv=5, n_jobs=-1)
grid_search_lg.fit(x_train_under, y_train_under)

print("Best parameters for Logistic Regression:", grid_search_lg.best_params_)
print("Best recall score for Logistic Regression:", grid_search_lg.best_score_)

# Hyperparameter tuning untuk Random Forest
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', None]
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, scoring='recall', cv=5, n_jobs=-1)
grid_search_rf.fit(x_train_under, y_train_under)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best recall score for Random Forest:", grid_search_rf.best_score_)



Best parameters for Logistic Regression: {'classifier__C': 100, 'classifier__l1_ratio': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Best recall score for Logistic Regression: 0.7864150943396228
Best parameters for Random Forest: {'classifier__max_depth': 10, 'classifier__max_features': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best recall score for Random Forest: 0.9515094339622641


In [26]:
# Cross-validation untuk Logistic Regression yang sudah di-tune
cv_scores_lg_tuned = cross_val_score(grid_search_lg.best_estimator_, x_train_under, y_train_under, cv=5, scoring='recall')
print("Cross-validation recall scores for Tuned Logistic Regression:", cv_scores_lg_tuned)
print("Mean recall score for Tuned Logistic Regression:", cv_scores_lg_tuned.mean())
print("STD recall score for Tuned Logistic Regression:", cv_scores_lg_tuned.std())

Cross-validation recall scores for Tuned Logistic Regression: [0.77075472 0.79150943 0.78396226 0.79811321 0.78773585]
Mean recall score for Tuned Logistic Regression: 0.7864150943396228
STD recall score for Tuned Logistic Regression: 0.009119279980071388


In [17]:
# Classification report untuk Logistic Regression
print("Logistic Regression - Train Classification Report (Tuned):")
y_train_pred_lg_tuned = grid_search_lg.best_estimator_.predict(x_train_under)
print(classification_report(y_train_under, y_train_pred_lg_tuned))

print("\nLogistic Regression - Test Classification Report (Tuned):")
y_test_pred_lg_tuned = grid_search_lg.best_estimator_.predict(x_test)
print(classification_report(y_test, y_test_pred_lg_tuned))

Logistic Regression - Train Classification Report (Tuned):
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      5300
           1       0.89      0.81      0.85      5300

    accuracy                           0.86     10600
   macro avg       0.86      0.86      0.86     10600
weighted avg       0.86      0.86      0.86     10600


Logistic Regression - Test Classification Report (Tuned):
              precision    recall  f1-score   support

           0       0.99      0.85      0.91     58059
           1       0.16      0.78      0.27      2206

    accuracy                           0.85     60265
   macro avg       0.58      0.82      0.59     60265
weighted avg       0.96      0.85      0.89     60265



In [28]:
# Cross-validation untuk Random Forest yang sudah di-tune
cv_scores_rf_tuned = cross_val_score(grid_search_rf.best_estimator_, x_train_under, y_train_under, cv=5, scoring='recall')
print("Cross-validation recall scores for Tuned Random Forest:", cv_scores_rf_tuned)
print("Mean recall score for Tuned Random Forest:", cv_scores_rf_tuned.mean())
print("STD recall score for Tuned Random Forest:", cv_scores_rf_tuned.std())

Cross-validation recall scores for Tuned Random Forest: [0.95849057 0.95566038 0.93207547 0.96037736 0.9509434 ]
Mean recall score for Tuned Random Forest: 0.9515094339622641
STD recall score for Tuned Random Forest: 0.010223560138976647


In [18]:
# Classification report untuk Random Forest
print("\nRandom Forest - Train Classification Report (Tuned):")
y_train_pred_rf_tuned = grid_search_rf.best_estimator_.predict(x_train_under)
print(classification_report(y_train_under, y_train_pred_rf_tuned))

print("\nRandom Forest - Test Classification Report (Tuned):")
y_test_pred_rf_tuned = grid_search_rf.best_estimator_.predict(x_test)
print(classification_report(y_test, y_test_pred_rf_tuned))


Random Forest - Train Classification Report (Tuned):
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5300
           1       0.98      0.98      0.98      5300

    accuracy                           0.98     10600
   macro avg       0.98      0.98      0.98     10600
weighted avg       0.98      0.98      0.98     10600


Random Forest - Test Classification Report (Tuned):
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     58059
           1       0.42      0.96      0.58      2206

    accuracy                           0.95     60265
   macro avg       0.71      0.95      0.78     60265
weighted avg       0.98      0.95      0.96     60265



Dari hasil tuning yang telah dilakukan, model Random Forest menghasilkan nilai performa terbaik dari kedua model sehingga model Random Forest yang akan digunakan untuk inferens

In [30]:
# Membuat dataframe untuk perbandingan cross-validation sebelum dan setelah tuning
comparison_cv_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Mean Recall (Before Tuning)': [cv_scores_lg.mean(), cv_scores_rf.mean()],
    'Mean Recall (After Tuning)': [cv_scores_lg_tuned.mean(), cv_scores_rf_tuned.mean()],
    'STD Recall (Before Tuning)': [cv_scores_lg.std(), cv_scores_rf.std()],
    'STD Recall (After Tuning)': [cv_scores_lg_tuned.std(), cv_scores_rf_tuned.std()]
})

# Menampilkan dataframe
comparison_cv_df

Unnamed: 0,Model,Mean Recall (Before Tuning),Mean Recall (After Tuning),STD Recall (Before Tuning),STD Recall (After Tuning)
0,Logistic Regression,0.782075,0.786415,0.01253,0.009119
1,Random Forest,0.924528,0.951509,0.00481,0.010224


Dari hasil yang ada, model Random Forest mengalami kenaikan performa tetapi stabilitas menurun sementara Logistic Regression tidak mengalami kenaikan dalam performa tetapi menjadi lebih stabil.

In [19]:
# Save the tuned Random Forest model
import joblib
joblib.dump(grid_search_rf.best_estimator_, 'model_rf_tuned.pkl')

['model_rf_tuned.pkl']