In [34]:
import pandas as pd

# import encoders and scalers
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [48]:
df = pd.read_csv('DatosPrueba.csv', delimiter=';')
df = df.drop(columns=['Unnamed: 0.1'], axis=1)
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,450,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,462306,-1121138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47034331,-112561071,0
1,2019-01-01 00:06:23,4642894980163,fraud_Rutherford-Mertz,grocery_pos,2474,Eddie,Mendez,M,1831 Faith View Suite 653,Clarinda,...,407491,-95038,7297,IT trainer,1990-07-13,d71c95ab6b7356dd74389d41df429c87,1325376383,40275891,-96011548,0
2,2019-01-01 00:21:32,4334230547694630,fraud_Bruen-Yost,misc_pos,685,Scott,Martin,M,7483 Navarro Flats,Freedom,...,430172,-1110292,471,"Education officer, museum",1967-08-02,f3c43d336e92a44fc2fb67058d5949e3,1325377292,43753735,-111454923,0
3,2019-01-01 00:22:44,630412733309,fraud_Torphy-Goyette,shopping_pos,6621,Heather,Stanton,F,445 Jerry Lights Apt. 081,Republic,...,46368,-879938,1038,Armed forces training and education officer,1964-04-22,20f048d3907dbb9978e23bee7b7578ce,1325377364,46412038,-88516663,0
4,2019-01-01 00:23:58,374125201044065,"fraud_Bahringer, Schoen and Corkery",shopping_pos,903,Christopher,Gilbert,M,20937 Reed Lakes Apt. 271,Washington,...,389757,-770282,601723,"Optician, dispensing",1970-07-20,c733711c521c41c578f4a964d8350df0,1325377438,38880898,-7644111600000002,0


In [49]:
# Ubah amt menjadi float
df['amt'] = df['amt'].str.replace(',', '.').astype(float)

In [51]:
# Memisahkan kolom numerik dengan kategorik
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

Numerical columns: Index(['cc_num', 'amt', 'zip', 'city_pop', 'unix_time', 'is_fraud'], dtype='object')
Categorical columns: Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'lat', 'long', 'job', 'dob',
       'trans_num', 'merch_lat', 'merch_long'],
      dtype='object')


In [None]:
# Filtering numerical columns

num_cols = num_cols[num_cols == 'amt']
print("Updated Numerical columns:", num_cols)

In [None]:
# Menghapus kolom Is Fraudulent karena berupa kolom label pada num_cols
num_cols = num_cols.drop('is_fraud')
print("Updated Numerical columns:", num_cols)

Updated Numerical columns: Index(['Transaction Amount', 'Quantity', 'Customer Age', 'Account Age Days',
       'Transaction Hour'],
      dtype='object')


In [15]:
# Cek skew pada kolom numerik
df[num_cols].skew().sort_values(ascending=False)

Transaction Amount    6.696741
Transaction Hour      0.033876
Account Age Days      0.023367
Quantity              0.005508
Customer Age         -0.013832
dtype: float64

In [16]:
# Menghapus kolom yang tidak diperlukan seperti Transaction ID dan Customer ID pada cat_cols
cat_cols = cat_cols.drop(['Transaction ID', 'Customer ID'])
print("Updated Categorical columns:", cat_cols)

Updated Categorical columns: Index(['Transaction Date', 'Payment Method', 'Product Category',
       'Customer Location', 'Device Used', 'IP Address', 'Shipping Address',
       'Billing Address'],
      dtype='object')


In [17]:
# Cek kardinalitas pada kolom kategorik

df[cat_cols].nunique()

Transaction Date     23607
Payment Method           4
Product Category         5
Customer Location    14868
Device Used              3
IP Address           23634
Shipping Address     23634
Billing Address      23634
dtype: int64

In [18]:
# Cek keseimbangan data label
df['Is Fraudulent'].value_counts()

Is Fraudulent
0    22412
1     1222
Name: count, dtype: int64

In [42]:
df[df['Quantity']<=0]

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour


In [30]:
# train test split
x = df.drop(columns=['Is Fraudulent'])
y = df['Is Fraudulent']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [43]:
from imblearn.under_sampling import RandomUnderSampler

# Melakukan undersampling
rus = RandomUnderSampler(random_state=42)
x_train_under, y_train_under = rus.fit_resample(x_train, y_train)

# Menampilkan jumlah data setelah undersampling
print("Jumlah data setelah undersampling:")
print(y_train_under.value_counts())

Jumlah data setelah undersampling:
Is Fraudulent
0    980
1    980
Name: count, dtype: int64


In [32]:
# Memilih fitur numerik yang berupa Transaction Amount, Account Age Days, dan Transaction Hour

num_cols_sel = ['Transaction Amount', 'Account Age Days', 'Transaction Hour']

In [35]:
# ColumnTransformer untuk memilih hanya fitur pada num_cols_sel
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols_sel)  # Memilih kolom numerik yang diinginkan
    ]
)

# Pipeline untuk Logistic Regression
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing dengan ColumnTransformer
    ('scaler', RobustScaler()),      # Standarisasi data numerik dengan RobustScaler
    ('logreg', LogisticRegression(random_state=42))
])

# Pipeline untuk Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing dengan ColumnTransformer
    ('scaler', RobustScaler()),      # Standarisasi data numerik dengan RobustScaler
    ('rf', RandomForestClassifier(random_state=42))
])

# Menampilkan pipeline
print("Logistic Regression Pipeline:", logreg_pipeline)
print("Random Forest Pipeline:", rf_pipeline)

Logistic Regression Pipeline: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['Transaction Amount',
                                                   'Account Age Days',
                                                   'Transaction Hour'])])),
                ('scaler', RobustScaler()),
                ('logreg', LogisticRegression(random_state=42))])
Random Forest Pipeline: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['Transaction Amount',
                                                   'Account Age Days',
                                                   'Transaction Hour'])])),
                ('scaler', RobustScaler()),
                ('rf', RandomForestClassifier(random_state=42))])


In [45]:
# Melatih Logistic Regression
logreg_pipeline.fit(x_train_under, y_train_under)

# Melatih Random Forest
rf_pipeline.fit(x_train_under, y_train_under)

# Evaluasi Logistic Regression pada data train
logreg_train_score = logreg_pipeline.score(x_train_under, y_train_under)
print("Logistic Regression Train Accuracy:", logreg_train_score)

# Evaluasi Random Forest pada data train
rf_train_score = rf_pipeline.score(x_train_under, y_train_under)
print("Random Forest Train Accuracy:", rf_train_score)

# Evaluasi Logistic Regression pada data test
logreg_test_score = logreg_pipeline.score(x_test, y_test)
print("Logistic Regression Test Accuracy:", logreg_test_score)

# Evaluasi Random Forest pada data test
rf_test_score = rf_pipeline.score(x_test, y_test)
print("Random Forest Test Accuracy:", rf_test_score)

Logistic Regression Train Accuracy: 0.7168367346938775
Random Forest Train Accuracy: 1.0
Logistic Regression Test Accuracy: 0.7277342923630209
Random Forest Test Accuracy: 0.7670827163105564


In [37]:
# Evaluasi Logistic Regression pada data train
logreg_train_score = logreg_pipeline.score(x_train, y_train)
print("Logistic Regression Train Accuracy:", logreg_train_score)

# Evaluasi Random Forest pada data train
rf_train_score = rf_pipeline.score(x_train, y_train)
print("Random Forest Train Accuracy:", rf_train_score)

# Evaluasi Logistic Regression pada data test
logreg_test_score = logreg_pipeline.score(x_test, y_test)
print("Logistic Regression Test Accuracy:", logreg_test_score)

# Evaluasi Random Forest pada data test
rf_test_score = rf_pipeline.score(x_test, y_test)
print("Random Forest Test Accuracy:", rf_test_score)

Logistic Regression Train Accuracy: 0.9532448299571588
Random Forest Train Accuracy: 1.0
Logistic Regression Test Accuracy: 0.9526126507298498
Random Forest Test Accuracy: 0.9496509414004655


In [46]:
from sklearn.metrics import f1_score

# Evaluasi Logistic Regression pada data train
logreg_train_f1_class_1 = f1_score(y_train_under, logreg_pipeline.predict(x_train_under), pos_label=1)
print("Logistic Regression Train F1-Score (Class 1):", logreg_train_f1_class_1)

# Evaluasi Logistic Regression pada data test
logreg_test_f1_class_1 = f1_score(y_test, logreg_pipeline.predict(x_test), pos_label=1)
print("Logistic Regression Test F1-Score (Class 1):", logreg_test_f1_class_1)

# Evaluasi Random Forest pada data train
rf_train_f1_class_1 = f1_score(y_train_under, rf_pipeline.predict(x_train_under), pos_label=1)
print("Random Forest Train F1-Score (Class 1):", rf_train_f1_class_1)

# Evaluasi Random Forest pada data test
rf_test_f1_class_1 = f1_score(y_test, rf_pipeline.predict(x_test), pos_label=1)
print("Random Forest Test F1-Score (Class 1):", rf_test_f1_class_1)

Logistic Regression Train F1-Score (Class 1): 0.7098797699947726
Logistic Regression Test F1-Score (Class 1): 0.20210787352758836
Random Forest Train F1-Score (Class 1): 1.0
Random Forest Test F1-Score (Class 1): 0.22953114065780267
