# Import

In [2]:
import pandas as pd

import warnings
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from category_encoders import TargetEncoder
from imblearn.under_sampling import RandomUnderSampler

# ignore warnings
warnings.filterwarnings("ignore")

# Load

In [3]:
df = pd.read_csv('DatosPrueba.csv', delimiter=';')
df = df.drop(columns=['Unnamed: 0.1'], axis=1)
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,450,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,462306,-1121138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47034331,-112561071,0
1,2019-01-01 00:06:23,4642894980163,fraud_Rutherford-Mertz,grocery_pos,2474,Eddie,Mendez,M,1831 Faith View Suite 653,Clarinda,...,407491,-95038,7297,IT trainer,1990-07-13,d71c95ab6b7356dd74389d41df429c87,1325376383,40275891,-96011548,0
2,2019-01-01 00:21:32,4334230547694630,fraud_Bruen-Yost,misc_pos,685,Scott,Martin,M,7483 Navarro Flats,Freedom,...,430172,-1110292,471,"Education officer, museum",1967-08-02,f3c43d336e92a44fc2fb67058d5949e3,1325377292,43753735,-111454923,0
3,2019-01-01 00:22:44,630412733309,fraud_Torphy-Goyette,shopping_pos,6621,Heather,Stanton,F,445 Jerry Lights Apt. 081,Republic,...,46368,-879938,1038,Armed forces training and education officer,1964-04-22,20f048d3907dbb9978e23bee7b7578ce,1325377364,46412038,-88516663,0
4,2019-01-01 00:23:58,374125201044065,"fraud_Bahringer, Schoen and Corkery",shopping_pos,903,Christopher,Gilbert,M,20937 Reed Lakes Apt. 271,Washington,...,389757,-770282,601723,"Optician, dispensing",1970-07-20,c733711c521c41c578f4a964d8350df0,1325377438,38880898,-7644111600000002,0


# Feature Engineering

In [4]:
# Ubah amt menjadi float
df['amt'] = df['amt'].str.replace(',', '.').astype(float)

In [5]:
# Convert trans_date_trans_time to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Create a new column trans_hour
df['trans_hour'] = df['trans_date_trans_time'].dt.hour

In [6]:
# Convert dob to datetime
df['dob'] = pd.to_datetime(df['dob'])
latest_date = df['trans_date_trans_time'].max()

# Calculate age using the latest_date
df['age'] = (latest_date - df['dob']).dt.days // 365

In [31]:
# Select specific columns for feature selection
selected_features = df[['trans_hour', 'category', 'state', 'amt', 'age', 'is_fraud']].copy()
selected_features.head()

Unnamed: 0,trans_hour,category,state,amt,age,is_fraud
0,0,gas_transport,MT,45.0,53,0
1,0,grocery_pos,IA,24.74,29,0
2,0,misc_pos,WY,6.85,52,0
3,0,shopping_pos,MI,66.21,56,0
4,0,shopping_pos,DC,9.03,49,0


In [32]:
# Hitung nilai skew untuk kolom numerik
numeric_skew = selected_features.select_dtypes(include=['int32', 'float64', 'int64']).skew()

# Tampilkan hasil
print(numeric_skew)

trans_hour    -0.278868
amt           14.377982
age            0.599278
is_fraud       4.878719
dtype: float64


In [33]:
# Hitung nilai korelasi trans_hour dengan is_fraud menggunakan metode Pearson
correlation_trans_hour_is_fraud = selected_features['trans_hour'].corr(selected_features['is_fraud'], method='pearson')

# Tampilkan hasil
print("Korelasi trans_hour dengan is_fraud (Pearson):", correlation_trans_hour_is_fraud)


Korelasi trans_hour dengan is_fraud (Pearson): 0.034020052108609304


In [36]:
# Hitung nilai korelasi trans_hour dengan is_fraud menggunakan metode Pearson
correlation_trans_hour_is_fraud = selected_features['age'].corr(selected_features['is_fraud'], method='spearman')

# Tampilkan hasil
print("Korelasi age dengan is_fraud (Pearson):", correlation_trans_hour_is_fraud)

Korelasi age dengan is_fraud (Pearson): 0.028098937623054307


In [37]:
from scipy.stats import chi2_contingency
import numpy as np

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(k - 1, r - 1))

# Hitung Cramér's V untuk kolom 'category'
category_contingency = pd.crosstab(selected_features['category'], selected_features['is_fraud'])
cramers_v_category = cramers_v(category_contingency.values)

# Hitung Cramér's V untuk kolom 'state'
state_contingency = pd.crosstab(selected_features['state'], selected_features['is_fraud'])
cramers_v_state = cramers_v(state_contingency.values)

print("Cramér's V untuk category:", cramers_v_category)
print("Cramér's V untuk state:", cramers_v_state)

Cramér's V untuk category: 0.17398875718130805
Cramér's V untuk state: 0.051479892475386146


In [8]:
# Split the data into features and target
X = selected_features.drop(columns=['is_fraud'])
y = selected_features['is_fraud']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Perform undersampling on the training set
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Display the class distribution after undersampling
print("Class distribution after undersampling:")

print(y_train_resampled.value_counts())

Class distribution after undersampling:
is_fraud
0    5254
1    5254
Name: count, dtype: int64


In [24]:
from sklearn.neighbors import LocalOutlierFactor

# Encode the 'category' column in the training set
categorical_transformer = TargetEncoder()
X_train['category'] = categorical_transformer.fit_transform(X_train['category'], y_train)
X_train['state'] = categorical_transformer.fit_transform(X_train['state'], y_train)
X_test['category'] = categorical_transformer.fit_transform(X_test['category'], y_test)
X_test['state'] = categorical_transformer.fit_transform(X_test['state'], y_test)

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.04)
y_pred = clf.fit_predict(X_train)



In [18]:
y_pred_new = []

for i in y_pred:
    if i == -1:
        y_pred_new.append(1)
    else:
        y_pred_new.append(0)
        
# Ambil indeks dari X_train
index_X_train = X_train.index

# Gabungkan indeks dengan y_pred
result = pd.DataFrame({'Index': index_X_train, 'Prediction': y_pred_new})

result.set_index('Index', inplace=True)
result.head()
result.value_counts()

Prediction
0             134991
1               5625
Name: count, dtype: int64

In [19]:


# Generate classification reports
train_report = classification_report(y_train, y_pred_new)

print("Classification Report - Train Data")
print(train_report)

# print("\nClassification Report - Test Data")
# print(test_report)

Classification Report - Train Data
              precision    recall  f1-score   support

           0       0.97      0.96      0.96    135362
           1       0.12      0.13      0.12      5254

    accuracy                           0.93    140616
   macro avg       0.54      0.55      0.54    140616
weighted avg       0.93      0.93      0.93    140616



In [25]:
from sklearn.ensemble import IsolationForest

# Create an Isolation Forest model
iso_forest = IsolationForest(n_estimators=100, contamination=0.04, random_state=42)

# Fit the model on the training data
iso_forest.fit(X_train)

# Predict anomalies (-1 for anomalies, 1 for normal points)
y_pred_iso = iso_forest.predict(X_train)
y_pred_iso_test = iso_forest.predict(X_test)

# Convert predictions to binary format (1 for anomalies, 0 for normal points)
y_pred_iso_binary = [1 if pred == -1 else 0 for pred in y_pred_iso]
y_pred_iso_binary_test = [1 if pred == -1 else 0 for pred in y_pred_iso_test]

# Create a DataFrame to store the results
iso_result = pd.DataFrame({'Index': X_train.index, 'Prediction': y_pred_iso_binary})
iso_result.set_index('Index', inplace=True)

# Display the value counts of predictions
iso_result.value_counts()

Prediction
0             134991
1               5625
Name: count, dtype: int64

In [26]:
# Generate classification reports
train_report_iso = classification_report(y_train, y_pred_iso_binary)
test_report_iso = classification_report(y_test, y_pred_iso_binary_test)

print("Classification Report - Train Data")
print(train_report_iso)

print("\nClassification Report - Test Data")
print(test_report_iso)

Classification Report - Train Data
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    135362
           1       0.51      0.55      0.53      5254

    accuracy                           0.96    140616
   macro avg       0.75      0.76      0.75    140616
weighted avg       0.96      0.96      0.96    140616


Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     58013
           1       0.50      0.59      0.54      2252

    accuracy                           0.96     60265
   macro avg       0.74      0.78      0.76     60265
weighted avg       0.97      0.96      0.96     60265



In [34]:
# Pisahkan fitur numerik dan kategorik
num_cols = X.select_dtypes(include=['int32', 'float64', 'int64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Fitur numerik:", num_cols)
print("Fitur kategorik:", cat_cols)

Fitur numerik: ['trans_hour', 'amt', 'age']
Fitur kategorik: ['category', 'state']


In [35]:
# Define preprocessing for numeric and categorical features
numeric_transformer = RobustScaler()
categorical_transformer = TargetEncoder()

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)
preprocessor

# Model Definition

In [36]:
# Define pipelines for Logistic Regression and Random Forest
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

# Display the pipelines
logistic_pipeline

In [37]:
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

random_forest_pipeline

# Training

In [38]:
# Train Logistic Regression model
logistic_pipeline.fit(X_train_resampled, y_train_resampled)

In [39]:
# Train Random Forest model
random_forest_pipeline.fit(X_train_resampled, y_train_resampled)

# Evaluasi

In [40]:
# Perform cross-validation for Logistic Regression
logistic_cv_scores = cross_val_score(logistic_pipeline, X_train_resampled, y_train_resampled, cv=5, scoring='recall')

# Perform cross-validation for Random Forest
random_forest_cv_scores = cross_val_score(random_forest_pipeline, X_train_resampled, y_train_resampled, cv=5, scoring='recall')

# Create a dataframe to store the results
cv_results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Mean Recall': [logistic_cv_scores.mean(), random_forest_cv_scores.mean()],
    'Std Recall': [logistic_cv_scores.std(), random_forest_cv_scores.std()]
})

cv_results

Unnamed: 0,Model,Mean Recall,Std Recall
0,Logistic Regression,0.758091,0.006326
1,Random Forest,0.972591,0.004722


Dari hasil, model Random forest memiliki performa terbaik dengan performa sangat stabil.

## LogReg

In [41]:
# Predict on train and test sets
y_train_pred = logistic_pipeline.predict(X_train_resampled)
y_test_pred = logistic_pipeline.predict(X_test)

# Generate classification reports
train_report = classification_report(y_train_resampled, y_train_pred)
test_report = classification_report(y_test, y_test_pred)

print("Classification Report - Train Data")
print(train_report)

print("\nClassification Report - Test Data")
print(test_report)

Classification Report - Train Data
              precision    recall  f1-score   support

           0       0.80      0.95      0.87      5254
           1       0.94      0.76      0.84      5254

    accuracy                           0.85     10508
   macro avg       0.87      0.85      0.85     10508
weighted avg       0.87      0.85      0.85     10508


Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     58013
           1       0.39      0.75      0.51      2252

    accuracy                           0.95     60265
   macro avg       0.69      0.85      0.74     60265
weighted avg       0.97      0.95      0.95     60265



Hasil recall logreg untuk kelas 1 tidak berbeda jauh dengan nilai perbedaan 1%

## RF

In [42]:
# Predict on train and test sets using Random Forest
y_train_pred_rf = random_forest_pipeline.predict(X_train_resampled)
y_test_pred_rf = random_forest_pipeline.predict(X_test)

# Generate classification reports
train_report_rf = classification_report(y_train_resampled, y_train_pred_rf)
test_report_rf = classification_report(y_test, y_test_pred_rf)

print("Classification Report - Train Data (Random Forest)")
print(train_report_rf)

print("\nClassification Report - Test Data (Random Forest)")
print(test_report_rf)

Classification Report - Train Data (Random Forest)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       1.00      1.00      1.00      5254

    accuracy                           1.00     10508
   macro avg       1.00      1.00      1.00     10508
weighted avg       1.00      1.00      1.00     10508


Classification Report - Test Data (Random Forest)
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     58013
           1       0.62      0.98      0.76      2252

    accuracy                           0.98     60265
   macro avg       0.81      0.98      0.87     60265
weighted avg       0.98      0.98      0.98     60265



# Hyperparameter Tuning

In [43]:
# Define the parameter grid for Random Forest
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search_rf = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=param_grid_rf,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Perform the grid search
grid_search_rf.fit(X_train_resampled, y_train_resampled)

# Display the best parameters and the best recall score
print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best recall score:", grid_search_rf.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for Random Forest: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best recall score: 0.9729721353812696


In [44]:
# Perform cross-validation with the best parameters from GridSearchCV
best_rf_pipeline = grid_search_rf.best_estimator_
best_rf_cv_scores = cross_val_score(best_rf_pipeline, X_train_resampled, y_train_resampled, cv=5, scoring='recall')

# Create a dataframe to compare the results
comparison_results = pd.DataFrame({
    'Model': ['Random Forest (Before Tuning)', 'Random Forest (After Tuning)'],
    'Mean Recall': [random_forest_cv_scores.mean(), best_rf_cv_scores.mean()],
    'Std Recall': [random_forest_cv_scores.std(), best_rf_cv_scores.std()]
})

comparison_results

Unnamed: 0,Model,Mean Recall,Std Recall
0,Random Forest (Before Tuning),0.972591,0.004722
1,Random Forest (After Tuning),0.972972,0.004767


In [45]:
# Predict on train and test sets using the best Random Forest pipeline
y_train_pred_best_rf = best_rf_pipeline.predict(X_train_resampled)
y_test_pred_best_rf = best_rf_pipeline.predict(X_test)

# Generate classification reports
train_report_best_rf = classification_report(y_train_resampled, y_train_pred_best_rf)
test_report_best_rf = classification_report(y_test, y_test_pred_best_rf)

print("Classification Report - Train Data (Best Random Forest)")
print(train_report_best_rf)

print("\nClassification Report - Test Data (Best Random Forest)")
print(test_report_best_rf)

Classification Report - Train Data (Best Random Forest)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5254
           1       1.00      1.00      1.00      5254

    accuracy                           1.00     10508
   macro avg       1.00      1.00      1.00     10508
weighted avg       1.00      1.00      1.00     10508


Classification Report - Test Data (Best Random Forest)
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     58013
           1       0.62      0.98      0.76      2252

    accuracy                           0.98     60265
   macro avg       0.81      0.98      0.87     60265
weighted avg       0.99      0.98      0.98     60265



In [46]:
# Create a dataframe to compare recall for train and test sets
rf_recall_comparison = pd.DataFrame({
    'Dataset': ['Train', 'Test', 'Train', 'Test'],
    'Model': ['Random Forest (Before Tuning)', 'Random Forest (Before Tuning)',
              'Random Forest (After Tuning)', 'Random Forest (After Tuning)'],
    'Recall': [
        train_report_rf.split()[15],  # Extract recall for class 1 from train report before tuning
        test_report_rf.split()[15],  # Extract recall for class 1 from test report before tuning
        train_report_best_rf.split()[15],  # Extract recall for class 1 from train report after tuning
        test_report_best_rf.split()[15]   # Extract recall for class 1 from test report after tuning
    ]
})

rf_recall_comparison

Unnamed: 0,Dataset,Model,Recall
0,Train,Random Forest (Before Tuning),1.0
1,Test,Random Forest (Before Tuning),0.98
2,Train,Random Forest (After Tuning),1.0
3,Test,Random Forest (After Tuning),0.98


In [48]:
import joblib

# Simpan model ke file
joblib.dump(best_rf_pipeline, 'best_random_forest_model_ian.pkl')

print("Model berhasil disimpan sebagai 'best_random_forest_model_ian.pkl'")

Model berhasil disimpan sebagai 'best_random_forest_model_ian.pkl'
