# Modeling - Before Hyperparameter

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Modeling 
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report


In [2]:
dataset = pd.read_csv('dataclear.csv')

In [3]:
# Memisahkan data menjadi fitur (X) dan label (y)
X = dataset.drop(columns=['TARGET'], axis = 1)
y = dataset['TARGET']


print('X data shape: ', X.shape)
print('y data shape: ', y.shape)

X data shape:  (307507, 129)
y data shape:  (307507,)


In [4]:
dataset.TARGET.value_counts()

0    282682
1     24825
Name: TARGET, dtype: int64

Jika dilihat label yang terdapat di dataset, kita menemukan data yang tidak seimbang (imbalance) ini. Hal ini akan sangat mempengaruhi akurasi pada saat pemodelan. Oleh karena itu dataset harus diseimbangkan 0 dan 1.

In [5]:
# Dataset di seimbangkan dengan cara RandomOverSampling minoruty
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42, sampling_strategy='minority')
X, y = ros.fit_resample(X, y)
print('X data shape after oversampling: ', X.shape)
print('y data shape after oversampling: ', y.shape)

X data shape after oversampling:  (565364, 129)
y data shape after oversampling:  (565364,)


In [6]:
# Data suda seimbang
y.value_counts()

1    282682
0    282682
Name: TARGET, dtype: int64

In [7]:
dataset.shape

(307507, 130)

In [8]:
# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mencetak bentuk data X_train, X_test, y_train, dan y_test
print('X_train data shape: ', X_train.shape)
print('y_train data shape: ', y_train.shape)
print('X_test data shape: ', X_test.shape)
print('y_test data shape: ', y_test.shape)

X_train data shape:  (452291, 129)
y_train data shape:  (452291,)
X_test data shape:  (113073, 129)
y_test data shape:  (113073,)


## Logistic Regrestion

In [9]:
m1 = 'Logistic Regression'
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
print("confussion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print(classification_report(y_test,lr_predict))

confussion matrix
[[34856 21493]
 [25408 31316]]


Accuracy of Logistic Regression: 58.52148611958646 

              precision    recall  f1-score   support

           0       0.58      0.62      0.60     56349
           1       0.59      0.55      0.57     56724

    accuracy                           0.59    113073
   macro avg       0.59      0.59      0.58    113073
weighted avg       0.59      0.59      0.58    113073



## Decision Tree

In [11]:
m6 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(criterion = 'entropy',random_state=0,max_depth = 6)
dt.fit(X_train, y_train)
dt_predicted = dt.predict(X_test)
dt_conf_matrix = confusion_matrix(y_test, dt_predicted)
dt_acc_score = accuracy_score(y_test, dt_predicted)
print("confussion matrix")
print(dt_conf_matrix)
print("\n")
print("Accuracy of DecisionTreeClassifier:",dt_acc_score*100,'\n')
print(classification_report(y_test,dt_predicted))

confussion matrix
[[36947 19402]
 [18062 38662]]


Accuracy of DecisionTreeClassifier: 66.86742193096495 

              precision    recall  f1-score   support

           0       0.67      0.66      0.66     56349
           1       0.67      0.68      0.67     56724

    accuracy                           0.67    113073
   macro avg       0.67      0.67      0.67    113073
weighted avg       0.67      0.67      0.67    113073



## Naive Bayes

In [12]:
m2 = 'Naive Bayes'
nb = GaussianNB()
nb.fit(X_train,y_train)
nbpred = nb.predict(X_test)
nb_conf_matrix = confusion_matrix(y_test, nbpred)
nb_acc_score = accuracy_score(y_test, nbpred)
print("confussion matrix")
print(nb_conf_matrix)
print("\n")
print("Accuracy of Naive Bayes model:",nb_acc_score*100,'\n')
print(classification_report(y_test,nbpred))

confussion matrix
[[37575 18774]
 [28340 28384]]


Accuracy of Naive Bayes model: 58.33311223722728 

              precision    recall  f1-score   support

           0       0.57      0.67      0.61     56349
           1       0.60      0.50      0.55     56724

    accuracy                           0.58    113073
   macro avg       0.59      0.58      0.58    113073
weighted avg       0.59      0.58      0.58    113073



## Random Forest Classfier

In [13]:
m3 = 'Random Forest Classfier'
rf = RandomForestClassifier(n_estimators=20, random_state=12,max_depth=5)
rf.fit(X_train,y_train)
rf_predicted = rf.predict(X_test)
rf_conf_matrix = confusion_matrix(y_test, rf_predicted)
rf_acc_score = accuracy_score(y_test, rf_predicted)
print("confussion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(y_test,rf_predicted))

confussion matrix
[[37668 18681]
 [17509 39215]]


Accuracy of Random Forest: 67.99412768742317 

              precision    recall  f1-score   support

           0       0.68      0.67      0.68     56349
           1       0.68      0.69      0.68     56724

    accuracy                           0.68    113073
   macro avg       0.68      0.68      0.68    113073
weighted avg       0.68      0.68      0.68    113073



## Extreme Gradient Boost

In [14]:
m4 = 'Extreme Gradient Boost'
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_predicted = xgb.predict(X_test)
xgb_conf_matrix = confusion_matrix(y_test, xgb_predicted)
xgb_acc_score = accuracy_score(y_test, xgb_predicted)
print("confussion matrix")
print(xgb_conf_matrix)
print("\n")
print("Accuracy of Extreme Gradient Boost:",xgb_acc_score*100,'\n')
print(classification_report(y_test,xgb_predicted))

confussion matrix
[[42912 13437]
 [11562 45162]]


Accuracy of Extreme Gradient Boost: 77.8912737788862 

              precision    recall  f1-score   support

           0       0.79      0.76      0.77     56349
           1       0.77      0.80      0.78     56724

    accuracy                           0.78    113073
   macro avg       0.78      0.78      0.78    113073
weighted avg       0.78      0.78      0.78    113073



## KNN

In [15]:
m5 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
knn_predicted = knn.predict(X_test)
knn_conf_matrix = confusion_matrix(y_test, knn_predicted)
knn_acc_score = accuracy_score(y_test, knn_predicted)
print("confussion matrix")
print(knn_conf_matrix)
print("\n")
print("Accuracy of K-NeighborsClassifier:",knn_acc_score*100,'\n')
print(classification_report(y_test,knn_predicted))

confussion matrix
[[36461 19888]
 [ 3246 53478]]


Accuracy of K-NeighborsClassifier: 79.54065073005934 

              precision    recall  f1-score   support

           0       0.92      0.65      0.76     56349
           1       0.73      0.94      0.82     56724

    accuracy                           0.80    113073
   macro avg       0.82      0.79      0.79    113073
weighted avg       0.82      0.80      0.79    113073



## Lightgbm

In [16]:
# Create model
model = "LightGBM"
lgbm = lgb.LGBMClassifier()

# Fit model to training data
lgbm.fit(X_train, y_train)

# Predict labels for test data
lgbm_predicted = lgbm.predict(X_test)

# Evaluate model performance
lgbm_conf_matrix = confusion_matrix(y_test, lgbm_predicted)
lgbm_acc_score = accuracy_score(y_test, lgbm_predicted)
print("confusion matrix")
print(lgbm_conf_matrix)
print("\n")
print("Accuracy of LightGBM:",lgbm_acc_score*100,'\n')
print(classification_report(y_test,lgbm_predicted))

confusion matrix
[[40869 15480]
 [14829 41895]]


Accuracy of LightGBM: 73.19519248626993 

              precision    recall  f1-score   support

           0       0.73      0.73      0.73     56349
           1       0.73      0.74      0.73     56724

    accuracy                           0.73    113073
   macro avg       0.73      0.73      0.73    113073
weighted avg       0.73      0.73      0.73    113073

