# Import Libraries

In [3]:
#Primary Libraries
import pandas as pd  
import seaborn as sns  
import numpy as np  
import matplotlib.pyplot as plt  
import phik


#Feature Engineering & Classification Modeling Related
from scipy.stats import chi2_contingency  # Untuk uji chi-square pada tabel kontingensi.
from imblearn.over_sampling import SMOTE, SMOTENC  # Untuk mengatasi data tidak seimbang dengan oversampling.
from sklearn.linear_model import LogisticRegression  # Model regresi logistik untuk klasifikasi biner.
from sklearn.model_selection import train_test_split  # Untuk membagi data menjadi set pelatihan dan pengujian.
from sklearn.metrics import (  # Untuk mengevaluasi kinerja model.
    classification_report,  # Laporan klasifikasi dengan precision, recall, dll.
    f1_score,  # Menghitung f1-score untuk menilai keseimbangan antara precision dan recall.
    roc_auc_score,  # Menghitung AUC dari kurva ROC.
    confusion_matrix,  # Matriks kebingungan untuk melihat performa model.
    accuracy_score,  # Menghitung akurasi model.
    ConfusionMatrixDisplay  # Untuk menampilkan matriks kebingungan secara visual.
)
from sklearn.preprocessing import (  # Untuk mempersiapkan data sebelum pelatihan model.
    StandardScaler,  # Menstandarisasi fitur (rata-rata 0, deviasi standar 1).
    OneHotEncoder,  # Mengubah fitur kategori menjadi representasi biner.
    LabelEncoder,  # Mengubah label kategori menjadi angka.
    OrdinalEncoder,  # Mengubah fitur kategori menjadi angka dengan urutan.
    MinMaxScaler,  # Mengubah fitur ke rentang [0, 1].
    PowerTransformer
)

from category_encoders import TargetEncoder
from feature_engine.outliers import Winsorizer  # Untuk menangani outlier dengan membatasi nilai ekstrim.
from sklearn.pipeline import Pipeline, make_pipeline  # Untuk mengatur serangkaian langkah pemrosesan dan model.
from sklearn.impute import SimpleImputer  # Untuk mengisi nilai yang hilang dalam dataset.
from sklearn.compose import ColumnTransformer  # Untuk menerapkan transformasi berbeda pada kolom yang berbeda.
from sklearn.feature_selection import SelectPercentile  # Untuk memilih fitur berdasarkan skor tertentu.

#Display Setting
pd.set_option('display.max_colwidth', None)             # while column display need to be expanded 
import warnings 
warnings.filterwarnings('ignore')                       # for not displaying warning outputs

# Pipeline Related
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Data Loading

In [4]:
df_ori= pd.read_csv('initial_cleaning_individual.csv')
df = df_ori.copy()
df

Unnamed: 0,ID,Gender,Location,E-Paylater User Status,Educational Background,Year of Birth,Job Status,Monthly Income,Online Shopping Expenditure Percentage,IBB1,...,SC1,SC2,SC3,SC4,SC5,NE1,NE2,NE3,NE4,NE5
0,1,Female,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1997,Working,"Rp 2,500,001 - Rp 5,000,000",Less than 20%,3,...,5,5,3,3,4,3,3,4,3,2
1,2,Male,DKI Jakarta,e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",21% - 40%,3,...,4,4,2,3,4,5,5,5,5,4
2,3,Male,West Sumatera,Non e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,2,3,3,3,3,3
3,4,Female,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Students,"Less than Rp 1,000,000",Less than 20%,4,...,2,4,2,2,2,1,1,1,2,1
4,5,Male,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Working,"More than Rp 10,000,000",Less than 20%,1,...,5,2,3,1,5,3,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,806,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1989,Students,"Less than Rp 1,000,000",Less than 20%,5,...,4,4,3,4,4,3,3,3,4,3
806,807,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,2003,Students,"Less than Rp 1,000,000",Less than 20%,3,...,3,3,3,3,3,2,1,2,4,3
807,808,Female,West Sumatera,Non e-paylater user,"Diploma I, II, III",2002,Entrepreneur,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,3,1,5,2,5,1
808,809,Female,West Sumatera,Non e-paylater user,Until Senior High School,2003,Students,"Less than Rp 1,000,000",Less than 20%,2,...,4,4,4,4,4,5,5,5,3,3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 37 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      810 non-null    int64 
 1   Gender                                  810 non-null    object
 2   Location                                810 non-null    object
 3   E-Paylater User Status                  810 non-null    object
 4   Educational Background                  810 non-null    object
 5   Year of Birth                           810 non-null    int64 
 6   Job Status                              810 non-null    object
 7   Monthly Income                          810 non-null    object
 8   Online Shopping Expenditure Percentage  810 non-null    object
 9   IBB1                                    810 non-null    int64 
 10  IBB2                                    810 non-null    int64 
 11  IBB3  

In [6]:
df.rename(columns={"E-Paylater User Status":"PayLaterStatus","Educational Background":"Education","Year of Birth":"BirthYear","Job Status":"JobStatus","Monthly Income":"MonthlyIncome","Average monthly expenditure for online shopping in relation to monthly income": "AVGMonthlyExpenditure"}, inplace=True)
df

Unnamed: 0,ID,Gender,Location,PayLaterStatus,Education,BirthYear,JobStatus,MonthlyIncome,Online Shopping Expenditure Percentage,IBB1,...,SC1,SC2,SC3,SC4,SC5,NE1,NE2,NE3,NE4,NE5
0,1,Female,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1997,Working,"Rp 2,500,001 - Rp 5,000,000",Less than 20%,3,...,5,5,3,3,4,3,3,4,3,2
1,2,Male,DKI Jakarta,e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",21% - 40%,3,...,4,4,2,3,4,5,5,5,5,4
2,3,Male,West Sumatera,Non e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,2,3,3,3,3,3
3,4,Female,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Students,"Less than Rp 1,000,000",Less than 20%,4,...,2,4,2,2,2,1,1,1,2,1
4,5,Male,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Working,"More than Rp 10,000,000",Less than 20%,1,...,5,2,3,1,5,3,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,806,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1989,Students,"Less than Rp 1,000,000",Less than 20%,5,...,4,4,3,4,4,3,3,3,4,3
806,807,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,2003,Students,"Less than Rp 1,000,000",Less than 20%,3,...,3,3,3,3,3,2,1,2,4,3
807,808,Female,West Sumatera,Non e-paylater user,"Diploma I, II, III",2002,Entrepreneur,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,3,1,5,2,5,1
808,809,Female,West Sumatera,Non e-paylater user,Until Senior High School,2003,Students,"Less than Rp 1,000,000",Less than 20%,2,...,4,4,4,4,4,5,5,5,3,3


In [7]:
df.rename(columns={"Online Shopping Expenditure Percentage":"AVGMonthlyExpenditure"}, inplace=True)
df

Unnamed: 0,ID,Gender,Location,PayLaterStatus,Education,BirthYear,JobStatus,MonthlyIncome,AVGMonthlyExpenditure,IBB1,...,SC1,SC2,SC3,SC4,SC5,NE1,NE2,NE3,NE4,NE5
0,1,Female,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1997,Working,"Rp 2,500,001 - Rp 5,000,000",Less than 20%,3,...,5,5,3,3,4,3,3,4,3,2
1,2,Male,DKI Jakarta,e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",21% - 40%,3,...,4,4,2,3,4,5,5,5,5,4
2,3,Male,West Sumatera,Non e-paylater user,Until Senior High School,2002,Students,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,2,3,3,3,3,3
3,4,Female,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Students,"Less than Rp 1,000,000",Less than 20%,4,...,2,4,2,2,2,1,1,1,2,1
4,5,Male,West Sumatera,e-paylater user,Bachelor/Diploma IV,2002,Working,"More than Rp 10,000,000",Less than 20%,1,...,5,2,3,1,5,3,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,806,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,1989,Students,"Less than Rp 1,000,000",Less than 20%,5,...,4,4,3,4,4,3,3,3,4,3
806,807,Male,West Sumatera,Non e-paylater user,Bachelor/Diploma IV,2003,Students,"Less than Rp 1,000,000",Less than 20%,3,...,3,3,3,3,3,2,1,2,4,3
807,808,Female,West Sumatera,Non e-paylater user,"Diploma I, II, III",2002,Entrepreneur,"Rp 1,000,001 - Rp 2,500,000",Less than 20%,3,...,4,4,2,2,3,1,5,2,5,1
808,809,Female,West Sumatera,Non e-paylater user,Until Senior High School,2003,Students,"Less than Rp 1,000,000",Less than 20%,2,...,4,4,4,4,4,5,5,5,3,3


In [8]:
df.MonthlyIncome.unique()

array(['Rp 2,500,001 - Rp 5,000,000', 'Rp 1,000,001 - Rp 2,500,000',
       'Less than Rp 1,000,000', 'More than Rp 10,000,000',
       'Rp 5,000,001 - Rp 7,500,000', 'Rp 7,500,001 - Rp 10,000,000'],
      dtype=object)

In [9]:
df.Education.unique()

array(['Bachelor/Diploma IV', 'Until Senior High School', 'Post Graduate',
       'Diploma I, II, III'], dtype=object)

In [29]:
df.columns

Index(['ID', 'Gender', 'Location', 'PayLaterStatus', 'Education', 'BirthYear',
       'JobStatus', 'MonthlyIncome', 'AVGMonthlyExpenditure', 'IBB1', 'IBB2',
       'IBB3', 'IBB4', 'P1', 'P2', 'P3', 'P4', 'SI1', 'SI2', 'SI3', 'SI4',
       'SI5', 'SI6', 'H1', 'H2', 'H3', 'H4', 'SC1', 'SC2', 'SC3', 'SC4', 'SC5',
       'NE1', 'NE2', 'NE3', 'NE4', 'NE5'],
      dtype='object')

# Data Preprocessing

In [10]:
# Data Splitting
X = df.drop(columns={"ID","Gender","Location","BirthYear","PayLaterStatus"})
y= df.drop(columns=X)
y = y.drop(columns={"ID","Location","Gender","BirthYear"})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

X_train

Unnamed: 0,Education,JobStatus,MonthlyIncome,AVGMonthlyExpenditure,IBB1,IBB2,IBB3,IBB4,P1,P2,...,SC1,SC2,SC3,SC4,SC5,NE1,NE2,NE3,NE4,NE5
23,Until Senior High School,Students,"Less than Rp 1,000,000",21% - 40%,4,3,2,2,4,4,...,4,4,2,2,2,3,3,3,3,3
196,Until Senior High School,Students,"Less than Rp 1,000,000",Less than 20%,4,3,3,4,1,1,...,4,4,2,2,2,1,2,1,1,1
803,Bachelor/Diploma IV,Not Working,"Less than Rp 1,000,000",21% - 40%,4,3,3,4,1,1,...,4,4,3,3,3,4,4,4,4,3
66,Until Senior High School,Students,"Less than Rp 1,000,000",21% - 40%,4,3,3,2,1,3,...,4,4,2,2,3,5,4,5,4,1
99,Until Senior High School,Students,"Less than Rp 1,000,000",Less than 20%,1,1,1,1,1,1,...,5,5,2,2,4,3,4,3,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,Until Senior High School,Students,"Less than Rp 1,000,000",Less than 20%,4,4,4,4,2,2,...,4,4,2,2,2,3,3,3,3,3
752,Until Senior High School,Students,"More than Rp 10,000,000",Less than 20%,1,2,2,2,2,2,...,2,2,4,4,4,4,4,3,4,4
543,"Diploma I, II, III",Working,"Rp 2,500,001 - Rp 5,000,000",21% - 40%,2,2,2,2,2,2,...,3,4,3,3,3,4,4,4,4,4
696,Until Senior High School,Students,"Less than Rp 1,000,000",21% - 40%,2,2,2,2,2,2,...,4,3,3,2,4,4,3,4,3,3


# Collinearity Checking

In [27]:
from scipy.stats import pearsonr

In [62]:
ord_enc = OrdinalEncoder()
y_train_enc0 = ord_enc.fit_transform(y_train)

In [65]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()  # Numerical columns
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()  # Categorical columns
results = []  # Empty list to save our correlation analysis result

# For numerical columns, check for constant values before calculating correlation
for col in num_cols:
    if X_train[col].nunique() > 1:  # Check if the column has more than one unique value
        p_value = pearsonr(X_train[col], y_train_enc0, alternative='two-sided')[1]
        p_value_text = 'Correlated' if p_value < 0.05 else 'Not Correlated'
    else:
        p_value_text = 'Not Correlated'  # Indicate that the input is constant
    results.append((col, p_value_text))

# For categorical columns, we'll use chi-square to find the correlation
for col in cat_cols:
    contingency_table = pd.crosstab(X_train[col], y_train_enc0)
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    p_value_text = 'Correlated' if p_value < 0.05 else 'Not Correlated'
    results.append((col, p_value_text))

# Save the result into a DataFrame for easier reading
result_df = pd.DataFrame(results, columns=['Column Name', 'Result'])
result_df

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [11]:
# Further Data Splitting
X_Ord = ['Education',"JobStatus","MonthlyIncome","AVGMonthlyExpenditure"]
ord_values = [
    ["Until Senior High School","Diploma I, II, III","Bachelor/Diploma IV","Post Graduate"],
    ["Students","Working","Not Working","Entrepreneur"],
    ["Less than Rp  1,000,000",	"Rp 1,000,001 - Rp 2,500,000",	"Rp 2,500,001 - Rp 5,000,000",	"Rp 5,000,001 - Rp 7,500,000",	 "Rp 7,500,001 - Rp 10,000,000", "More than Rp. 10,000,000"],
    ["Less than 20%",	"21% - 40%",	"41% - 60%",	 "61% - 80%",	"81% - 100%",	"More than 100%"]
]
y_ord = ["Non e-paylater user", "e-paylater user"]
norm = ['IBB1', 'IBB2', 'IBB3', 'IBB4', 'P1', 'P2', 'P3', 'P4', 'SI1', 'SI2', 'SI3', 'SI4',
       'SI5', 'SI6', 'H1', 'H2', 'H3', 'H4', 'SC1', 'SC2', 'SC3', 'SC4', 'SC5',
       'NE1', 'NE2', 'NE3', 'NE4', 'NE5']
categorical = [0-31]
smotenc = SMOTENC(categorical_features=categorical, random_state=27)

In [12]:
# Create a Custom Transformer for y
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoder):
        self.encoder = encoder
    
    def fit(self, y, **fit_params):
        y = y.values.reshape(-1, 1) if isinstance(y, pd.Series) else y
        self.encoder.fit(y)
        return self
    
    def transform(self, y):
        y = y.values.reshape(-1, 1) if isinstance(y, pd.Series) else y
        return self.encoder.transform(y)
    
target_encoder = TargetEncoder(OrdinalEncoder(categories=[y_ord]))

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

# Pipe line defining

In [13]:
# Preprocessing pipeline
preproc = ColumnTransformer(
    transformers=[
        ('ord', Pipeline(steps=[
            ('ord_encoder', OrdinalEncoder(categories=ord_values, handle_unknown='use_encoded_value', unknown_value=-1))
        ]),X_Ord),
        ('pass', 'passthrough', norm)
    ]
)

In [14]:
preproc.fit(X_train, y_train)

# Modelling: Init

## K Nearest Neighbors

In [55]:
knn_pipe = Pipeline(steps=[
    ('preprocessing', preproc),
    #('target_encoder', TargetEncoder(target_encoder)),
    ('smotenc', smotenc),
    ('model', KNeighborsClassifier())
])

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

knn_pipe.fit(X_train,y_train_encoded)

## Support Vector Machine

In [16]:
svm_pipe = Pipeline(steps=[
    ('preprocessing', preproc),
    #('target_encoder', TargetEncoder(target_encoder)),
    ('smotenc', smotenc),
    ('model', SVC())
])

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

svm_pipe.fit(X_train,y_train_encoded)

## Decision Tree

In [17]:
dt_pipe = Pipeline(steps=[
    ('preprocessing', preproc),
    #('target_encoder', TargetEncoder(target_encoder)),
    ('smotenc', smotenc),
    ('model', DecisionTreeClassifier())
])

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

dt_pipe.fit(X_train,y_train_encoded)

## Random Forest

In [18]:
rf_pipe = Pipeline(steps=[
    ('preprocessing', preproc),
    #('target_encoder', TargetEncoder(target_encoder)),
    ('smotenc', smotenc),
    ('model', RandomForestClassifier())
])

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

rf_pipe.fit(X_train,y_train_encoded)

## Boosting

In [19]:
ada_pipe = Pipeline(steps=[
    ('preprocessing', preproc),
    #('target_encoder', TargetEncoder(target_encoder)),
    ('smotenc', smotenc),
    ('model', AdaBoostClassifier())
])

y_train_encoded = target_encoder.fit_transform(y_train)
y_test_encoded = target_encoder.transform(y_test)

ada_pipe.fit(X_train,y_train_encoded)

# Model Evaluation

In [25]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
import pandas as pd

# Define models and their names
models = {
    'KNN': knn_pipe,  # Replace with your KNN pipeline
    'SVM': svm_pipe,  # Replace with your SVM pipeline
    'DT': dt_pipe,    # Replace with your Decision Tree pipeline
    'RF': rf_pipe,    # Replace with your Random Forest pipeline
    'Boost': ada_pipe
}

# Initialize a list to store evaluation results
results = []

# Evaluate each model
for name, model in models.items():
    # Predict class labels
    y_pred = model.predict(X_test)
    
    # Predict probabilities (required for ROC AUC)
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # For binary classification
    else:
        # For models like SVM without `predict_proba`, use decision function
        y_pred_proba = model.decision_function(X_test)
    
    # Compute evaluation metrics
    precision = precision_score(y_test_encoded, y_pred, average='binary')  # Use 'weighted' for multi-class
    recall = recall_score(y_test_encoded, y_pred, average='binary')        # Adjust average if needed
    f1 = f1_score(y_test_encoded, y_pred, average='binary')                # Adjust average if needed
    roc_auc = roc_auc_score(y_test_encoded, y_pred_proba)
    
    # Cross-validation for F1-score
    cv_scores = cross_val_score(model, X_train, y_train_encoded, cv=5, scoring='f1')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    # Append the results as a dictionary
    results.append({
        'Model': name,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'ROC AUC': roc_auc,
        'CV Mean': cv_mean,
        'CV Std': cv_std
    })

# Convert results to a dataframe
results_df = pd.DataFrame(results)

# Display the results
print(results_df)



   Model  Precision  Recall        F1   ROC AUC   CV Mean    CV Std
0    KNN   0.420290   0.725  0.532110  0.748156  0.550524  0.022649
1    SVM   0.591837   0.725  0.651685  0.796107  0.661000  0.021849
2     DT   0.396226   0.525  0.451613  0.631352  0.610298  0.027720
3     RF   0.583333   0.525  0.552632  0.825410  0.635109  0.061062
4  Boost   0.620690   0.450  0.521739  0.768238  0.606844  0.033642
