In [33]:
# standard library
import time
from collections import Counter

# third-party
import joblib
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# models
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC


In [34]:
def object_to_cat(df):
    """Return a copy of ``df`` whose text columns are cast to ``category`` dtype.

    Columns stored with NumPy ``object`` dtype or with a pandas ``StringDtype``
    are converted; every other column (numeric, already categorical, ...) is
    left as it is. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and values.
    """
    converted = df.copy()
    for col in converted.columns:
        col_dtype = converted[col].dtype
        # Look at the pandas dtype itself instead of calling np.dtype(series):
        # NumPy cannot interpret pandas extension dtypes (e.g. category) and
        # raises TypeError for them.
        is_text = pd.api.types.is_object_dtype(col_dtype) or isinstance(col_dtype, pd.StringDtype)
        if is_text:
            converted[col] = converted[col].astype("category")
    return converted

# Read dataset/german_clean.csv and pass the resulting frame through object_to_cat.
df = pd.read_csv("dataset/german_clean.csv").pipe(object_to_cat)


In [35]:
# Notebook-wide constants.
SEED = 42          # random seed passed to the split and the resamplers
TARGET = 'class'   # label column

# Predictor matrix and label vector.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Column groups consumed by the preprocessing ColumnTransformer:
# every non-target column, the numeric ones, and the remainder (sorted by name).
FEATURES = X.columns
NUMERICAL = X.select_dtypes('number').columns
CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))


In [36]:
# Count the rows per target class in the full dataset and show them ordered by class label.
class_counts = Counter(y)
print('original dataset class distribution:', sorted(class_counts.items()))

original dataset class distribution: [(0, 297), (1, 698)]


#### Pipeline

In [37]:
# Containers for the experiment results.
# Entries are appended per training scenario in the order
# 0: over-sampled, 1: under-sampled, 2: normal (unresampled).
classifiers = ["linearSVC", "Randomforest"]
trainin_time = []
cls_acc, cls_prec, cls_recall, cls_f1 = [], [], [], []

In [38]:
# Numeric columns: fill missing values with the column mean, then rescale with MinMaxScaler.
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Categorical columns: fill missing values with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an all-zero
# row instead of raising, so a level that is absent from a (resampled) training subset
# does not break transform/predict on the held-out data.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own sub-pipeline.
# NOTE: Pipeline does not copy its steps, so putting this one object into several
# pipelines makes them share fitted state; use sklearn.base.clone(preprocessor) for
# an independent copy.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, NUMERICAL),
        ('cat', categorical_pipe, CATEGORICAL)
    ])

Hold out the testing set

In [39]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=SEED,
    stratify=y,
)


In [40]:
# Report the shapes of both splits and the class counts of the training labels.
for split_label, split_X, split_y in (("train size: ", X_train, y_train),
                                      ("test size: ", X_test, y_test)):
    print(split_label, split_X.shape, split_y.shape)

train_class_counts = sorted(Counter(y_train).items())
print('normal class distribution:', train_class_counts)

train size:  (796, 20) (796,)
test size:  (199, 20) (199,)
normal class distribution: [(0, 238), (1, 558)]


### Over-sampling

In [41]:
# Over-sample the training split with imblearn's RandomOverSampler
# (imported in the import cell at the top of the notebook).
# Only the training data is resampled; the held-out test split is left untouched.
ros = RandomOverSampler(random_state=SEED)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

print('over sampled class distribution:', sorted(Counter(y_oversampled).items()))


over sampled class distribution: [(0, 558), (1, 558)]


In [42]:
# Train LinearSVC and RandomForest on the over-sampled training data, time each fit
# and persist the fitted pipelines.

# SVC
# clone() gives the pipeline its own unfitted copy of the preprocessor. Pipeline does not
# copy its steps, so passing the shared `preprocessor` object would let any later fit of
# another pipeline overwrite the scaler/encoder state this model was trained with.
over_sampled_SVC = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', LinearSVC(random_state=SEED))  # seeded for reproducible results
])

# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
over_sampled_SVC.fit(X_oversampled, y_oversampled)
end = time.perf_counter()
svc_exec_time = end - start

# RF
over_sampled_RF = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(random_state=SEED))  # seeded for reproducible results
])

start = time.perf_counter()
over_sampled_RF.fit(X_oversampled, y_oversampled)
end = time.perf_counter()
rf_exec_time = end - start

# One (svc_seconds, rf_seconds) tuple per training scenario.
trainin_time.append((svc_exec_time, rf_exec_time))

# Save the fitted pipelines; the with-blocks make sure the file handles are closed.
filename = 'models/over_svc.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(over_sampled_SVC, model_file)
filename = 'models/over_rf.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(over_sampled_RF, model_file)


### Under-sampling

In [43]:
# Under-sample the training split with imblearn's RandomUnderSampler
# (imported in the import cell at the top of the notebook).
# Only the training data is resampled; the held-out test split is left untouched.
rus = RandomUnderSampler(random_state=SEED)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# The label previously read "over sampled", which mislabelled this output.
print('under sampled class distribution:', sorted(Counter(y_undersampled).items()))


over sampled class distribution: [(0, 238), (1, 238)]


In [44]:
# Train LinearSVC and RandomForest on the under-sampled training data, time each fit
# and persist the fitted pipelines.

# SVC
# clone() gives the pipeline its own unfitted copy of the preprocessor. Pipeline does not
# copy its steps, so passing the shared `preprocessor` object would let any later fit of
# another pipeline overwrite the scaler/encoder state this model was trained with.
under_sampled_SVC = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', LinearSVC(random_state=SEED))  # seeded for reproducible results
])

# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
under_sampled_SVC.fit(X_undersampled, y_undersampled)
end = time.perf_counter()
svc_exec_time = end - start

# RF
under_sampled_RF = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(random_state=SEED))  # seeded for reproducible results
])

start = time.perf_counter()
under_sampled_RF.fit(X_undersampled, y_undersampled)
end = time.perf_counter()
rf_exec_time = end - start

# One (svc_seconds, rf_seconds) tuple per training scenario.
trainin_time.append((svc_exec_time, rf_exec_time))

# Save the fitted pipelines; the with-blocks make sure the file handles are closed.
filename = 'models/under_svc.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(under_sampled_SVC, model_file)
filename = 'models/under_rf.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(under_sampled_RF, model_file)


### Normal dataset

In [45]:
# Train LinearSVC and RandomForest on the unresampled training split, time each fit
# and persist the fitted pipelines.

# SVC
# clone() gives the pipeline its own unfitted copy of the preprocessor. Pipeline does not
# copy its steps, so passing the shared `preprocessor` object would make this fit overwrite
# the scaler/encoder state of every other pipeline built from the same object.
normal_SVC = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', LinearSVC(random_state=SEED))  # seeded for reproducible results
])

# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
normal_SVC.fit(X_train, y_train)
end = time.perf_counter()
svc_exec_time = end - start

# RF
normal_RF = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestClassifier(random_state=SEED))  # seeded for reproducible results
])

start = time.perf_counter()
normal_RF.fit(X_train, y_train)
end = time.perf_counter()
rf_exec_time = end - start

# One (svc_seconds, rf_seconds) tuple per training scenario.
trainin_time.append((svc_exec_time, rf_exec_time))

# Save the fitted pipelines; the with-blocks make sure the file handles are closed.
filename = 'models/normal_svc.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(normal_SVC, model_file)
filename = 'models/normal_rf.pkl'
with open(filename, 'wb') as model_file:
    joblib.dump(normal_RF, model_file)

### Evaluation

Accuracy