In [63]:
import joblib
import pandas as pd
import numpy as np
import psutil

from sklearn.metrics import confusion_matrix, f1_score, ConfusionMatrixDisplay, classification_report,precision_recall_fscore_support, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# models
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC

# counter 
from collections import Counter

# time 
import time

#table
from tabulate import tabulate


In [64]:
def object_to_cat(df):
    tmp = df.copy()
    for col in tmp.columns:
        if np.dtype(tmp[col]) == "object":
            tmp[col] = tmp[col].astype("category")
    return tmp

df = pd.read_csv("dataset/german_clean.csv")
df = object_to_cat(df)


In [65]:
SEED = 42
TARGET = 'class'
FEATURES = df.columns.drop(TARGET)

NUMERICAL = df[FEATURES].select_dtypes('number').columns
CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))

X = df.drop(columns=TARGET)
y = df[TARGET]


In [66]:
print('original dataset class distribution:', sorted(Counter(y).items()))

original dataset class distribution: [(0, 297), (1, 698)]


#### Pipeline

In [67]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, NUMERICAL),
        ('cat', categorical_pipe, CATEGORICAL)
    ])

Hold out the testing set

In [68]:
# Split train-test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=SEED,stratify=df[TARGET])


In [69]:
print("train size: ",X_train.shape, y_train.shape)
print("test size: ",X_test.shape, y_test.shape)
print('normal class distribution:', sorted(Counter(y_train).items()))

train size:  (796, 20) (796,)
test size:  (199, 20) (199,)
normal class distribution: [(0, 238), (1, 558)]


In [70]:
# result lists
# indexs = 0:over, 1:under, 2:normal 

trainin_time = []
memory_usage = []


### Over-sampling

In [71]:
# oversampling usign random oversampling technique 
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=SEED)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

print('over sampled class distribution:',sorted(Counter(y_oversampled).items()))


over sampled class distribution: [(0, 558), (1, 558)]


In [72]:
print(psutil.Process().memory_info().rss / (1024 * 1024))

159.89453125


In [73]:
# SVC 
over_sampled_SVC = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearSVC())
])

start = time.time()

start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

over_sampled_SVC.fit(X_oversampled, y_oversampled)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)
#  ======================

svc_exec_time = end - start
mem_used = finished_mem - start_mem

print("over_SVC : ",svc_exec_time, "mem: ",mem_used)

trainin_time.append(svc_exec_time)
memory_usage.append(mem_used)


#  =========================================================

# RF
over_sampled_RF = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

start = time.time()
start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

over_sampled_RF.fit(X_oversampled, y_oversampled)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)
#  ======================

rf_exec_time = end - start
mem_used = finished_mem - start_mem

trainin_time.append(rf_exec_time)
memory_usage.append(mem_used)
print("over_Rf : ", rf_exec_time, "mem: ", mem_used)
#  =========================================================
filename = 'models/over_svc.pkl'
joblib.dump(over_sampled_SVC, open(filename, 'wb'))
filename = 'models/over_rf.pkl'
joblib.dump(over_sampled_RF, open(filename, 'wb'))


over_SVC :  0.03499889373779297 mem:  1.390625
over_Rf :  0.22403907775878906 mem:  1.62890625


### Under-sampling

In [74]:
# under-sampling using random undersampling technique 
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=SEED)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

print('over sampled class distribution:',sorted(Counter(y_undersampled).items()))


over sampled class distribution: [(0, 238), (1, 238)]


In [75]:
# SVC
under_sampled_SVC = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearSVC())
])

start = time.time()
start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

under_sampled_SVC.fit(X_undersampled, y_undersampled)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)

#  ========================
svc_exec_time = end - start
mem_used = finished_mem - start_mem


trainin_time.append(svc_exec_time)
memory_usage.append(mem_used)

#  =========================================================

under_sampled_RF = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])


start = time.time()
start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

under_sampled_RF.fit(X_undersampled, y_undersampled)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)
#  ========================

rf_exec_time = end - start
mem_used = finished_mem - start_mem

trainin_time.append(rf_exec_time)
memory_usage.append(mem_used)


#  =========================================================
filename = 'models/under_svc.pkl'
joblib.dump(under_sampled_SVC, open(filename, 'wb'))
filename = 'models/under_rf.pkl'
joblib.dump(under_sampled_RF, open(filename, 'wb'))


### Normal dataset

In [76]:
# SVC
normal_SVC = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearSVC())
])


start = time.time()
start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

normal_SVC.fit(X_train, y_train)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)
#  ========================
svc_exec_time = end - start
mem_used = finished_mem - start_mem

trainin_time.append(svc_exec_time)
memory_usage.append(mem_used)

#  =========================================================
# RF
normal_RF = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

start = time.time()
start_mem = psutil.Process().memory_info().rss / (1024 * 1024)

normal_RF.fit(X_train, y_train)

end = time.time()
finished_mem = psutil.Process().memory_info().rss / (1024 * 1024)
#  ========================
rf_exec_time = end - start
mem_used = finished_mem - start_mem

trainin_time.append(rf_exec_time)
memory_usage.append(mem_used)

#  =========================================================
filename = 'models/normal_svc.pkl'
joblib.dump(normal_SVC, open(filename, 'wb'))
filename = 'models/normal_rf.pkl'
joblib.dump(normal_RF, open(filename, 'wb'))


### Evaluation

Training Time

In [77]:
memory_usage

[1.390625, 1.62890625, 0.0234375, 0.3984375, 0.77734375, 1.96484375]

Accuracy

In [78]:
models = [over_sampled_SVC, over_sampled_RF, under_sampled_SVC, under_sampled_RF, normal_SVC, normal_RF]

confusion_met = []
false_negatives = []
false_positives = []
cls_specificity = []

cls_acc = []
cls_precision = []
cls_recall = []
cls_f1 = []

test_time = []
test_mem = []

In [79]:
for model in models:
    start = time.time()
    start_mem = psutil.Process().memory_info().rss / (1024 * 1024)
    
    y_pred = model.predict(X_test)

    end = time.time()
    end_mem = psutil.Process().memory_info().rss / (1024 * 1024)
    time_used = end - start
    mem_used = end_mem - start_mem

    test_time.append(time_used)
    test_mem.append(mem_used)

    accuracy = model.score(X_test, y_test)
    cls_acc.append(accuracy)

    score = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    cls_precision.append(score[0])
    cls_recall.append(score[1])
    cls_f1.append(score[2])
   
    confusion_met.append(classification_report(y_test, y_pred, target_names=['bad','good']))
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    false_negatives.append(int(FN))
    false_positives.append(int(FP))
    specificity = TN / (FP + TN)

    cls_specificity.append(specificity)
    


In [80]:

info_tables = [['Total instances', 'Training instance', 'Testing instances'], [X.shape[0], X_train.shape[0], X_test.shape[0]]]

print(tabulate(info_tables, tablefmt='fancy_grid'))


╒═════════════════╤═══════════════════╤═══════════════════╕
│ Total instances │ Training instance │ Testing instances │
├─────────────────┼───────────────────┼───────────────────┤
│ 995             │ 796               │ 199               │
╘═════════════════╧═══════════════════╧═══════════════════╛


In [81]:

classifers = ['Over SVC', 'Over Random forest','Under SVC','Under Random forest', 'Normal SVC', 'Normal Random forest']

tables = {
    'Classifiers' : classifers,
    'Accuracy' : cls_acc,
    'Precision' : cls_precision,
    'Recall' : cls_recall,
    'F1' : cls_f1,
    'Testing time' : test_time,
    'Testing memory' : test_mem
}
print(tabulate(tables, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))


╒══════════════════════╤════════════╤═════════════╤══════════╤════════╤════════════════╤══════════════════╕
│ Classifiers          │   Accuracy │   Precision │   Recall │     F1 │   Testing time │   Testing memory │
╞══════════════════════╪════════════╪═════════════╪══════════╪════════╪════════════════╪══════════════════╡
│ Over SVC             │     0.7286 │      0.7741 │   0.7286 │ 0.7393 │         0.0090 │           0.2695 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────┼────────────────┼──────────────────┤
│ Over Random forest   │     0.7538 │      0.7429 │   0.7538 │ 0.7458 │         0.0190 │           0.1094 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────┼────────────────┼──────────────────┤
│ Under SVC            │     0.7236 │      0.7547 │   0.7236 │ 0.7327 │         0.0060 │           0.0000 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────┼────────────────┼──────────────────┤
│ Under Random forest  │    

In [82]:
classifers = ['Over SVC', 'Over Random forest', 'Under SVC',
              'Under Random forest', 'Normal SVC', 'Normal Random forest']

false_positive_tables = {
    'Classifiers': classifers,
    'No. False Positive': false_positives,
    'Precision': cls_precision,
    'Recall': cls_recall,
    'Specificity': cls_specificity
}
print(tabulate(false_positive_tables, headers='keys',
      tablefmt='fancy_grid', floatfmt=".4f"))


╒══════════════════════╤══════════════════════╤═════════════╤══════════╤═══════════════╕
│ Classifiers          │   No. False Positive │   Precision │   Recall │   Specificity │
╞══════════════════════╪══════════════════════╪═════════════╪══════════╪═══════════════╡
│ Over SVC             │                   14 │      0.7741 │   0.7286 │        0.7627 │
├──────────────────────┼──────────────────────┼─────────────┼──────────┼───────────────┤
│ Over Random forest   │                   30 │      0.7429 │   0.7538 │        0.4915 │
├──────────────────────┼──────────────────────┼─────────────┼──────────┼───────────────┤
│ Under SVC            │                   18 │      0.7547 │   0.7236 │        0.6949 │
├──────────────────────┼──────────────────────┼─────────────┼──────────┼───────────────┤
│ Under Random forest  │                   17 │      0.7425 │   0.6935 │        0.7119 │
├──────────────────────┼──────────────────────┼─────────────┼──────────┼───────────────┤
│ Normal SVC         

In [83]:
for idx, cm in enumerate(confusion_met):
    print(classifers[idx])
    print(cm,'\n=====================================================')


Over SVC
              precision    recall  f1-score   support

         bad       0.53      0.76      0.62        59
        good       0.88      0.71      0.79       140

    accuracy                           0.73       199
   macro avg       0.70      0.74      0.71       199
weighted avg       0.77      0.73      0.74       199
 
Over Random forest
              precision    recall  f1-score   support

         bad       0.60      0.49      0.54        59
        good       0.80      0.86      0.83       140

    accuracy                           0.75       199
   macro avg       0.70      0.68      0.69       199
weighted avg       0.74      0.75      0.75       199
 
Under SVC
              precision    recall  f1-score   support

         bad       0.53      0.69      0.60        59
        good       0.85      0.74      0.79       140

    accuracy                           0.72       199
   macro avg       0.69      0.72      0.69       199
weighted avg       0.75      0.72 