In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, ShuffleSplit, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Single random seed reused by the train/test split and the baseline estimators
# so that their results can be repeated from run to run.
seed = 42

In [3]:
# Relative path to the numeric data file (presumably the numeric encoding of
# the German credit dataset, going by the file name -- TODO confirm provenance).
data_path = './data/german.data-numeric'
# Parse the text file into a 2-D float array, one row per record.
credit = np.genfromtxt(data_path)

# Quick look at the parsed array (numpy abbreviates large arrays when printing).
print(credit)

[[ 1.  6.  4. ...  0.  1.  1.]
 [ 2. 48.  2. ...  0.  1.  2.]
 [ 4. 12.  4. ...  1.  0.  1.]
 ...
 [ 4. 12.  2. ...  0.  1.  1.]
 [ 1. 45.  2. ...  0.  1.  2.]
 [ 2. 45.  4. ...  0.  1.  1.]]


In [6]:
# All columns but the last are used as features; the last column is the target
# that the classifiers below are trained to predict.
X, y = credit[:, :-1], credit[:, -1]

print(X.shape, y.shape)

(1000, 24) (1000,)


In [7]:
# Hold out 20% of the rows for testing; the shared seed fixes the shuffle.
# NOTE(review): the split is not stratified although the target looks
# imbalanced -- consider stratify=y; left unchanged here to keep the split as is.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

print(X_train.shape, y_train.shape)

(800, 24) (800,)


In [9]:
# Baseline classifiers with default hyperparameters; passing the shared seed
# makes their fitted results repeatable.
DT = DecisionTreeClassifier(random_state = seed)
RF = RandomForestClassifier(random_state = seed)

In [15]:
print('Demo DT prediction:')

# Fit the baseline tree on the training split and predict hard labels for the
# held-out split (fit() returns the estimator, so the calls can be chained).
y_dt_pred = DT.fit(X_train, y_train).predict(X_test)

# f1_score is called with its default positive label (1).
# NOTE(review): roc_auc_score receives hard class labels here rather than
# probabilities/scores, so the value reflects a single threshold only --
# consider DT.predict_proba(X_test)[:, 1] instead.
for label_fmt, score_fn in (
    ("DT acc = {}", accuracy_score),
    ("DT f1 = {}", f1_score),
    ("DT roc auc = {}", roc_auc_score),
):
    print(label_fmt.format(score_fn(y_test, y_dt_pred)))
print(confusion_matrix(y_test, y_dt_pred))

Demo DT prediction:
DT acc = 0.665
DT f1 = 0.7632508833922261
DT roc auc = 0.5948431301839163
[[108  33]
 [ 34  25]]


In [16]:
# Per-class precision / recall / F1 for the decision tree's test-set predictions.
print(classification_report(y_test, y_dt_pred))

              precision    recall  f1-score   support

         1.0       0.76      0.77      0.76       141
         2.0       0.43      0.42      0.43        59

    accuracy                           0.67       200
   macro avg       0.60      0.59      0.60       200
weighted avg       0.66      0.67      0.66       200



In [13]:
print('Demo RF prediction:')

# Fit the baseline forest on the training split and predict hard labels for the
# held-out split (fit() returns the estimator, so the calls can be chained).
y_rf_pred = RF.fit(X_train, y_train).predict(X_test)

# f1_score is called with its default positive label (1).
# NOTE(review): roc_auc_score receives hard class labels here rather than
# probabilities/scores, so the value reflects a single threshold only --
# consider RF.predict_proba(X_test)[:, 1] instead.
for label_fmt, score_fn in (
    ("RF acc = {}", accuracy_score),
    ("RF f1 = {}", f1_score),
    ("RF roc auc = {}", roc_auc_score),
):
    print(label_fmt.format(score_fn(y_test, y_rf_pred)))
print(confusion_matrix(y_test, y_rf_pred))


Demo RF prediction:
RF acc = 0.81
RF f1 = 0.875
RF roc auc = 0.7173939175381656
[[133   8]
 [ 30  29]]


In [14]:
# Per-class precision / recall / F1 for the random forest's test-set predictions.
print(classification_report(y_test, y_rf_pred))

              precision    recall  f1-score   support

         1.0       0.82      0.94      0.88       141
         2.0       0.78      0.49      0.60        59

    accuracy                           0.81       200
   macro avg       0.80      0.72      0.74       200
weighted avg       0.81      0.81      0.80       200



In [17]:
def grid_search(algorithm, n_jobs, dict_param):
    """Tune a tree-based classifier on the training split with a 5-fold grid search.

    Reads the notebook-level ``X_train`` / ``y_train`` arrays and ``seed``.

    Parameters
    ----------
    algorithm : str
        ``'decision-tree'`` selects ``DecisionTreeClassifier``; any other value
        falls back to ``RandomForestClassifier``.
    n_jobs : int or None
        Number of parallel jobs, forwarded to ``GridSearchCV``.
    dict_param : dict or list of dict
        Parameter grid, forwarded to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    estimator
        The best estimator found (``GridSearchCV.best_estimator_``); it is also
        printed.

    Notes
    -----
    Candidates are ranked with the ``'f1'`` scorer, i.e. binary F1 with
    scikit-learn's default positive label (1).
    """
    # Seed the base estimator so that repeated searches select the same model;
    # an unseeded tree/forest makes the CV scores vary from run to run.
    # A 'random_state' entry in dict_param still takes precedence.
    if algorithm == 'decision-tree':
        model = DecisionTreeClassifier(random_state=seed)
    else:
        model = RandomForestClassifier(random_state=seed)

    classifier = GridSearchCV(
        estimator=model,
        param_grid=dict_param,
        scoring='f1',
        cv=5,
        n_jobs=n_jobs,
    )
    classifier.fit(X_train, y_train)

    print(classifier.best_estimator_)
    return classifier.best_estimator_

In [18]:
def _positive_class_scores(model, X):
    """Return continuous scores for the class with the greater label (binary case)."""
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of model.classes_[1], which is the
        # score roc_auc_score expects for binary targets.
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    # Last resort: hard labels. ROC AUC then reflects a single threshold only.
    return model.predict(X)


def _report_split(name, model, X, y, pos_label):
    """Print accuracy, F1, ROC AUC and the confusion matrix for one data split."""
    y_pred = model.predict(X)  # predict once and reuse for every metric
    print(name + " Accuracy :", accuracy_score(y, y_pred))
    print(name + " f1 score :", f1_score(y, y_pred, pos_label=pos_label))
    print(name + " roc auc :", roc_auc_score(y, _positive_class_scores(model, X)))
    print(name + " Confusion Matrix:")
    print(confusion_matrix(y, y_pred))


def evaluate(model, pos_label=1):
    """Print train and test metrics for a fitted binary classifier.

    Reads the notebook-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``
    arrays.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``. ``predict_proba`` (preferred) or
        ``decision_function`` is used for ROC AUC when available.
    pos_label : int or float, default 1
        Label treated as the positive class by the F1 score. The default
        matches scikit-learn's own default, so existing ``evaluate(model)``
        calls report the same F1 as before.

    Notes
    -----
    ROC AUC is computed from continuous scores rather than from thresholded
    predictions; feeding hard labels to ``roc_auc_score`` only evaluates a
    single operating point. Nothing is returned; all results are printed.
    """
    _report_split("Train", model, X_train, y_train, pos_label)

    print("-" * 50)

    _report_split("Test", model, X_test, y_test, pos_label)