In [1]:
!pip install tdml



In [2]:
import tdml
import numpy as np
import pandas as pd

In [3]:
# load the credit-g data from the OpenML website
# https://www.openml.org/d/31
url = "https://www.openml.org/data/get_csv/31/dataset_31_credit-g.arff"
df = pd.read_csv(url)
df

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',radio/tv,1169,'no known savings','>=7',4,'male single',none,4,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',radio/tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,2,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,3,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',furniture/equipment,7882,'<100','4<=X<7',2,'male single',guarantor,4,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously','new car',4870,'<100','1<=X<4',3,'male single',none,4,'no known property',53,none,'for free',2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,'no checking',12,'existing paid',furniture/equipment,1736,'<100','4<=X<7',3,'female div/dep/mar',none,4,'real estate',31,none,own,1,'unskilled resident',1,none,yes,good
996,'<0',30,'existing paid','used car',3857,'<100','1<=X<4',4,'male div/sep',none,4,'life insurance',40,none,own,1,'high qualif/self emp/mgmt',1,yes,yes,good
997,'no checking',12,'existing paid',radio/tv,804,'<100','>=7',4,'male single',none,4,car,38,none,own,1,skilled,1,none,yes,good
998,'<0',45,'existing paid',radio/tv,1845,'<100','1<=X<4',4,'male single',none,4,'no known property',23,none,'for free',1,skilled,1,yes,yes,bad


In [4]:
# load the dataframe into TDML dataset with specifying "class" as the label
ds = tdml.Dataset(df, label="class")
ds.transform() # transform the data in the dataset
ds.train_test_split(train_size=0.9, test_size=0.1, seed=1) # split into train and test with ratio 9:1
print(ds)

Dataset(label=[1000], feature=[1000, 20], train_x=[900, 20], train_y=[900], test_x=[100, 20], test_y=[100], label_mapping=2, feature_mapping=13)


In [10]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA", "SGD", "Logistic"]

classifiers = [
    KNeighborsClassifier(2),
    SVC(kernel="linear", C=0.05),
    SVC(gamma=1, C=0.05),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=8),
    RandomForestClassifier(max_depth=8, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    SGDClassifier(),
    LogisticRegression(solver='liblinear')]

X_train = StandardScaler().fit_transform(ds.train_x)
X_test = StandardScaler().fit_transform(ds.test_x)
y_train, y_test = ds.train_y, ds.test_y
for name, clf in zip(names, classifiers):
  clf.fit(X_train, y_train)
  accu = clf.score(X_test, y_test)
  print("{} accuracy is {}".format(name, accu))

Nearest Neighbors accuracy is 0.61
Linear SVM accuracy is 0.67
RBF SVM accuracy is 0.64
Gaussian Process accuracy is 0.64
Decision Tree accuracy is 0.61
Random Forest accuracy is 0.71
Neural Net accuracy is 0.74
AdaBoost accuracy is 0.74
Naive Bayes accuracy is 0.67
QDA accuracy is 0.69
SGD accuracy is 0.65
Logistic accuracy is 0.64
