In [1]:
import graphviz
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

In [2]:
trainingSet = pd.read_csv('data/training.csv')
testingSet = pd.read_csv('data/testing.csv')

In [3]:
# training data
x_train = trainingSet.iloc[:, :-1]  # features
y_train = trainingSet.iloc[:, -1]  # labels

In [4]:
# testing data
x_test = testingSet.iloc[:, :-1]  # features
y_test = testingSet.iloc[:, -1]  # labels

In [5]:
scaler = MinMaxScaler(copy=True, feature_range=(0,1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [6]:
classifier = tree.DecisionTreeClassifier()  # default is gini which gives better results if data is binary
classifier.fit(x_train, y_train)

In [7]:
classifier.get_params()  # 

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [8]:
y_predicted = classifier.predict(x_test)

In [9]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

          -1       0.80      0.79      0.79      2006
           1       0.79      0.80      0.80      2006

    accuracy                           0.79      4012
   macro avg       0.79      0.79      0.79      4012
weighted avg       0.79      0.79      0.79      4012



In [10]:
labels = ['h','g']
columns = [f'Predicted {label}' for label in labels]
index = [f'Actual {label}' for label in labels]
table = pd.DataFrame(confusion_matrix(y_test, y_predicted),
                     columns=columns, index=index)

table

Unnamed: 0,Predicted h,Predicted g
Actual h,1582,424
Actual g,400,1606


In [11]:
acc = accuracy_score(y_test, y_predicted)
prec = precision_score(y_test, y_predicted)
recall = recall_score(y_test, y_predicted)
f1 = f1_score(y_test, y_predicted)

print('model_accuracy = ', acc)
print('model_precision = ', prec)
print('model_recall = ', recall)
print('model_f1 = ', f1)

model_accuracy =  0.7946161515453639
model_precision =  0.7911330049261084
model_recall =  0.8005982053838484
model_f1 =  0.7958374628344895


In [None]:
# import graphviz
# dot_data = tree.export_graphviz(classifier, out_file=None)
# graph = graphviz.Source(dot_data)
# graph.render("decisiontree")