# Using sci-kit learn for decision trees

In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [8]:
d = ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorites", "free_sulfir_dioxide","total_sulfur_dioxide", "density", "ph", "sulphates", "alcohol"]
t = "quality"

df = pd.read_csv('../../datasets/winequality.decision-trees.csv', sep=',', names=d + [t], header=0)

# Label Encoding
df[t] = pd.cut(df[t], bins=range(2, 12, 3), right=False, labels=['Goor','Matig', 'Goed'])

df[t].unique()

['Matig', 'Goed', 'Goor']
Categories (3, object): ['Goor' < 'Matig' < 'Goed']

In [9]:
X = df.drop(t,axis=1)
y = df[t]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # random_state=42


In [10]:
classifier = DecisionTreeClassifier(criterion='entropy')
wine_tree = classifier.fit(X_train, y_train)

# tree.plot_tree(wine_tree, feature_names=d, fontsize = 12,
#                       class_names=['Goor','Matig', 'Goed'],  
#                       filled=True, rounded=True) 
f"{len(y_train)} records used"

'3918 records used'

In [11]:
prediction = wine_tree.predict(X_test)
f"Accuracy: {accuracy_score(y_test, prediction)}" 

'Accuracy: 0.9081632653061225'

In [12]:
print("Classification Report:\n", classification_report(y_test, prediction))

Classification Report:
               precision    recall  f1-score   support

        Goed       0.49      0.48      0.48        42
        Goor       0.33      0.31      0.32        36
       Matig       0.95      0.95      0.95       902

    accuracy                           0.91       980
   macro avg       0.59      0.58      0.58       980
weighted avg       0.91      0.91      0.91       980

