In [1]:
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

from decision_tree import DecisionTree
from random_forest_classifier import RandomForestClassifier

In [2]:
# Generation of a synthetic data set 
X, y = make_blobs(n_samples=1000, centers=3, n_features=5, random_state=42)

# Division in a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Training and performance measure of a simple decision tree 
dt_model = DecisionTree(max_depth=5, min_samples_leaf=10)
dt_model.fit(X_train, y_train)
dt_accuracy = dt_model.score(X_test, y_test)
print(f"Accuracy of Decision Tree: {dt_accuracy:.2f}")

Accuracy of Decision Tree: 0.99


In [4]:
# Training and performance measure of a Random Forest Classifier
rd_model = RandomForestClassifier(n_trees=100, max_depth=5, min_samples_leaf=10)
rd_model.fit(X_train, y_train)
rd_accuracy = rd_model.score(X_test, y_test)
print(f"Accuracy of Random Forest: {rd_accuracy:.2f}")

Accuracy of Random Forest: 0.99


# Mushroom dataset

In [5]:
# required libraries
!pip install ucimlrepo



In [6]:
from ucimlrepo import fetch_ucirepo 


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800, 5) (200, 5) (800,) (200,)


In [9]:
# import dataset
heart_disease = fetch_ucirepo(id=45)
# alternatively: fetch_ucirepo(name='Heart Disease')

# access data
X = heart_disease.data.features
y = heart_disease.data.targets
# train model e.g. sklearn.linear_model.LinearRegression().fit(X, y)

# access metadata
print(heart_disease.metadata.uci_id)
print(heart_disease.metadata.num_instances)
print(heart_disease.metadata.additional_info.summary)

# access variable info in tabular format
print(heart_disease.variables)

45
303
This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them.  In particular, the Cleveland database is the only one that has been used by ML researchers to date.  The "goal" field refers to the presence of heart disease in the patient.  It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0).  
   
The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

One file has been "processed", that one containing the Cleveland database.  All four unprocessed files also exist in this directory.

To see Test Costs (donated by Peter Turney), please see the folder "Costs" 
        name     role         type demographic  \
0        age  Feature      Integer         Age   
1        sex  Feature  Categorical         Sex   
2         cp  Feature  Categ

In [10]:
# Training and performance measure of a simple decision tree 
mushroom_decision_tree = DecisionTree(max_depth=5, min_samples_leaf=10)
mushroom_decision_tree.fit(X_train, y_train)
mushroom_decision_tree_accuracy = mushroom_decision_tree.score(X_test, y_test)
print(f"Accuracy of Decision Tree: {mushroom_decision_tree_accuracy:.2f}")

Accuracy of Decision Tree: 0.99


In [11]:
# Training and performance measure of a Random Forest Classifier
mushroom_random_forest = RandomForestClassifier(n_trees=100, max_depth=5, min_samples_leaf=10)
mushroom_random_forest.fit(X_train, y_train)
mushroom_random_forest_accuracy = mushroom_random_forest.score(X_test, y_test)
print(f"Accuracy of Random Forest: {mushroom_random_forest_accuracy:.2f}")

Accuracy of Random Forest: 0.99
