In [12]:
# libraries
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Files


In [13]:
# Create synthetic dataset
num_samples = 500
num_features = 3
num_informative = 3
num_redundant = 0
num_clusters_per_class = 1
num_classes = 5
random_state = 42 # Meaning of life

X, y = make_classification(
    n_samples=num_samples, 
    n_features=num_features,
    n_informative=num_informative,
    n_redundant=num_redundant,
    n_clusters_per_class=num_clusters_per_class,
    n_classes=num_classes,
    random_state=random_state
)

df = pd.DataFrame(X, columns=['Length(cm)', 'Weight(g)', 'Brightness'])

# 1. Make all values non-negative
df = df.abs()

# 2. Scale 'Length(cm)' to be between 10 and 100
length_min, length_max = df['Length(cm)'].min(), df['Length(cm)'].max()
df['Length(cm)'] = 10 + (df['Length(cm)'] - length_min) * (90 / (length_max - length_min))

# 3. Scale 'Weight(g)' to be between 50 and 5000
weight_min, weight_max = df['Weight(g)'].min(), df['Weight(g)'].max()
df['Weight(g)'] = 50 + (df['Weight(g)'] - weight_min) * (4950 / (weight_max - weight_min))

# 4. Scale brightness to be between 0 and 10
brightness_min, brightness_max = df['Brightness'].min(), df['Brightness'].max()
df['Brightness'] = 0 + (df['Brightness'] - brightness_min) * (10 / (brightness_max - brightness_min))

# 5. Give tangible names to the classes and add them to dataframe
fish_types = {
    0: 'Salmon',
    1: 'Tuna',
    2: 'Trout',
    3: 'Bass',
    4: 'Mackerel'
}
df['Class'] = np.vectorize(fish_types.get)(y)

df

Unnamed: 0,Length(cm),Weight(g),Brightness,Class
0,30.447050,2164.840540,3.351783,Tuna
1,25.334927,1536.982421,2.664048,Bass
2,12.107737,442.859404,2.491295,Salmon
3,14.376890,1806.842131,2.591037,Bass
4,47.720512,1128.870753,0.896277,Bass
...,...,...,...,...
495,30.426427,1081.102787,2.804622,Trout
496,66.070650,1366.238307,0.909074,Trout
497,30.173622,2546.033160,1.371208,Tuna
498,41.985169,2183.583914,3.262593,Salmon


In [14]:
# Split the dataset
X_scaled = df.drop('Class', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Decision Trees

In [15]:
# Decision Tree variables
model_criterion = 'entropy'
model_max_depth = 4
model_random_state = 42

In [16]:
# Execute the self-made decision tree
sm_clf = dt.DecisionTree(
    criterion = model_criterion, 
    max_depth = model_max_depth,
    random_state = model_random_state
)

sm_clf.fit(X_train, y_train)
sm_clf_predictions = sm_clf.predict(X_test)


In [17]:
# Execute the sklearn decision tree for comparison
sk_clf = DecisionTreeClassifier(
    criterion= model_criterion, 
    max_depth= model_max_depth, 
    random_state = model_random_state
)

sk_clf.fit(X_train, y_train)
sk_clf_predictions = sk_clf.predict(X_test)

In [18]:
# Compare accuracies
sm_accuracy = accuracy_score(y_test, sm_clf_predictions)
sk_accuracy = accuracy_score(y_test, sk_clf_predictions)
print(f"Self-Made Decision Tree Accuracy on the test set: {sm_accuracy:.2f}")
print(f"Scikit-learn Decision Tree Accuracy on the test set: {sk_accuracy:.2f}")

Self-Made Decision Tree Accuracy on the test set: 0.31
Scikit-learn Decision Tree Accuracy on the test set: 0.34


# Random Forests

In [19]:
model_num_estimators = 50
model_criterion = 'entropy'
model_max_depth = 4
model_random_state = 42
model_max_features = 'sqrt'

In [20]:
# Executing self-made random forest
sm_rf = dt.RandomForest(
    n_estimators = model_num_estimators,
    criterion = model_criterion,
    max_depth = model_max_depth,
    max_features = model_max_features,
    random_state = model_random_state
)
sm_rf.fit(X_train, y_train)
sm_rf_predictions = sm_rf.predict(X_test)

In [21]:
# Executing sklearn random forest
sk_rf = RandomForestClassifier(
    n_estimators = model_num_estimators,
    criterion = model_criterion,
    max_depth = model_max_depth,
    max_features = model_max_features,
    random_state = model_random_state
)

sk_rf.fit(X_train, y_train)
sk_rf_predictions = sk_rf.predict(X_test)

In [22]:
# Compare accuracies
sm_rf_accuracy = accuracy_score(y_test, sm_rf_predictions)
sk_rf_accuracy = accuracy_score(y_test, sk_rf_predictions)
print(f"Self-Made Random Forest Accuracy on the test set: {sm_rf_accuracy:.2f}")
print(f"Scikit-learn Random Forest Accuracy on the test set: {sk_rf_accuracy:.2f}")

Self-Made Random Forest Accuracy on the test set: 0.37
Scikit-learn Random Forest Accuracy on the test set: 0.39
