In [1]:
# Install the two third-party packages this notebook imports besides the
# scientific stack: fpdf (PDF assembly) and graphviz (tree rendering).
# NOTE(review): the graphviz pip package presumably still needs the Graphviz
# `dot` executable installed on the system PATH — confirm on a fresh machine.
pip install fpdf graphviz

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import graphviz
from fpdf import FPDF

# Load the pickup records and prepare them in a single pipeline: location
# labels lose every run of digits and surrounding whitespace, rows with any
# missing value are dropped, and the hour of day is derived from the timestamp.
ds = (
    pd.read_csv(r"C:\Users\Jkvan\anaconda3\Dummy_Trash_Pickup_Dataset.csv")
    .assign(
        Location=lambda frame: frame['Location']
        .str.replace(r'\d+', '', regex=True)
        .str.strip()
    )
    .dropna()
    .assign(
        # assign() evaluates keyword arguments in order, so Hour is computed
        # from the already-converted Time column.
        Time=lambda frame: pd.to_datetime(frame['Time']),
        Hour=lambda frame: frame['Time'].dt.hour,
    )
)

# Model inputs: hour of day and temperature; target: the cleaned location.
features = ['Hour', 'Temperature']
X = ds[features]

# Encode the location strings as integer class labels; `le` is kept so the
# codes can be mapped back to names later.
le = LabelEncoder()
y = le.fit_transform(ds['Location'])

# Hold out a third of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

# Entropy-based forest of 50 trees, each at most 6 levels deep.
rf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=6,
    random_state=101,
)
rf.fit(X_train, y_train)

def plot_tree_classification(model, features, class_names, output_file='random_forest'):
    """Render every tree of a random forest to PNG and bundle them into one PDF.

    Each estimator in ``model.estimators_`` is exported to Graphviz DOT,
    rendered to ``<output_file>tree<k>.png`` (k starts at 1) and placed on its
    own page of ``<output_file>.pdf``.

    Parameters
    ----------
    model : RandomForestClassifier
        Fitted forest whose individual trees are drawn.
    features : sequence of str
        Feature names forwarded to ``tree.export_graphviz``.
    class_names : sequence of str
        Class names forwarded to ``tree.export_graphviz``.
    output_file : str, default 'random_forest'
        Prefix used for the per-tree PNG files and for the final PDF.

    Returns
    -------
    graphviz.Source or None
        The Graphviz source object of the last rendered tree, or ``None`` when
        the forest has no estimators.

    Raises
    ------
    ValueError
        If ``model`` is not a ``RandomForestClassifier``.
    """
    # Reject unsupported models up front, before any file is created.
    if not isinstance(model, RandomForestClassifier):
        raise ValueError("The model is not a RandomForestClassifier.")

    pdf = FPDF()
    graph = None  # stays None if the forest has no trees, so the return is always bound
    for tree_number, tree_model in enumerate(model.estimators_, start=1):
        dot_data = tree.export_graphviz(
            tree_model,
            out_file=None,
            feature_names=features,
            class_names=class_names,
            filled=True,
            rounded=True,
            special_characters=True,
        )
        graph = graphviz.Source(dot_data)
        # render() appends the format extension itself and returns the path of
        # the rendered file, so pass the bare base name (no ".png") and reuse
        # the returned path. cleanup=True removes the intermediate DOT source
        # file that would otherwise be left next to the image.
        image_path = graph.render(
            filename=f"{output_file}tree{tree_number}",
            format='png',
            cleanup=True,
        )
        pdf.add_page()
        pdf.image(image_path, x=10, y=10, w=180)

    pdf_output_file = f"{output_file}.pdf"
    pdf.output(pdf_output_file)
    print(f"All trees saved in {pdf_output_file}.")
    return graph

feature_names = X.columns
# Label the tree nodes with the original location names instead of the integer
# codes produced by the LabelEncoder. rf.classes_ holds the encoded labels the
# forest was actually fitted on, in ascending order, so decoding exactly those
# keeps the names aligned with the class indices used inside each tree.
class_names = le.inverse_transform(rf.classes_).astype(str)
plot_tree_classification(rf, feature_names, class_names)

def calculate_accuracy(predictions, actuals):
    """Return the fraction of predictions that equal the corresponding actual.

    Both inputs may be any equally long sequence (list, tuple, NumPy array,
    pandas Series). They are compared element by element by position.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    actuals : array-like
        True labels, same length as ``predictions``.

    Returns
    -------
    float
        Number of matching positions divided by the number of actuals.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(predictions) != len(actuals):
        raise ValueError("The amount of predictions did not equal the amount of actuals")
    # Coerce to arrays so that `==` is element-wise for plain lists too, and so
    # pandas objects are compared by position rather than by index label.
    predicted = np.asarray(predictions)
    expected = np.asarray(actuals)
    return (predicted == expected).sum() / len(expected)

# Predict on the training rows first, then on the held-out rows.
predictionsOnTrainset, predictionsOnTestset = (
    rf.predict(split) for split in (X_train, X_test)
)

# Compare each set of predictions against its true labels.
accuracyTrain, accuracyTest = (
    calculate_accuracy(predicted, actual)
    for predicted, actual in (
        (predictionsOnTrainset, y_train),
        (predictionsOnTestset, y_test),
    )
)

# `!s` formats the scores with str(), matching plain string concatenation.
print(f"Accuracy on training set {accuracyTrain!s}")
print(f"Accuracy on test set {accuracyTest!s}")

: 