<a href="https://colab.research.google.com/github/Flabert/Calculator/blob/main/Random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Load the iris dataset: `x` is the feature matrix, `y` the class labels
iris = load_iris()
x = iris.data
y = iris.target

# Basic dataset overview
print(f"Dataset shape: {x.shape}")
print(f"Classes: {iris.target_names}")
print(f"Feature names: {iris.feature_names}")

# Visualize a pairplot to see the relationships between features.
# The pairplot plots every pair of features against each other, coloured by
# species, giving a visual understanding of the dataset structure.
#
# The frame is built from the scikit-learn dataset rather than from
# sns.load_dataset("iris"), which fetches a separate copy of the data over
# the network: the plot then works offline and shows exactly the samples
# that the model is trained and tested on.
iris_frame = load_iris(as_frame=True).frame.drop(columns="target")
# Translate the integer class ids into species names for the legend
iris_frame["species"] = iris.target_names[iris.target]
sns.pairplot(iris_frame, hue="species")
plt.show()

# Split the data into training (80%) and testing (20%) sets.
# stratify=y preserves the class proportions of `y` in both splits, so the
# small test set is not skewed towards one species; random_state fixes the
# shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Define the hyperparameter grid; every combination of these values is tried
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Perform the grid search with 5-fold cross-validation, scored on accuracy.
# The tree is seeded because DecisionTreeClassifier permutes the features at
# each split and may pick differently between equally good splits; without a
# fixed random_state the selected parameters, the fitted tree and its feature
# importances can change from run to run.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

# Use the best model found by the grid search
model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = model.predict(x_test)

# Generate the confusion matrix (true labels first, predictions second)
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix as an annotated heatmap
class_labels = iris.target_names
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Generate the classification report for the test-set predictions,
# labelling each class with its species name
cr = classification_report(
    y_test,
    y_pred,
    target_names=iris.target_names,
)
print(f"Classification Report: \n{cr}")

# Get the feature importances from the fitted model
feature_importances = model.feature_importances_

# Order the features from most to least important
features = iris.feature_names
indices = np.flip(np.argsort(feature_importances))
ranked_names = [features[idx] for idx in indices]
bar_positions = range(x.shape[1])

# Plot the feature importances as a bar chart, one bar per feature
plt.figure(figsize=(8, 6))
plt.bar(bar_positions, feature_importances[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.title("Feature Importances")
plt.show()

# Visualise the decision tree.
# class_names is converted to a plain list: iris.target_names is a NumPy
# array, and some scikit-learn releases validate this parameter strictly and
# reject an array, while a list is accepted by every release.
plt.figure(figsize=(12, 8))
plot_tree(
    model,
    filled=True,
    feature_names=iris.feature_names,
    class_names=list(iris.target_names),
    rounded=True,
)
plt.title("Decision Tree Visualization")
plt.show()

# Save the model to disk
model_path = 'decision_tree_model.pkl'
joblib.dump(model, model_path)

# Load the model back from the same file
loaded_model = joblib.load(model_path)

# Verify the loaded model by scoring its predictions on the test set
y_loaded_pred = loaded_model.predict(x_test)
accuracy_loaded = accuracy_score(y_test, y_loaded_pred)
print(f"Accuracy of the loaded model: {accuracy_loaded:.2f}")



