In [6]:
# pre-pruning and post-pruning technique in decision tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

iris = load_iris() 
x = iris.data
y = iris.target

# split the data into training(70% and testing 30%)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

print(f"Shape of X_train: {x_train.shape}")
print(f"Shape of X_test: {x_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

# implement pre-pruning
# training the decision tree with gini index
for i in range(1, 5):
    for j in range(2, 5):
        for k in range(1, 5):
            decisiontree_gini = DecisionTreeClassifier(criterion='gini', max_depth=i, min_samples_split=j, min_samples_leaf=k)
            decisiontree_gini.fit(x_train, y_train)
            print(f"dpeth: {i}, split: {j}, minleaf: {k}")
            # evaluate the accuracy on the test set
            accuracy_gini = decisiontree_gini.score(x_test, y_test)
            print(f"accuracy {accuracy_gini}")

            # Plot the decision tree
            # plt.figure(figsize=(12, 8))
            # plot_tree(decisiontree_gini, 
            #         feature_names=iris.feature_names,  
            #         class_names=iris.target_names,
            #         filled=True, 
            #         rounded=True)
            # plt.show()

Shape of X_train: (105, 4)
Shape of X_test: (45, 4)
Shape of y_train: (105,)
Shape of y_test: (45,)
dpeth: 1, split: 2, minleaf: 1
accuracy 0.5555555555555556
dpeth: 1, split: 2, minleaf: 2
accuracy 0.5555555555555556
dpeth: 1, split: 2, minleaf: 3
accuracy 0.5555555555555556
dpeth: 1, split: 2, minleaf: 4
accuracy 0.5555555555555556
dpeth: 1, split: 3, minleaf: 1
accuracy 0.5555555555555556
dpeth: 1, split: 3, minleaf: 2
accuracy 0.5555555555555556
dpeth: 1, split: 3, minleaf: 3
accuracy 0.5555555555555556
dpeth: 1, split: 3, minleaf: 4
accuracy 0.5555555555555556
dpeth: 1, split: 4, minleaf: 1
accuracy 0.5555555555555556
dpeth: 1, split: 4, minleaf: 2
accuracy 0.5555555555555556
dpeth: 1, split: 4, minleaf: 3
accuracy 0.5555555555555556
dpeth: 1, split: 4, minleaf: 4
accuracy 0.5555555555555556
dpeth: 2, split: 2, minleaf: 1
accuracy 0.8666666666666667
dpeth: 2, split: 2, minleaf: 2
accuracy 0.8666666666666667
dpeth: 2, split: 2, minleaf: 3
accuracy 0.8666666666666667
dpeth: 2, split

In [7]:
# Get effective alphas and corresponding trees
decisiontree_gini = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=4, min_samples_leaf=1)
decisiontree_gini.fit(x_train, y_train)
accuracy_gini = decisiontree_gini.score(x_test, y_test)
print(f"accuracy {accuracy_gini}")

path = decisiontree_gini.cost_complexity_pruning_path(x_train, y_train)
alphas, impurities = path.ccp_alphas, path.impurities

# Train models for each alpha
models = []
for alpha in alphas:
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    clf.fit(x_train, y_train)
    models.append(clf)

for model in models:
    accuracy_gini = model.score(x_test, y_test)
    print(f"accuracy {accuracy_gini}")
    

accuracy 0.8666666666666667
accuracy 0.9555555555555556
accuracy 0.5555555555555556
accuracy 0.2222222222222222


In [8]:
# 2. Naive Bayes on Iris Dataset

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# (a) Train a Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
conf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
print(f"Gaussian Naive Bayes Accuracy: {accuracy_gnb}")
print(f"Gaussian Naive Bayes Confusion Matrix:\n{conf_matrix_gnb}")

# (b) Train a Multinomial Naive Bayes model
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
f1_mnb = f1_score(y_test, y_pred_mnb, average='weighted')
print(f"Multinomial Naive Bayes Accuracy: {accuracy_mnb}")
print(f"Multinomial Naive Bayes F1-Score: {f1_mnb}")

Gaussian Naive Bayes Accuracy: 0.9777777777777777
Gaussian Naive Bayes Confusion Matrix:
[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]
Multinomial Naive Bayes Accuracy: 0.9555555555555556
Multinomial Naive Bayes F1-Score: 0.9555555555555556
