In [None]:
# pip install python-mnist will install the required package
from mnist import MNIST
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
class SlpGeneticForestClassifier:
    """Majority-vote ensemble of decision trees with genetically searched hyper-parameters.

    ``fit`` holds out 20% of the training data for validation, builds an
    initial population of ``N`` hyper-parameter sets, evolves it for
    ``generation_number`` generations and finally trains one
    ``DecisionTreeClassifier`` per surviving set on a bootstrap resample of the
    remaining data. ``predict`` returns the per-sample majority vote.

    Requirements visible from the code: ``X_train`` / ``y_train`` are pandas
    objects, ``y_train`` exposes a ``label`` column, and the labels are
    non-negative integers (votes are counted with ``np.bincount``).
    """

    # Hyper-parameters drawn as floats; every other one is drawn as an integer.
    _FLOAT_PARAMETERS = ("min_impurity_decrease",)
    # Per-parameter probability that ``mutate`` redraws a value.
    _MUTATION_RATE = 0.1
    # Number of distinct candidates compared in one tournament.
    _TOURNAMENT_SIZE = 2

    def __init__(self, N, generation_number):
        """Store the ensemble size ``N`` and the number of generations to evolve.

        ``generation_number=0`` skips evolution and uses the initial population.
        """
        self.N = N
        self.generation_number = generation_number
        self.trained_trees = []

    def fit(self, X_train, y_train):
        """Search hyper-parameters and train the ensemble.

        Args:
            X_train: pandas DataFrame of features.
            y_train: pandas DataFrame with a ``label`` column.

        Returns:
            self, so calls can be chained.
        """
        self.label_count = len(y_train.label.unique())
        (self.X_train, self.X_valid,
         self.y_train, self.y_valid) = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
        # Refitting must replace the ensemble, not append to the previous one.
        self.trained_trees = []
        tree_parameters = self.genetic_find_parameters()
        for i, tree_parameter in enumerate(tree_parameters):
            # Seeding with the tree index gives every tree its own,
            # reproducible bootstrap resample.
            dtc, accuracy = self._fit_and_score(tree_parameter, random_state=i)
            print("Accuracy of Tree",i+1,":",accuracy)
            self.trained_trees.append(dtc)
        return self

    def predict(self, X_test):
        """Return the majority-vote label of the trained trees for each row of ``X_test``.

        Ties go to the smallest label (first maximum of ``np.bincount``).

        Raises:
            RuntimeError: if no tree has been trained yet.
        """
        if not self.trained_trees:
            raise RuntimeError("SlpGeneticForestClassifier.predict called before fit")
        # Shape (n_trees, n_samples). Built from the trees that actually exist
        # rather than from self.N, and always 2-D, so one tree works too.
        votes = np.array([tree.predict(X_test) for tree in self.trained_trees])
        return np.asarray([np.bincount(sample_votes).argmax() for sample_votes in votes.T])

    # Genetic algorithm
    def genetic_find_parameters(self):
        """Create the initial population and evolve it ``generation_number`` times.

        Returns:
            list of ``N`` keyword-argument dicts for ``DecisionTreeClassifier``.
        """
        generation = self.generate_parent_samples()
        print("Generation\n","*"*50)
        print(generation)
        print("*"*50)
        for _ in range(self.generation_number):
            generation = self.evolve(generation)
        return generation

    def generate_parent_samples(self):
        """Build the initial population of ``N`` random hyper-parameter sets.

        Each hyper-parameter is first probed on its own over a coarse grid. The
        grid neighbours of the best-scoring value become that parameter's
        sampling range, which is stored in ``self.arg_range_values`` and reused
        by ``mutate``.
        """
        # range(2, label_count) is empty for two or fewer labels; always keep
        # at least the value 2 so there is something to evaluate.
        sample_count_values = list(range(2, max(self.label_count, 3)))
        arg_test_values = {
            "max_depth": [10**i for i in range(0,10)],
            "min_samples_split": list(sample_count_values),
            "min_samples_leaf": list(sample_count_values),
            "max_leaf_nodes": [10**i for i in range(1,8)],
            # Ascending order, so the grid neighbours of the best value really
            # are its nearest values.
            "min_impurity_decrease": [0]+[10**(-i) for i in range(9,-1,-1)]
        }
        arg_range_values = {}
        for arg, test_values in arg_test_values.items():
            print("\nArg:", arg)
            accuracies = []
            for test_value in test_values:
                print(test_value,", ",end="")
                _, accuracy = self._fit_and_score({arg: test_value})
                accuracies.append(accuracy)
            max_accuracy_index = int(np.argmax(accuracies))
            min_range_index = max(0, max_accuracy_index-1)
            max_range_index = min(len(accuracies)-1, max_accuracy_index+1)
            arg_range_values[arg] = (test_values[min_range_index], test_values[max_range_index])
        print("arg_range_values\n","*"*50)
        print(arg_range_values)
        print("*"*50)
        self.arg_range_values = arg_range_values
        return [
            {arg: self._draw_parameter(arg, bounds) for arg, bounds in arg_range_values.items()}
            for _ in range(self.N)
        ]

    def evolve(self, generation):
        """Produce the next generation from ``generation``.

        Every candidate is scored on the validation split. The best one is
        carried over unchanged (elitism) and the remaining ``N - 1`` slots are
        filled with mutated crossovers of tournament winners.
        """
        accuracies = []
        for i, tree_parameter in enumerate(generation):
            _, accuracy = self._fit_and_score(tree_parameter, random_state=i)
            accuracies.append(accuracy)
        # ``tournament`` only receives the candidates, so the scores are
        # published on the instance for it to read.
        self._generation_fitness = accuracies

        max_accuracy_index = int(np.argmax(accuracies))
        next_generation = [dict(generation[max_accuracy_index])]

        for _ in range(1, self.N):
            parent_1 = self.tournament(generation)
            parent_2 = self.tournament(generation)
            child = self.crossover(parent_1, parent_2)
            self.mutate(child)
            next_generation.append(child)
        return next_generation

    def crossover(self, tree1, tree2):
        """Uniform crossover: each hyper-parameter is copied from a randomly chosen parent.

        Both parents are expected to hold the same keys. Returns a new dict.
        """
        return {
            arg: (tree1 if np.random.random() < 0.5 else tree2)[arg]
            for arg in tree1
        }

    def mutate(self, tree):
        """Redraw hyper-parameters of ``tree`` in place, each with probability ``_MUTATION_RATE``.

        New values come from the ranges found by ``generate_parent_samples``.
        Parameters without a known range are left untouched. Returns ``tree``.
        """
        ranges = getattr(self, "arg_range_values", {})
        for arg in tree:
            if arg in ranges and np.random.random() < self._MUTATION_RATE:
                tree[arg] = self._draw_parameter(arg, ranges[arg])
        return tree

    def tournament(self, trees):
        """Select a parent: the fittest of ``_TOURNAMENT_SIZE`` distinct random candidates.

        Fitness is read from the scores ``evolve`` recorded for ``trees``. When
        no matching scores are available the choice is uniformly random.

        Raises:
            ValueError: if ``trees`` is empty.
        """
        if len(trees) == 0:
            raise ValueError("cannot run a tournament on an empty generation")
        size = min(self._TOURNAMENT_SIZE, len(trees))
        contenders = np.random.choice(len(trees), size=size, replace=False)
        fitness = getattr(self, "_generation_fitness", None)
        if fitness is None or len(fitness) != len(trees):
            return trees[int(contenders[0])]
        best = max(contenders, key=lambda index: fitness[index])
        return trees[int(best)]

    def _bootstrap_sample(self, random_state=None):
        """Resample the training split with replacement, keeping X and y rows paired.

        One set of row positions is drawn and applied to both frames; sampling
        them separately would pair features with the wrong labels.
        ``random_state=None`` draws from NumPy's global generator.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        row_count = len(self.X_train)
        positions = rng.randint(0, row_count, size=row_count)
        return self.X_train.iloc[positions], self.y_train.iloc[positions]

    def _fit_and_score(self, tree_parameter, random_state=None):
        """Train one tree on a bootstrap resample; return ``(tree, validation accuracy)``."""
        X_train_subtree, Y_train_subtree = self._bootstrap_sample(random_state)
        dtc = DecisionTreeClassifier(**tree_parameter)
        dtc = dtc.fit(X_train_subtree, Y_train_subtree)
        y_pred = dtc.predict(self.X_valid)
        return dtc, metrics.accuracy_score(self.y_valid, y_pred)

    def _draw_parameter(self, arg, bounds):
        """Draw one value for hyper-parameter ``arg`` from ``bounds``, both ends usable."""
        low, high = min(bounds), max(bounds)
        if arg in self._FLOAT_PARAMETERS:
            return float(np.random.uniform(low, high))
        # randint excludes its upper bound and rejects low == high, so shift
        # the bound up by one to get an inclusive integer range.
        return int(np.random.randint(low, high + 1))

In [None]:
# Seed NumPy's global generator so repeated runs are reproducible.
np.random.seed(60)

mndata = MNIST('Datasets/MNIST')

# Each loader call returns an (images, labels) pair.
tr_images, tr_labels = mndata.load_training()
tt_images, tt_labels = mndata.load_testing()

# Lists -> NumPy arrays; pixel values are divided by 255 to normalise them.
# Shapes noted by the original author: images (60000, 784) / (10000, 784),
# labels (60000,) / (10000,).
tr_images, tt_images = [np.array(split) / 255. for split in (tr_images, tt_images)]
tr_labels, tt_labels = [np.array(split) for split in (tr_labels, tt_labels)]

# One column per pixel, named p1 .. p784; labels go in a single 'label' column.
columns_images = ['p{}'.format(pixel) for pixel in range(1, 785)]
tr_df_images, tt_df_images = [
    pd.DataFrame(data=images, columns=columns_images)
    for images in (tr_images, tt_images)
]
tr_df_labels, tt_df_labels = [
    pd.DataFrame(data=labels, columns=['label'])
    for labels in (tr_labels, tt_labels)
]

In [None]:
# Hold out 20% of the training frames as a local test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    tr_df_images,
    tr_df_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Baseline: a single decision tree with hand-picked hyper-parameters.
dtc = DecisionTreeClassifier(
    random_state=0,
    max_depth=40,
    min_samples_split=2,
    min_samples_leaf=2,
    max_leaf_nodes=1000,
    min_impurity_decrease=0.00003,
)

In [None]:
# Train the baseline decision tree, then score it on the held-out split.
# fit() hands back the estimator it was called on, so `dtc` stays bound to
# the fitted tree.
y_pred = dtc.fit(X_train, y_train).predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a two-tree random forest, the same ensemble size as the genetic
# forest trained in the last cell.
rf = RandomForestClassifier(n_estimators=2)
# Pass the labels as a 1-D Series: handing the forest the single-column
# DataFrame makes fit() emit a DataConversionWarning.
rf.fit(X_train, y_train.label)
pred = rf.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

In [None]:
# Train the genetic forest (2 trees, 20 generations requested) and report its
# accuracy on the held-out split.
sgfc = SlpGeneticForestClassifier(
    N=2,
    generation_number=20,
)
sgfc.fit(X_train, y_train)
pred = sgfc.predict(X_test)
print(metrics.accuracy_score(y_test, pred))