In [1]:
# pip install python-mnist will install the required package
from mnist import MNIST
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
class SlpGeneticForestClassifier:
    def __init__(self, N, generation_number):
        self.N = N
        self.generation_number = generation_number
        self.trained_trees = []
        
        
    def fit(self, X_train, y_train):
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)  # Train-test split pairs
        self.label_count = len(y_train.label.unique())
        self.sample_count = y_train.shape[0]
        tree_parameters = self.genetic_find_parameters()
        state_counter = 0
        for tree_parameter in tree_parameters:
            X_train_subtree = self.X_train.sample(frac=1, replace=True, random_state=state_counter)
            Y_train_subtree = self.y_train.sample(frac=1, replace=True, random_state=state_counter)
            dtc = DecisionTreeClassifier(**tree_parameter, random_state=state_counter)
            dtc = dtc.fit(X_train_subtree,Y_train_subtree)
            y_pred = dtc.predict(self.X_valid)
            print("Accuracy of Tree",state_counter+1,":",metrics.accuracy_score(self.y_valid, y_pred))
            self.trained_trees.append(dtc)
            state_counter += 1
            
    
    # Genetic algorithm
    def genetic_find_parameters(self):
        generation = self.generate_parent_samples()
        print("Generation\n","*"*50)
        print(generation)
        print("*"*50)
        # for i in range(self.generation_number):
        #     generation = self.evolve(generation)
        return generation


    def generate_parent_samples(self):
        generation = []
        for i in range(self.N):
            generation.append({
                "max_depth":  np.random.normal(np.log2(self.sample_count)*2, np.log2(self.sample_count), 1),
                "min_samples_split": np.random.randint(2,self.label_count),
                "min_samples_leaf": np.random.randint(2,self.label_count),
                "max_leaf_nodes": np.random.randint(10, self.sample_count),
            })
        return generation

    def predict(self, X_test):
        total_predictions = self.trained_trees[0].predict(X_test)
        for i in range(1, self.N):
            total_predictions = np.vstack([total_predictions, self.trained_trees[i].predict(X_test)])
        total_predictions = np.transpose(total_predictions)
        predicted_values = []
        for row in total_predictions:
            majority_vote = np.bincount(row).argmax()
            predicted_values.append(majority_vote)
        return np.asarray(predicted_values)
            
        
    def evolve(self, generation):
        accuracies = []
        for tree_parameter in generation:     
            X_train_subtree = self.X_train.sample(frac=1, replace=True, random_state=i)
            Y_train_subtree = self.y_train.sample(frac=1, replace=True, random_state=i)
            dtc = DecisionTreeClassifier(**tree_parameter)
            dtc = dtc.fit(X_train_subtree,Y_train_subtree)
            y_pred = dtc.predict(self.X_valid)
            accuracies.append(metrics.accuracy_score(self.y_valid, y_pred))
        
        next_generation = []
        max_accuracy_index = np.argmax(accuracies)
        next_generation.append(generation[max_accuracy_index])
    
        for i in range(1, self.N):
            parent_1 = self.tournament(generation)
            parent_2 = self.tournament(generation)
            child = self.crossover(parent_1, parent_2)
            self.mutate(child)
            next_generation.append(child)
        return next_generation

    
    def crossover(self, tree1, tree2):
        return

    
    def mutate(self, tree):
        return

    
    def tournament(self, trees):
        return best

In [3]:
np.random.seed(60) # reproducability
mndata = MNIST('Datasets/MNIST')

# read training images and corresponding labels
tr_images, tr_labels = mndata.load_training()
# read test images and corresponding labels
tt_images, tt_labels = mndata.load_testing()

# convert lists into numpy format and apply normalization
tr_images = np.array(tr_images) / 255. # shape (60000, 784)
tr_labels = np.array(tr_labels)         # shape (60000,)
tt_images = np.array(tt_images) / 255. # shape (10000, 784)
tt_labels = np.array(tt_labels)         # shape (10000,)

columns_images = ['p{}'.format(i+1) for i in range(784)]
tr_df_images = pd.DataFrame(data=tr_images, columns=columns_images)
tr_df_labels = pd.DataFrame(data=tr_labels, columns=['label'])
tt_df_images = pd.DataFrame(data=tt_images, columns=columns_images)
tt_df_labels = pd.DataFrame(data=tt_labels, columns=['label'])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(tr_df_images, tr_df_labels, test_size=0.2, random_state=0)

In [5]:
dtc = DecisionTreeClassifier(random_state=0, max_depth = 30, min_samples_split = 2, min_samples_leaf=2, max_leaf_nodes=1000, min_impurity_decrease=0.00003)

In [6]:
# Train Decision Tree Classifer
dtc = dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8724166666666666


In [7]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=10)
rf.fit(X_train,y_train)
pred=rf.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

  rf.fit(X_train,y_train)


0.948


In [8]:
sgfc = SlpGeneticForestClassifier(N=10, generation_number=20)
sgfc.fit(X_train, y_train)
pred=sgfc.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

Generation
 **************************************************
[{'max_depth': array([5.57254433]), 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_leaf_nodes': 5247}, {'max_depth': array([9.56093961]), 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_leaf_nodes': 13773}, {'max_depth': array([32.99840133]), 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_leaf_nodes': 39993}, {'max_depth': array([10.26889347]), 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_leaf_nodes': 4330}, {'max_depth': array([16.43236252]), 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_leaf_nodes': 46403}, {'max_depth': array([19.89919546]), 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_leaf_nodes': 17247}, {'max_depth': array([14.48048912]), 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_leaf_nodes': 40815}, {'max_depth': array([27.10897128]), 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_leaf_nodes': 46697}, {'max_depth': array([43.37421811]), 'min_samples_split': 9, 'min_sam

In [9]:
np.random.uniform(0, 0.1)

0.09760528685772735

In [10]:
abs(np.random.normal(0.05, 0.1, 10))

array([0.25099539, 0.02028125, 0.0499199 , 0.13618547, 0.19408191,
       0.17271978, 0.20880924, 0.04993786, 0.04311953, 0.00136426])

In [11]:
max(np.random.normal(0.05, 0.1, 1)[0], 0)

0.20297071082829138