In [1]:
# pip install python-mnist will install the required package
from mnist import MNIST
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from threading import Thread

In [2]:
import torch
import torchvision                                 # datasets and transformations modules
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn                              # neural network module
import torch.nn.functional as F
import torch.optim as optim                        # optimization module
import torch.optim.lr_scheduler as lr_scheduler

# All networks derive from the base class nn.Module
class Perceptron(nn.Module):
    # get input and output dimensions as input
    def __init__(self, d, K):
        # all derived classes must call __init__ method of super class
        super(Perceptron, self).__init__()
        # create a fully connected layer from input to output
        self.model = nn.Linear(d, K)

    
    # forward method should get the input and return the output
    def forward(self,x):
        batch_size = x.shape[0]
        # flatten the image from BxCxHXW to Bx784
        x = x.view(batch_size, -1)
        x = self.model(x.float())
        # softmax is internally done inside cross entropy loss
        return x

In [3]:
class SlpGeneticForestClassifier:
    def __init__(self, N, generation_number):
        self.N = N
        self.generation_number = generation_number
        self.trained_trees = []  
        # torch parameters
        self.SEED = 0            # reproducability
        # NN Parameters
        self.EPOCHS = 10          # number of epochs
        self.LR = 0.01            # learning rate
        self.MOMENTUM = 0.9       # momentum for the optimizer
        self.WEIGHT_DECAY = 1e-5  # weight decay for the optimizer
        self.GAMMA = 0.1          # learning rate schedular
        self.BATCH_SIZE = 64      # number of images to load per iteration

    def train_net(self):
        # put the network in training mode
        self.slp.train()
        # keep record of the loss value
        epoch_loss = 0.0
        # use training data as batches
        for xt, rt in self.train_loader:
            # move training instances and corresponding labels into gpu if cuda is available
            xt, rt = xt.to(self.device), rt.to(self.device)
            # clear the previously accumulated gradients
            self.optimizer.zero_grad() 
            # forward the network
            yt = self.slp(xt)
            # calculate loss
            loss = self.loss_fn(yt, rt)
            # make a backward pass, calculate gradients
            loss.backward()
            # update weights
            self.optimizer.step()
            # accumulate loss
            epoch_loss += loss.item()
        return epoch_loss
    
    def train_tree(self, tree_parameter, state_counter):
        X_train_subtree = self.X_train.sample(frac=1, replace=True, random_state=state_counter)
        Y_train_subtree = self.y_train.sample(frac=1, replace=True, random_state=state_counter)
        dtc = DecisionTreeClassifier(**tree_parameter, random_state=state_counter)
        dtc = dtc.fit(X_train_subtree,Y_train_subtree)
        y_pred = dtc.predict(self.X_valid)
        print("Accuracy of Tree",state_counter+1,":",metrics.accuracy_score(self.y_valid, y_pred))
        self.trained_trees.append(dtc)
        

        
    def fit(self, X_train, y_train):
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)  # Train-test split pairs
        self.label_count = len(y_train.label.unique())
        self.sample_count = y_train.shape[0]
        self.K = self.label_count              # number of output features
        tree_parameters = self.genetic_find_parameters()
        state_counter = 0
        train_threads = []
        for tree_parameter in tree_parameters:
            train_threads.append(Thread(target=self.train_tree, args=[tree_parameter, state_counter]))
            state_counter += 1
        for thread in train_threads:
            thread.start()
        for thread in train_threads:
            thread.join()
    
        total_predictions = self.trained_trees[0].predict(self.X_train)
        for i in range(1, self.N):
            total_predictions = np.vstack([total_predictions, self.trained_trees[i].predict(self.X_train)])
        total_predictions = np.transpose(total_predictions)
        enc = OneHotEncoder(handle_unknown='ignore') 
        enc.fit(total_predictions)
        
        
        
        one_hot_encoded_predictions = enc.transform(total_predictions).toarray() 
        
        self.d = one_hot_encoded_predictions.shape[1]      # number of input features 
        
        # manual seed to reproduce same resultsnet
        torch.manual_seed(self.SEED)
        # create the network
        self.slp = Perceptron(self.d,self.K)
        # check if CUDA is available
        cuda = torch.cuda.is_available()  
        self.device = torch.device("cuda:0" if cuda else "cpu")
        # if cuda is available move network into gpu
        self.slp.to(self.device)
        # specify the loss to be used
        # softmax is internally computed.
        self.loss_fn = nn.CrossEntropyLoss()
        # specify the optimizer to update the weights during backward pass
        self.optimizer = optim.SGD(self.slp.parameters(), lr=self.LR, momentum=self.MOMENTUM, weight_decay=self.WEIGHT_DECAY)
        # change learning rate over time
        self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=self.GAMMA) #CHECK THIS
        
        
        train_target = torch.tensor(self.y_train.values.flatten().astype(np.int32)).long()

        train = torch.tensor(one_hot_encoded_predictions) 

        train_tensor = torch.utils.data.TensorDataset(train, train_target) 
        self.train_loader = torch.utils.data.DataLoader(dataset = train_tensor, batch_size = self.BATCH_SIZE, shuffle = True, num_workers=8)
        
        # train the network
        for epoch in range(1,self.EPOCHS+1):
            # train network for one epoch
            self.train_net()


    # Genetic algorithm
    def genetic_find_parameters(self):
        generation = self.generate_parent_samples()
        print("Generation\n","*"*50)
        print(generation)
        print("*"*50)
        # for i in range(self.generation_number):
        #     generation = self.evolve(generation)
        return generation


    def generate_parent_samples(self):
        generation = []
        for i in range(self.N):
            generation.append({
                "max_depth":  np.random.normal(np.log2(self.sample_count)*2, np.log2(self.sample_count), 1),
                "min_samples_split": np.random.randint(2,self.label_count),
                "min_samples_leaf": np.random.randint(2,self.label_count),
                "max_leaf_nodes": np.random.randint(10, self.sample_count),
            })
        return generation
    
    

    def predict(self, X_test):
        total_predictions = self.trained_trees[0].predict(X_test)
        for i in range(1, self.N):
            total_predictions = np.vstack([total_predictions, self.trained_trees[i].predict(X_test)])
        total_predictions = np.transpose(total_predictions)
        
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(total_predictions)
        one_hot_encoded_predictions = enc.transform(total_predictions).toarray() 
        test = torch.tensor(one_hot_encoded_predictions) 
        y_pred = self.slp(test.to(self.device))
        y_pred = y_pred.cpu().detach().numpy()
        y_pred_class = np.asarray([np.argmax(pred) for pred in y_pred])
      
        return y_pred_class
            
        
    def evolve(self, generation):
        accuracies = []
        for tree_parameter in generation:     
            X_train_subtree = self.X_train.sample(frac=1, replace=True, random_state=i)
            Y_train_subtree = self.y_train.sample(frac=1, replace=True, random_state=i)
            dtc = DecisionTreeClassifier(**tree_parameter)
            dtc = dtc.fit(X_train_subtree,Y_train_subtree)
            y_pred = dtc.predict(self.X_valid)
            accuracies.append(metrics.accuracy_score(self.y_valid, y_pred))
        
        next_generation = []
        max_accuracy_index = np.argmax(accuracies)
        next_generation.append(generation[max_accuracy_index])
    
        for i in range(1, self.N):
            parent_1 = self.tournament(generation)
            parent_2 = self.tournament(generation)
            child = self.crossover(parent_1, parent_2)
            self.mutate(child)
            next_generation.append(child)
        return next_generation

    
    def crossover(self, tree1, tree2):
        return

    
    def mutate(self, tree):
        return

    
    def tournament(self, trees):
        return best

In [4]:
np.random.seed(60) # reproducability
mndata = MNIST('Datasets/MNIST')

# read training images and corresponding labels
tr_images, tr_labels = mndata.load_training()
# read test images and corresponding labels
tt_images, tt_labels = mndata.load_testing()

# convert lists into numpy format and apply normalization
tr_images = np.array(tr_images) / 255. # shape (60000, 784)
tr_labels = np.array(tr_labels)         # shape (60000,)
tt_images = np.array(tt_images) / 255. # shape (10000, 784)
tt_labels = np.array(tt_labels)         # shape (10000,)

columns_images = ['p{}'.format(i+1) for i in range(784)]
tr_df_images = pd.DataFrame(data=tr_images, columns=columns_images)
tr_df_labels = pd.DataFrame(data=tr_labels, columns=['label'])
tt_df_images = pd.DataFrame(data=tt_images, columns=columns_images)
tt_df_labels = pd.DataFrame(data=tt_labels, columns=['label'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(tr_df_images, tr_df_labels, test_size=0.2, random_state=0)

In [6]:
dtc = DecisionTreeClassifier(random_state=0, max_depth = 30, min_samples_split = 2, min_samples_leaf=2, max_leaf_nodes=1000, min_impurity_decrease=0.00003)

In [7]:
# Train Decision Tree Classifer
dtc = dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8724166666666666


In [8]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=30)
rf.fit(X_train,y_train)
pred=rf.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

  rf.fit(X_train,y_train)


0.96375


In [9]:
sgfc = SlpGeneticForestClassifier(N=30, generation_number=20)
sgfc.fit(X_train, y_train)
pred=sgfc.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

Generation
 **************************************************
[{'max_depth': array([2.40149385]), 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_leaf_nodes': 44471}, {'max_depth': array([31.1458573]), 'min_samples_split': 8, 'min_samples_leaf': 9, 'max_leaf_nodes': 46403}, {'max_depth': array([20.61144699]), 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_leaf_nodes': 13925}, {'max_depth': array([34.49369961]), 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_leaf_nodes': 40815}, {'max_depth': array([31.52259731]), 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_leaf_nodes': 41306}, {'max_depth': array([35.4554778]), 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_leaf_nodes': 19584}, {'max_depth': array([30.65578675]), 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_leaf_nodes': 45317}, {'max_depth': array([27.0039844]), 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_leaf_nodes': 10813}, {'max_depth': array([37.81818757]), 'min_samples_split': 5, 'min_sam

In [10]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array([[1, 0], [1,1],[2,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[1,9]]))
one_hot_encoded_predictions = enc.transform(np.array([[1,1],[1,3]])).toarray() 
print(one_hot_encoded_predictions)

[[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
