In [1]:
# pip install python-mnist will install the required package
from mnist import MNIST
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from threading import Thread
from operator import itemgetter

In [2]:
import torch
import torchvision                                 # datasets and transformations modules
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn                              # neural network module
import torch.nn.functional as F
import torch.optim as optim                        # optimization module
import torch.optim.lr_scheduler as lr_scheduler

# All networks derive from the base class nn.Module
class Perceptron(nn.Module):
    # get input and output dimensions as input
    def __init__(self, d, K):
        # all derived classes must call __init__ method of super class
        super(Perceptron, self).__init__()
        
        # create a fully connected layer from input to output
        self.model = nn.Linear(d, K)
        
        #H=100
        #self.model = nn.Sequential(
        #        nn.Linear(d,H),          # input to hidden layer
        #        nn.Sigmoid(),            # hidden activation function
        #        nn.Linear(H,K)           # hidden to output layer
        #    )

    
    # forward method should get the input and return the output
    def forward(self,x):
        batch_size = x.shape[0]
        # flatten the image from BxCxHXW to Bx784
        x = x.view(batch_size, -1)
        x = self.model(x.float())
        # softmax is internally done inside cross entropy loss
        return x

In [3]:
class SlpGeneticForestClassifier:
    def __init__(self, N, generation_number, class_percentage):
        self.N = N
        self.generation_number = generation_number
        self.class_percentage = class_percentage
        self.trained_trees = []  
        # torch parameters
        self.SEED = 0            # reproducability
        # NN Parameters
        self.EPOCHS = 20          # number of epochs
        self.LR = 0.01            # learning rate
        self.MOMENTUM = 0.9       # momentum for the optimizer
        self.WEIGHT_DECAY = 1e-5  # weight decay for the optimizer
        self.GAMMA = 0.1          # learning rate schedular
        self.BATCH_SIZE = 32      # number of images to load per iteration

    def train_net(self):
        # put the network in training mode
        self.slp.train()
        # keep record of the loss value
        epoch_loss = 0.0
        # use training data as batches
        for xt, rt in self.train_loader:
            # move training instances and corresponding labels into gpu if cuda is available
            xt, rt = xt.to(self.device), rt.to(self.device)
            # clear the previously accumulated gradients
            self.optimizer.zero_grad() 
            # forward the network
            yt = self.slp(xt)
            # calculate loss
            loss = self.loss_fn(yt, rt)
            # make a backward pass, calculate gradients
            loss.backward()
            # update weights
            self.optimizer.step()
            # accumulate loss
            epoch_loss += loss.item()
        return epoch_loss
        
    def train_tree(self, filter_label, state_counter, tree_list):
        #y_train = self.y_train.copy()
        #X_train = self.X_train.copy()
        #y_train.loc[~y_train.label.isin(filter_label), 'label'] = -1
        
        #indexes = y_train.index.values.tolist()
        #selected_indexes = np.random.choice(indexes, len(indexes), replace=True)

        #X_train_subtree = X_train.loc[selected_indexes]
        #Y_train_subtree = y_train.loc[selected_indexes]
        
        
        y_train_subset = self.y_train[self.y_train.label.isin(filter_label)]
        Y_train_subtree = y_train_subset.sample(frac=1, replace=True, random_state=state_counter)
        # X_train_subset = self.X_train.loc[y_train_subset.index]
        # X_train_subtree = X_train_subset.sample(frac=1, replace=True, random_state=state_counter)
        X_train_subtree = self.X_train.loc[Y_train_subtree.index.values.tolist()]
       
        
        dtc = DecisionTreeClassifier(random_state=state_counter)
        dtc = dtc.fit(X_train_subtree,Y_train_subtree)
        y_valid_filtered= self.y_valid[self.y_valid.label.isin(filter_label)]
        X_valid_filtered = self.X_valid.loc[y_valid_filtered.index]
        y_pred = dtc.predict(X_valid_filtered)
        tree_accuracy = metrics.accuracy_score(y_valid_filtered, y_pred)
        print("Accuracy of Tree",state_counter+1,":",tree_accuracy)
        print("Classes: ",filter_label)
        tree_list.append({"tree": dtc, "accuracy": tree_accuracy, "filter_label": filter_label})   
        
    def fit_trees(self, filter_labels, tree_list, thread_batch):
        state_counter = 0
        train_threads = []
        for filter_label in filter_labels:
            train_threads.append(Thread(target=self.train_tree, args=[filter_label, state_counter, tree_list]))
            state_counter += 1
        for thread_index in range(0, len(train_threads), thread_batch):
            current_train_threads = train_threads[thread_index:thread_index+thread_batch]
            for train_thread in current_train_threads:
                train_thread.start()
            for train_thread in current_train_threads:
                train_thread.join()

    def fit(self, X_train, y_train):
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)  # Train-test split pairs
        self.label_count = len(y_train.label.unique())
        self.sample_count = y_train.shape[0]
        self.K = self.label_count              # number of output features
        
        filter_labels = self.genetic_find_parameters()
        self.fit_trees(filter_labels, self.trained_trees, thread_batch=25)
        
        total_predictions = self.trained_trees[0]["tree"].predict(self.X_train)
        for i in range(1, self.N):
            total_predictions = np.vstack([total_predictions, self.trained_trees[i]["tree"].predict(self.X_train)])
        total_predictions = np.transpose(total_predictions)
        enc = OneHotEncoder(handle_unknown='ignore') 
        enc.fit(total_predictions)
        
        one_hot_encoded_predictions = enc.transform(total_predictions).toarray() 
    
        
        self.d = one_hot_encoded_predictions.shape[1]      # number of input features 
        
        print("SLP input dimension:", self.d)
        
        # manual seed to reproduce same resultsnet
        torch.manual_seed(self.SEED)
        # create the network
        self.slp = Perceptron(self.d,self.K)
        # check if CUDA is available
        cuda = torch.cuda.is_available()  
        self.device = torch.device("cuda:0" if cuda else "cpu")
        # if cuda is available move network into gpu
        self.slp.to(self.device)
        # specify the loss to be used
        # softmax is internally computed.
        self.loss_fn = nn.CrossEntropyLoss()
        # specify the optimizer to update the weights during backward pass
        self.optimizer = optim.SGD(self.slp.parameters(), lr=self.LR, momentum=self.MOMENTUM, weight_decay=self.WEIGHT_DECAY)
        # change learning rate over time
        self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=self.GAMMA) #CHECK THIS
        
        
        train_target = torch.tensor(self.y_train.values.flatten().astype(np.int32)).long()

        train = torch.tensor(one_hot_encoded_predictions) 

        train_tensor = torch.utils.data.TensorDataset(train, train_target) 
        self.train_loader = torch.utils.data.DataLoader(dataset = train_tensor, batch_size = self.BATCH_SIZE, shuffle = True, num_workers=8)
        
        # train the network
        for epoch in range(1,self.EPOCHS+1):
            # train network for one epoch
            self.train_net()
        print("SLP Weights:", self.slp.model.weight)
    
    
    def predict(self, X_test):
        total_predictions = self.trained_trees[0]["tree"].predict(X_test)
        for i in range(1, self.N):
            total_predictions = np.vstack([total_predictions, self.trained_trees[i]["tree"].predict(X_test)])
        total_predictions = np.transpose(total_predictions)
        
        
        #predicted_values = []
        #for row in total_predictions:
        #    majority_vote = np.bincount(row).argmax()
        #    predicted_values.append(majority_vote)
        #y_pred_class = np.asarray(predicted_values)
        
        
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(total_predictions)
        one_hot_encoded_predictions = enc.transform(total_predictions).toarray() 
        test = torch.tensor(one_hot_encoded_predictions) 
        y_pred = self.slp(test.to(self.device))
        y_pred = y_pred.cpu().detach().numpy()
        y_pred_class = np.asarray([np.argmax(pred) for pred in y_pred])
      
        return y_pred_class
    
    
    
    # Genetic algorithm  
    def generate_parent_samples(self):
        generation = []
        for i in range(self.N):
            generation.append(
                #{"max_depth":  np.random.normal(np.log2(self.sample_count)*2, np.log2(self.sample_count), 1),
                #"min_samples_split": np.random.randint(2,self.label_count),
                #"min_samples_leaf": np.random.randint(2,self.label_count),
                #"max_leaf_nodes": np.random.randint(10, self.sample_count),}
                np.random.choice(range(self.label_count), round(self.label_count*self.class_percentage), replace=False)
            )
        return generation  


    def genetic_find_parameters(self):
        generation = self.generate_parent_samples()
        #print("Generation\n","*"*50)
        #print(generation)
        #print("*"*50)
        for i in range(self.generation_number):
            generation = self.evolve(generation)
            print("Gen:",i+1)
        return generation
    
        
    def evolve(self, generation):
        trained_tree_results = []
        self.fit_trees(generation, trained_tree_results, thread_batch=25)
        trained_tree_results_sorted = sorted(trained_tree_results, key=itemgetter("accuracy"), reverse=True)
        
        next_generation = []
        next_generation.append(trained_tree_results_sorted[0]["filter_label"])
    
        for i in range(1, len(generation)):
            parent_1 = self.tournament(trained_tree_results)
            parent_2 = self.tournament(trained_tree_results)
            child = self.crossover(parent_1, parent_2)
            # self.mutate(child)
            next_generation.append(child)
            
        return next_generation

    
    def crossover(self, parent1, parent2):
        parents_merged = np.unique(np.append(parent1, parent2))
        child = np.random.choice(parents_merged, len(parent1), replace=False)
        return child

    
    def mutate(self, child):
        return

    
    def tournament(self, generation):
        # print("*********** Tournament ***********")
        accuracies = np.asarray([tree["accuracy"] for tree in generation])
        accuracies -= np.min(accuracies)
        probabilities = np.asarray(accuracies)/sum(accuracies)
        # print("Probabilities:",probabilities)
        selected = np.random.choice(generation, 1, p=probabilities)[0]["filter_label"]
        # print("Selected:", selected)
        return selected

In [4]:
np.random.seed(60) # reproducability
mndata = MNIST('Datasets/MNIST')

# read training images and corresponding labels
tr_images, tr_labels = mndata.load_training()
# read test images and corresponding labels
tt_images, tt_labels = mndata.load_testing()

# convert lists into numpy format and apply normalization
tr_images = np.array(tr_images) / 255. # shape (60000, 784)
tr_labels = np.array(tr_labels)         # shape (60000,)
tt_images = np.array(tt_images) / 255. # shape (10000, 784)
tt_labels = np.array(tt_labels)         # shape (10000,)

columns_images = ['p{}'.format(i+1) for i in range(784)]
tr_df_images = pd.DataFrame(data=tr_images, columns=columns_images)
tr_df_labels = pd.DataFrame(data=tr_labels, columns=['label'])
tt_df_images = pd.DataFrame(data=tt_images, columns=columns_images)
tt_df_labels = pd.DataFrame(data=tt_labels, columns=['label'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(tr_df_images, tr_df_labels, test_size=0.2, random_state=0)

In [6]:
dtc = DecisionTreeClassifier(random_state=0, max_depth = 30, min_samples_split = 2, min_samples_leaf=2, max_leaf_nodes=1000, min_impurity_decrease=0.00003)

In [7]:
# Train Decision Tree Classifer
dtc = dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8724166666666666


In [8]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=10)
xX_train, xX_valid, yy_train, yy_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)  # Train-test split pairs
rf.fit(xX_train,yy_train)
pred=rf.predict(xX_valid)
print(metrics.accuracy_score(yy_valid, pred))

  rf.fit(xX_train,yy_train)


0.936875


In [9]:
sgfc = SlpGeneticForestClassifier(N=10, generation_number=0, class_percentage = 1)
sgfc.fit(X_train, y_train)
pred=sgfc.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

Accuracy of Tree 7 : 0.839375
Classes:  [4 0 9 1 8 5 3 2 6 7]
Accuracy of Tree 8 : 0.8425
Classes:  [9 5 1 3 8 2 0 7 6 4]
Accuracy of Tree 4 : 0.8477083333333333
Classes:  [5 4 1 9 8 6 2 3 0 7]
Accuracy of Tree 3 : 0.8410416666666667
Classes:  [8 6 7 5 1 2 9 3 0 4]
Accuracy of Tree 6 : 0.8455208333333334
Classes:  [4 7 5 3 9 2 0 6 1 8]
Accuracy of Tree 1 : 0.846875
Classes:  [3 6 8 9 4 0 5 2 7 1]
Accuracy of Tree 9 : 0.84375
Classes:  [4 8 0 5 1 2 9 3 6 7]
Accuracy of Tree 10 : 0.8409375
Classes:  [3 4 2 5 9 1 6 7 0 8]
Accuracy of Tree 2 : 0.8316666666666667
Classes:  [8 1 9 0 7 5 2 4 3 6]
Accuracy of Tree 5 : 0.8430208333333333
Classes:  [1 8 5 3 4 2 6 9 7 0]
SLP input dimension: 100
SLP Weights: Parameter containing:
tensor([[ 9.4129e-01, -5.3340e-02, -2.4032e-01, -1.1786e-01, -1.6744e-01,
         -1.0716e-01, -1.3956e-01, -3.3742e-02, -7.0885e-02, -7.7221e-02,
          9.9209e-01, -1.5274e-01, -1.9399e-01, -1.5047e-01, -1.5634e-01,
         -2.2406e-01, -1.9988e-02, -8.5024e-02, -

In [10]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array([[1, 0], [1,1],[2,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[1,9]]))
one_hot_encoded_predictions = enc.transform(np.array([[1,1],[1,3]])).toarray() 
print(one_hot_encoded_predictions)

[[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [11]:
sample = X_train.sample(frac=1, replace=True, random_state=190)
len(sample.index.unique())/len(sample.index)

0.6303541666666667

In [12]:
np.random.randint(10, size=3)

array([8, 4, 9])

In [13]:
round(10*0.33)

3

In [14]:
filter_label = np.random.choice(range(10), 3, replace=False)
y_train_subset = y_train[y_train.label.isin(filter_label)]
X_train_subset = X_train.loc[y_train_subset.index]

In [15]:
lst = [i for i in range(60000)]
slected_lst = np.random.choice(lst, 60000, replace=True)
len(np.unique(slected_lst))/len(lst)

0.6311666666666667

In [16]:
import time
slected_lst = []
for i in range(60000):
    rnd = np.random.randint(60000)
    slected_lst.append(rnd)
len(np.unique(slected_lst))/60000

0.6340833333333333

In [17]:
np.random.rand()

0.03014696828331853

In [18]:
from operator import itemgetter
newlist = sorted([{"tree": 4, "accuracy": 3, "filter_label": 2},{"tree": 2, "accuracy": 4, "filter_label": 5}], key=itemgetter('accuracy'), reverse=True)
print(newlist)

[{'tree': 2, 'accuracy': 4, 'filter_label': 5}, {'tree': 4, 'accuracy': 3, 'filter_label': 2}]


In [19]:
lst = ['a','b','c']
accuracies = [0.98, 0.94,0.9]
accuracies -= np.min(accuracies)
print(accuracies)
probabilities = np.asarray(accuracies)/sum(accuracies)
print(probabilities)
np.random.choice(lst, 1, p=probabilities)

[0.08 0.04 0.  ]
[0.66666667 0.33333333 0.        ]


array(['a'], dtype='<U1')

In [20]:
y_train_subset = y_train[y_train.label.isin([3,4])]
X_train_subset = X_train.loc[y_train_subset.index]
X_train_subtree = X_train_subset.sample(frac=1, replace=True, random_state=1)
Y_train_subtree = y_train_subset.sample(frac=1, replace=True, random_state=1)
dtc = DecisionTreeClassifier(random_state=0)
dtc = dtc.fit(X_train_subtree,Y_train_subtree)
y_valid_filtered= y_test[y_test.label.isin(filter_label)]
X_valid_filtered = X_test.loc[y_valid_filtered.index]
y_pred = dtc.predict(X_valid_filtered)
print(y_valid_filtered)
print(y_pred)
tree_accuracy = metrics.accuracy_score(y_valid_filtered, y_pred)

       label
40228      0
56085      8
5007       0
40115      0
38310      0
...      ...
33673      0
37954      0
41983      9
45066      0
51708      9

[3601 rows x 1 columns]
[4 4 4 ... 4 4 3]
