In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from sklearn.model_selection import KFold 

In [None]:
def get_device():
    """Pick the torch device string used by the rest of the notebook.

    Returns:
        str: 'cuda:0' when torch reports CUDA as available, otherwise 'cpu'.
    """
    return 'cuda:0' if torch.cuda.is_available() else 'cpu'


device = get_device()
device

# Loading Datasets

In [None]:
# Every file is read with header=None, so columns are labelled 0..n-1 by position.
# `skiprows` skips the lines that precede the data rows in each file (presumably the
# KEEL '@' header block -- confirm against the files). Tokens listed in `na_values`
# are parsed as NaN.
automobile = pd.read_csv('datasets/automobile/automobile.dat', header = None, skiprows=30, na_values=['?']) #keel

dermatology = pd.read_csv('datasets/dermatology/dermatology.dat', skiprows=39, header = None, na_values=['?'])

diabetes = pd.read_csv('datasets/diabetes(pima)/pima.dat', header=None, skiprows=13)

# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
ecoli = pd.read_csv('datasets/ecoli/ecoli.data', header=None, sep=r'\s+')
ecoli = ecoli.drop(columns=ecoli.columns[0]) # dropping the seq. numbers

flare = pd.read_csv('datasets/flare/flare.dat', header=None, skiprows=16)

iris = pd.read_csv('datasets/iris/iris.dat', header = None, skiprows=9)

led7digit = pd.read_csv('datasets/led7digit/led7digit.dat', skiprows=12, header=None)

monk = pd.read_csv('datasets/monk/monk-2.dat', header=None, skiprows=11)

new_thyroid = pd.read_csv('datasets/new thyroid/newthyroid.dat', header=None, skiprows=10)

sonar = pd.read_csv('datasets/sonar/sonar.dat', skiprows=65, header=None)

vowel = pd.read_csv('datasets/vowel/vowel.dat', skiprows=18, header=None)

wine = pd.read_csv('datasets/wine/wine.data', header=None)
cols = list(wine) # Class column needs to be the last column. Not first.
cols[0], cols[-1] = cols[-1], cols[0] # swapping 1st and last elements of column list
wine = wine.loc[:, cols] # Assigning changed column list

wisconsin = pd.read_csv('datasets/wisconsin/wisconsin.dat', skiprows=14, header=None, na_values=[' <null>'])

yeast = pd.read_csv('datasets/yeast/yeast.dat', header=None, skiprows=13)

zoo = pd.read_csv('datasets/zoo/zoo.dat', header=None, skiprows=21) #keel

In [None]:
# Registry of every loaded dataset, keyed by name. To switch datasets, change
# SELECTED_DATASET to one of these keys instead of commenting/uncommenting code.
DATASETS = {
    'automobile': automobile,
    'dermatology': dermatology,
    'diabetes': diabetes,
    'ecoli': ecoli,
    'flare': flare,
    'iris': iris,
    'led7digit': led7digit,
    'monk': monk,
    'new_thyroid': new_thyroid,
    'sonar': sonar,
    'vowel': vowel,
    'wine': wine,
    'wisconsin': wisconsin,
    'yeast': yeast,
    'zoo': zoo,
}
SELECTED_DATASET = 'new_thyroid'

# Work on a copy so the loaded frame stays untouched and this cell can be re-run.
df = DATASETS[SELECTED_DATASET].copy()

df.shape

# Encoding Categorical Features

In [None]:
# Split features from the class column (the last column) so that the class column is
# never one-hot encoded; the two parts are concatenated again at the end.
feature_df = df.iloc[:, :-1]
class_df = df.iloc[:, -1]

# Categorical features are the object-dtype columns.
categorical_cols = feature_df.select_dtypes(['object']).columns

if len(categorical_cols) > 0:
    # dtype=float: newer pandas returns bool indicator columns by default, which cannot
    # hold NaN; writing NaN into them would turn them into object columns.
    one_hot_cats_df = pd.get_dummies(feature_df[categorical_cols], dtype=float)

    # get_dummies encodes a missing category as all-zero indicators. Replace those zeros
    # with NaN so the value is still treated as missing downstream.
    for col in categorical_cols:
        indicator_cols = [c for c in one_hot_cats_df.columns if str(c).startswith(str(col) + '_')]
        if indicator_cols:
            one_hot_cats_df.loc[feature_df[col].isnull(), indicator_cols] = np.nan

    # Non-categorical features keep their original order; the one-hot columns follow them.
    processed_df = pd.concat([feature_df.drop(columns=categorical_cols), one_hot_cats_df], axis=1)
    print('%d categorical features present.' %(len(categorical_cols)))
else:
    processed_df = feature_df.copy()
    print('No Categorical Feature present.')

#processed_df.head()

# Encode the class column as integer codes starting from 0. The codes are built as a new
# Series (instead of being written into the old column with .iloc) so the resulting column
# is numeric even when the original labels are strings.
class_codes = class_df.astype('category').cat.codes
class_codes.name = class_df.name

final_df = pd.concat([processed_df, class_codes], axis=1) # Add the class column to the processed feature dataframe
df = final_df
df.head()

In [None]:
# Quick summary of the encoded frame: dimensions, total number of NaN cells,
# and number of samples per class (class = last column).
missing_cells = df.isnull().sum().sum()
samples_per_class = df.iloc[:, -1].value_counts()

print('Shape:', df.shape)
print('\nMissing values:', missing_cells)
print('\nClass distribution:\n' + str(samples_per_class))

# The Network

In [None]:
class Network(torch.nn.Module):
    """1-D convolutional classifier that accepts input vectors containing NaN (missing) values.

    Layout: conv1 (kernel 3, length preserving) -> conv2 (kernel 2, stride 2) ->
    conv3 (kernel 3, length preserving) -> conv4 (kernel 2, stride 2) -> flatten ->
    dropout -> fc1.

    After every forward() call two attributes describe the sample just processed:
        missing_value_count: number of NaN entries found in the input.
        number_of_dummy_neurons: number of zero inputs appended before fc1 (0 when the
            input had no NaN). They occupy the LAST columns of fc1's input.
    """

    def __init__(self, number_of_features, number_of_classes):
        """
        number_of_features: length of a complete (no missing value) input vector
        number_of_classes: number of output scores produced by fc1
        """
        super(Network, self).__init__()

        self.number_of_features = number_of_features
        self.number_of_classes = number_of_classes

        # Per-call bookkeeping, overwritten by forward().
        self.number_of_dummy_neurons = 0
        self.missing_value_count = 0

        self.conv1_out_channels = 64
        self.conv3_out_channels = 128

        kernel_size = 3
        padding = (kernel_size - 1) // 2  # keeps the sequence length unchanged for kernel_size=3

        # NOTE: bn1/bn2/bn3 are registered but never applied in forward(). They are kept
        # so that the module's parameters / state_dict keys stay unchanged.
        self.bn1 = torch.nn.BatchNorm1d(1)
        self.bn2 = torch.nn.BatchNorm1d(self.conv1_out_channels)
        self.bn3 = torch.nn.BatchNorm1d(self.conv3_out_channels)
        self.dp = torch.nn.Dropout(0.4)

        self.conv1 = torch.nn.Conv1d(in_channels=1, out_channels=self.conv1_out_channels, kernel_size=kernel_size, padding=padding)
        # kernel 2 / stride 2: learned down-sampling that halves the length.
        self.conv2 = torch.nn.Conv1d(in_channels=self.conv1_out_channels, out_channels=self.conv1_out_channels, kernel_size=2, stride=2)

        self.conv3 = torch.nn.Conv1d(in_channels=self.conv1_out_channels, out_channels=self.conv3_out_channels, kernel_size=kernel_size, padding=padding)
        self.conv4 = torch.nn.Conv1d(in_channels=self.conv3_out_channels, out_channels=self.conv3_out_channels, kernel_size=2, stride=2)

        # Two halvings -> length // 4 positions per channel for a complete input.
        self.fc1 = torch.nn.Linear(in_features=self.conv3_out_channels * (self.number_of_features // 4), out_features=self.number_of_classes)

    def forward(self, t):
        '''
        Forward pass. t is expected as (batch, 1, number_of_features).

        When there is a missing value (NaN) in the input vector:
            1. Delete the input entries corresponding to each missing value.
            2. Apply conv1..conv4 (conv2 and conv4 are stride-2 convolutions that halve the length).
            3. Flatten the output of the last convolution.
            4. The flattened output is shorter than it would be for a complete input,
               but the in_features of fc1 is fixed.
            5. So zero-valued dummy neurons are appended on the right to reach fc1's in_features.
            6. The weights associated with these dummy neurons must not be updated:
               after loss.backward() the caller sets the grads of the last
               number_of_dummy_neurons columns of fc1.weight to 0 before optimizer.step().

        The missing-value path handles exactly one sample per call (batch size 1).

        Raises:
            RuntimeError: if NaN values are present and t holds more than one sample.
        '''
        nan_mask = torch.isnan(t)  # computed once, reused for counting and for filtering

        self.number_of_dummy_neurons = 0  # reset so a previous call's value is not retained
        self.missing_value_count = nan_mask.sum().item()  # number of NaN values in the input
        vector_size = self.number_of_features - self.missing_value_count  # number of usable input entries

        has_missing = self.missing_value_count > 0
        if has_missing:
            if t.numel() != self.number_of_features:
                raise RuntimeError(
                    'Network.forward: missing values are only supported with batch size 1 '
                    '(got input of shape %s for %d features)' % (tuple(t.shape), self.number_of_features)
                )
            t = t[~nan_mask].reshape(1, 1, vector_size)  # drop the NaN entries

        t = F.relu(self.conv1(t))
        t = F.relu(self.conv2(t))

        t = F.relu(self.conv3(t))
        t = F.relu(self.conv4(t))

        t = t.reshape(-1, self.conv3_out_channels * (vector_size // 4))

        t = self.dp(t)
        if has_missing:
            # fc1.in_features == conv3_out_channels * (number_of_features // 4); right-pad the
            # shortened vector with zeros so it matches.
            self.number_of_dummy_neurons = self.fc1.in_features - t.shape[1]
            t = F.pad(t, (0, self.number_of_dummy_neurons), value=0)
        return self.fc1(t)

# Custom Dataset Class

In [None]:
class NumericalDataset(Dataset):
    """Dataset wrapper pairing a 2-D attribute tensor with a per-row class label."""

    def __init__(self, x, y):
        '''
        x: the attributes (indexed as x[i, :], one row per sample)
        y: the class labels (y[i] is a single number)
        '''
        self.x = x
        self.y = y

    def __len__(self):
        """Number of samples, i.e. number of rows of x."""
        return len(self.x)

    def __getitem__(self, i):
        """Return sample i as (attributes of shape (1, no_of_attributes) in double precision, int label)."""
        row = self.x[i, :]
        # Leading dimension of size 1 is the channel; cast to double so the dtype
        # matches a network that was converted with .double().
        features = row.reshape(1, row.shape[0]).double()

        # The label is stored as a single number; expose it as a plain Python int.
        label = int(self.y[i].item())
        return (features, label)

# 10 Fold Cross Validation

In [None]:
def tenfcv(df, total_epochs=50, n_splits=10):
    """Run one k-fold cross validation (10 folds by default) of Network on df.

    Args:
        df: DataFrame whose last column holds integer class codes starting at 0 and whose
            other columns are numeric attributes (NaN marks a missing value).
        total_epochs: number of passes over the training split in every fold.
        n_splits: number of folds.

    Returns:
        float: mean test accuracy (in percent) over the folds.
    """
    # Copy before shuffling: df.values may share memory with df (shuffling it would
    # reorder the caller's frame) or be read-only (shuffling it would raise).
    np_data = df.values.copy()
    np.random.shuffle(np_data)

    number_of_classes = int(np_data[:, -1].max()) + 1
    number_of_attributes = np_data.shape[1] - 1 # negate one as we dont consider class as attribute

    t_data = torch.tensor(np_data)
    dataset = NumericalDataset(t_data[:, :-1], t_data[:, -1]) # (attributes, classes)

    kf = KFold(n_splits=n_splits)
    criterion = torch.nn.CrossEntropyLoss()
    batch_size = 1 # Network's missing-value handling works on one sample at a time
    total_accuracy = 0

    for fold, (train_index, test_index) in enumerate(kf.split(np_data), start=1): # each fold
        train_set = torch.utils.data.Subset(dataset, train_index)
        test_set = torch.utils.data.Subset(dataset, test_index)

        train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

        net = Network(number_of_attributes, number_of_classes).double().to(device) # a new object is created for each fold
        optimizer = torch.optim.Adam(net.parameters(), lr=0.001, eps=1e-5, weight_decay=1e-9)

        for epoch in range(total_epochs):
            net.train()
            for x, y in train_loader: # each batch
                x = x.to(device)
                y = y.to(device)

                preds = net(x) # Forward pass
                loss = criterion(preds, y)

                optimizer.zero_grad()
                loss.backward()

                # The zero-padded ("dummy") inputs of fc1 occupy its last columns; their
                # weights must not learn from this sample. Guard on > 0: with a count of 0
                # the slice [:, -0:] would select, and wipe, the whole gradient.
                # NOTE(review): Adam's running averages and weight_decay can still move
                # these weights even with a zero gradient -- confirm this is acceptable.
                dummy_count = net.number_of_dummy_neurons
                if net.missing_value_count > 0 and dummy_count > 0:
                    net.fc1.weight.grad[:, -dummy_count:] = 0
                optimizer.step() # Update weights

        # testing
        net.eval()
        correct_predictions = 0
        summed_test_loss = 0.0
        with torch.no_grad(): # no gradients are needed for evaluation
            for test_x, test_y in test_loader:
                test_x = test_x.to(device)
                test_y = test_y.to(device)

                preds = net(test_x)
                summed_test_loss += F.cross_entropy(preds, test_y, reduction='sum').item()
                correct_predictions += preds.argmax(dim=1).eq(test_y).sum().item()

        # Report the mean loss over the whole test split, not just the last sample's loss.
        test_loss = summed_test_loss / len(test_set)
        accuracy = correct_predictions / len(test_set) * 100
        print('\nFold: ', fold, '/', '%d loss:' % n_splits, test_loss)
        print('Correct predictions: ', correct_predictions, '/', len(test_set))
        print('Accuracy:', accuracy)
        print()
        total_accuracy += accuracy

    print()
    print('%dCV result:' % n_splits)
    final_accuracy = total_accuracy / n_splits
    print('Accuracy:', final_accuracy)
    return final_accuracy

In [None]:
# Repeat the whole 10-fold cross validation 10 times (one training epoch per fold)
# and report the mean of the 10 per-run accuracies.
total_accuracy = 0
for run in range(10):
    print('\n----------------------------------------------\n')
    print('%dth 10CV:' %(run+1))
    run_accuracy = tenfcv(df, total_epochs=1)
    total_accuracy += run_accuracy

for _ in range(2):
    print('\n----------------------------------------------')
print('Final Result:')
print('Average accuracy after doing 10cv 10 times:', total_accuracy / 10)