In [1]:
import os
import numpy as np
import random
import csv

In [102]:
# All files were developed collaboratively

class dataset:
    '''
    The dataset class handles initial data loading along with all pre-processing tasks.

    Attributes:
        intake_data: numpy array of examples (rows) by attributes (columns). The methods
            below treat the last column as the class label / regression target.
        tune_set: every tenth example of intake_data, filled in by split().
        ninety_data: the examples that split() did not place in tune_set.
        validate_set: 3-D array (fold, example, attribute) built by fold() from ninety_data.
            Folds that receive fewer examples than the largest fold are padded with NaN rows.
    '''
    def __init__(self, data_path: str, processed_flag: bool):
        '''
        The constructor initializes all of the self variables, and loads the data from the original .data file.

        data_path: path to a comma-separated .data file (one example per line).
        processed_flag: when falsy the file at data_path is read into self.intake_data; when truthy
            nothing is read and the sets are expected to be filled in later (see extract()).
        '''
        # Instantiate self variables
        self.intake_data = []
        self.tune_set = []
        self.validate_set = []
        self.ninety_data = []

        # Data is being read in from original .DATA file
        if not processed_flag:
            with open(data_path, 'r') as file:
                # Blank lines are skipped: a lone '' row would make the array ragged.
                rows = [line.strip().split(',') for line in file if line.strip()]
            # Every value is still a string at this point
            self.intake_data = np.array(rows)

    def normalize(self):
        '''
        Performs min-max normalization on the last column of the intake data (example value).
        This will only be used for regression data.

        A constant column (max == min) is mapped to all zeros instead of dividing by zero.
        NOTE(review): the result is written back into self.intake_data, so this should run after
        oh_encode(); writing floats into a fixed-width string array may truncate them.
        '''
        if len(self.intake_data) == 0:
            return
        values = self.intake_data[:, -1].astype(float)
        lowest = values.min()
        value_range = values.max() - lowest
        if value_range == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - lowest) / value_range
        self.intake_data[:, -1] = normalized_values

    def oh_encode(self):
        '''
        This method goes through each item in the data array, and if the item is not a number, it is replaced
        with a number (continuization). If there are no non-numbers in the dataset, all the numbers are
        converted to floats.

        Despite the name this is an integer (label) encoding, not a one-hot encoding: each distinct
        non-numeric string gets the next unused integer, and one mapping is shared by every column.
        Any '?' still present is encoded like any other string, so impute() has to run first.
        '''
        string_to_int = {}
        next_int = 0

        # This function continuizes a single element so it can be vectorized
        def convert_to_num(value):
            nonlocal next_int
            try:
                # Try to convert to float
                return float(value)
            except ValueError:
                # If conversion fails, map the string to a number
                if value not in string_to_int:
                    string_to_int[value] = next_int
                    next_int += 1
                return string_to_int[value]

        # Apply convert_to_num to each element in the array. otypes is given explicitly so numpy
        # does not make an extra probing call on the first element.
        vectorization = np.vectorize(convert_to_num, otypes=[float])
        self.intake_data = vectorization(self.intake_data)

    def impute(self, low: int = 1, high: int = 10):
        '''
        Replaces question marks in a dataset with a random integer between low and high (inclusive,
        1 and 10 by default).

        Only has an effect while the data is still text, i.e. before oh_encode() has been called.
        '''
        data = self.intake_data
        if not isinstance(data, np.ndarray) or data.dtype.kind not in ('U', 'O'):
            # Numeric data cannot contain '?', so there is nothing to impute.
            return
        missing_positions = np.argwhere(data == '?')
        if len(missing_positions) == 0:
            return

        # A fixed-width string array silently truncates longer values ('10' -> '1'),
        # so widen it when the replacement could need more characters than it holds.
        if data.dtype.kind == 'U':
            needed_width = max(len(str(low)), len(str(high)))
            if data.dtype.itemsize < np.dtype(f'U{needed_width}').itemsize:
                data = data.astype(f'U{needed_width}')
                self.intake_data = data

        # argwhere yields positions in row-major order, one random draw per missing value
        for position in missing_positions:
            data[tuple(position)] = str(random.randint(low, high))

    def shuffle(self):
        '''
        This method will shuffle the self.intake_data by examples.
        '''
        np.random.shuffle(self.intake_data)

    def sort(self, prediction_type_flag):
        '''
        Sorts the data by its class/target value. We can assume all labels are the last index of an example.
        When prediction_type_flag is "regression" the last column is compared as floats; for any other
        value it is compared as stored.
        '''
        labels = self.intake_data[:, -1]
        if prediction_type_flag == "regression":
            # float64 rather than float32 so close target values are not collapsed into ties
            order = labels.astype(float).argsort()
        else:
            order = labels.argsort()
        self.intake_data = self.intake_data[order]

    def split(self):
        '''
        Puts every tenth example (indices 0, 10, 20, ...) into self.tune_set and the remaining examples
        into self.ninety_data. When the data was sorted first, both arrays stay sorted and stratified.
        The ninety_data still needs to be separated into partitions with fold().

        Both arrays are rebuilt from scratch, so calling split() again is safe.
        '''
        data = np.asarray(self.intake_data)
        goes_to_tune = np.arange(len(data)) % 10 == 0
        self.tune_set = data[goes_to_tune]
        self.ninety_data = data[~goes_to_tune]

    def fold(self, num_folds: int = 10):
        '''
        This method folds self.ninety_data into num_folds stratified partitions and stores them in
        self.validate_set with shape (num_folds, examples per fold, # of attributes).

        Examples are dealt out round-robin (example i goes to fold i % num_folds). Folds that receive
        one example fewer than the largest fold end with a single all-NaN padding row.

        Raises ValueError if there is nothing to fold or num_folds is smaller than 1.
        '''
        if num_folds < 1:
            raise ValueError("num_folds must be at least 1")
        data = np.asarray(self.ninety_data)
        if data.ndim != 2 or len(data) == 0:
            raise ValueError("fold() needs a non-empty 2-D self.ninety_data; call split() first")

        # Ceiling division: no extra padding row when the examples divide evenly
        rows_per_fold = -(-len(data) // num_folds)
        self.validate_set = np.full((num_folds, rows_per_fold, data.shape[1]), np.nan)

        # splits data into folds
        for fold_index in range(num_folds):
            members = data[fold_index::num_folds]
            self.validate_set[fold_index, :len(members)] = members

    def shuffle_splits(self):
        '''
        Shuffles the tune set and each partition of the validate set after they are complete and stratified.
        NaN padding rows are shuffled along with the real examples, so they can end up anywhere in a partition.
        '''
        np.random.shuffle(self.tune_set)
        for partition in self.validate_set:
            np.random.shuffle(partition)

    def remove_attribute(self, indice=0):
        '''
        Takes in an attribute index, and removes that entire column from the dataset. This can be used to
        remove ID numbers.
        '''
        self.intake_data = np.delete(self.intake_data, indice, 1)

    def save(self, filename: str, folder_path=None):
        '''
        Saves the tune set and validation set to csv files for inspection purposes.

        filename: prefix of the two files written, <filename>_tune_set.csv and <filename>_validate_set.csv.
        folder_path: directory to write into (created if missing). Defaults to
            ~/CSCI_447/Project_3/Datasets/processed_data.

        The validation file starts with a "shape" row, followed by one row per fold in which every cell
        holds one example with its attribute values joined by ';'.
        '''
        # get/create the path to the folder that the file should be saved to
        if folder_path is None:
            folder_path = os.path.expanduser("~/CSCI_447/Project_3/Datasets/processed_data")
        os.makedirs(folder_path, exist_ok=True)
        tune_file_path = os.path.join(folder_path, (filename+'_tune_set.csv'))
        validate_file_path = os.path.join(folder_path, (filename+'_validate_set.csv'))

        # save the tune set
        with open(tune_file_path, mode='w', newline='') as file:
            csv.writer(file).writerows(self.tune_set)

        # save the validation set
        folds = np.asarray(self.validate_set)
        with open(validate_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["shape"] + list(folds.shape))
            # Join the attribute VALUES of each example; joining str(example) would put a ';'
            # between every character of the array's printed form.
            writer.writerows(
                [';'.join(map(str, example)) for example in partition]
                for partition in folds
            )

    def extract(self, file_path: str):
        '''
        Loads the tune set and validation set back from the csv files written by save().

        file_path: the directory plus filename prefix, i.e. the files read are
            <file_path>_tune_set.csv and <file_path>_validate_set.csv.

        Both sets are restored as arrays of strings; convert with .astype(float) if numbers are needed.
        Raises ValueError if the validation file is empty.
        '''
        tune_file_path = file_path+'_tune_set.csv'
        validate_file_path = file_path+'_validate_set.csv'

        # extract the tune set
        with open(tune_file_path, mode='r', newline='') as file:
            rows = list(csv.reader(file))
        self.tune_set = np.array(rows, dtype=str)

        # extract the validate set
        with open(validate_file_path, mode='r', newline='') as file:
            rows = list(csv.reader(file))
        if not rows:
            raise ValueError(f"{validate_file_path} is empty; expected a shape header row")
        # First row is ["shape", dim0, dim1, ...]
        shape_info = tuple(map(int, rows[0][1:]))
        reconstructed_data = [[cell.split(';') for cell in row] for row in rows[1:]]
        self.validate_set = np.array(reconstructed_data, dtype=str).reshape(shape_info)

In [103]:
# Account name whose home directory holds the raw Project_2 .data files
user = 'carlthedog3'

# Load each raw dataset; processed_flag=False makes the constructor read the file.
abalone_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/abalone.data',
    processed_flag=False,
)
cancer_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/breast-cancer-wisconsin.data',
    processed_flag=False,
)
fire_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/forestfires.data',
    processed_flag=False,
)
glass_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/glass.data',
    processed_flag=False,
)
machine_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/machine.data',
    processed_flag=False,
)
soybean_data = dataset(
    data_path='/home/'+user+'/CSCI_447/Project_2/Datasets/soybean-small.data',
    processed_flag=False,
)

In [104]:
# Abalone pipeline: convert every value to a float, then shuffle, sort by the
# last column as a regression target, hold out every 10th example as the tune
# set, deal the rest into folds, and shuffle within the tune set and each fold.
# The two prints show the encoded data and the final folded validation set.
abalone_data.oh_encode()
print(abalone_data.intake_data)
abalone_data.shuffle()
abalone_data.sort('regression')
abalone_data.split()
abalone_data.fold()
abalone_data.shuffle_splits()
print(abalone_data.validate_set)

[[ 0.      0.455   0.365  ...  0.101   0.15   15.    ]
 [ 0.      0.35    0.265  ...  0.0485  0.07    7.    ]
 [ 1.      0.53    0.42   ...  0.1415  0.21    9.    ]
 ...
 [ 0.      0.6     0.475  ...  0.2875  0.308   9.    ]
 [ 1.      0.625   0.485  ...  0.261   0.296  10.    ]
 [ 0.      0.71    0.555  ...  0.3765  0.495  12.    ]]
[[[0.000e+00 4.250e-01 3.250e-01 ... 1.065e-01 1.050e-01 9.000e+00]
  [2.000e+00 4.400e-01 3.500e-01 ... 8.300e-02 1.250e-01 1.200e+01]
  [0.000e+00 4.900e-01 3.850e-01 ... 1.240e-01 1.695e-01 8.000e+00]
  ...
  [1.000e+00 6.300e-01 4.850e-01 ... 2.485e-01 3.400e-01 1.000e+01]
  [1.000e+00 5.100e-01 3.800e-01 ... 1.355e-01 1.850e-01 1.300e+01]
  [2.000e+00 4.050e-01 3.100e-01 ... 6.000e-02 8.700e-02 8.000e+00]]

 [[2.000e+00 4.200e-01 3.200e-01 ... 9.200e-02 1.000e-01 5.000e+00]
  [0.000e+00 5.000e-01 3.850e-01 ... 1.250e-01 2.350e-01 1.400e+01]
  [0.000e+00 4.950e-01 4.000e-01 ... 1.155e-01 3.500e-01 6.000e+00]
  ...
  [0.000e+00 6.250e-01 4.950e-01 ... 3

In [105]:
# Cancer pipeline. Missing values ('?') have to be imputed while the data is
# still text: oh_encode() turns every non-numeric string, '?' included, into an
# ordinary integer code, after which impute() finds nothing to replace.
cancer_data.impute()
cancer_data.oh_encode()
# Shuffle, sort by the class label, hold out every 10th example as the tune
# set, deal the rest into folds, and shuffle within the tune set and each fold.
cancer_data.shuffle()
cancer_data.sort('classification')
cancer_data.split()
cancer_data.fold()
cancer_data.shuffle_splits()

In [106]:
# Forest-fires pipeline: convert every value to a float (non-numeric strings
# become integer codes), then shuffle, sort by the last column as a regression
# target, hold out every 10th example as the tune set, fold the rest, and
# shuffle within the tune set and each fold.
fire_data.oh_encode()
fire_data.shuffle()
fire_data.sort('regression')
fire_data.split()
fire_data.fold()
fire_data.shuffle_splits()

In [107]:
# Glass pipeline: convert every value to a float, then shuffle, sort by the
# class label in the last column, hold out every 10th example as the tune set,
# fold the rest, and shuffle within the tune set and each fold.
glass_data.oh_encode()
glass_data.shuffle()
glass_data.sort('classification')
glass_data.split()
glass_data.fold()
glass_data.shuffle_splits()

In [108]:
# Machine pipeline: convert every value to a float (non-numeric strings become
# integer codes), then shuffle, sort by the last column as a regression target,
# hold out every 10th example as the tune set, fold the rest, and shuffle
# within the tune set and each fold.
machine_data.oh_encode()
machine_data.shuffle()
machine_data.sort('regression')
machine_data.split()
machine_data.fold()
machine_data.shuffle_splits()

In [109]:
# Soybean pipeline: convert every value to a float (non-numeric strings become
# integer codes), then shuffle, sort by the class label in the last column,
# hold out every 10th example as the tune set, fold the rest, and shuffle
# within the tune set and each fold.
soybean_data.oh_encode()
soybean_data.shuffle()
soybean_data.sort('classification')
soybean_data.split()
soybean_data.fold()
soybean_data.shuffle_splits()

In [110]:
# Write each dataset's tune set and validation set to csv, one file pair per name.
for output_name, processed in (
    ('abalone', abalone_data),
    ('cancer', cancer_data),
    ('fire', fire_data),
    ('glass', glass_data),
    ('machine', machine_data),
    ('soybean', soybean_data),
):
    processed.save(output_name)

In [111]:
# Display the abalone tune set and folded validation set for inspection.
print(f"Abalone Data:\nTune Set: {abalone_data.tune_set}\nValidate Set: {abalone_data.validate_set}\n\n")

Abalone Data:
Tune Set: [[ 2.      0.44    0.32   ...  0.074   0.12    9.    ]
 [ 2.      0.34    0.26   ...  0.0525  0.055   6.    ]
 [ 0.      0.68    0.54   ...  0.3235  0.4285 11.    ]
 ...
 [ 1.      0.57    0.46   ...  0.2205  0.38   14.    ]
 [ 2.      0.435   0.33   ...  0.0945  0.11    7.    ]
 [ 0.      0.505   0.395  ...  0.1315  0.185  12.    ]]
Validate Set: [[[0.000e+00 4.250e-01 3.250e-01 ... 1.065e-01 1.050e-01 9.000e+00]
  [2.000e+00 4.400e-01 3.500e-01 ... 8.300e-02 1.250e-01 1.200e+01]
  [0.000e+00 4.900e-01 3.850e-01 ... 1.240e-01 1.695e-01 8.000e+00]
  ...
  [1.000e+00 6.300e-01 4.850e-01 ... 2.485e-01 3.400e-01 1.000e+01]
  [1.000e+00 5.100e-01 3.800e-01 ... 1.355e-01 1.850e-01 1.300e+01]
  [2.000e+00 4.050e-01 3.100e-01 ... 6.000e-02 8.700e-02 8.000e+00]]

 [[2.000e+00 4.200e-01 3.200e-01 ... 9.200e-02 1.000e-01 5.000e+00]
  [0.000e+00 5.000e-01 3.850e-01 ... 1.250e-01 2.350e-01 1.400e+01]
  [0.000e+00 4.950e-01 4.000e-01 ... 1.155e-01 3.500e-01 6.000e+00]
  ...


In [112]:
# Display the cancer tune set and folded validation set for inspection.
print(f"Cancer Data:\nTune Set:\n{cancer_data.tune_set}\nValidate Set:\n{cancer_data.validate_set}\n\n")

Cancer Data:
Tune Set:
[[1.182410e+06 3.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [1.212422e+06 3.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00 1.000000e+00 3.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [8.306900e+05 5.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
  3.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00 1.000000e+00
  2.000000e+00]
 [1.268275e+06 9.000000e+00 8.000000e+00 8.000000e+00 9.000000e+00
  6.000000e+00 3.000000e+00 4.000000e+00 1.000000e+00 1.000000e+00
  4.000000e+00]
 [1.201936e+06 5.000000e+00 1.000000e+01 1.000000e+01 3.000000e+00
  8.000000e+00 1.000000e+00 5.000000e+00 1.000000e+01 3.000000e+00
  4.000000e+00]
 [1.125035e+06 9.000000e+00 4.000000e+00 5.000000e+00 1.000000e+01
  6.000000e+00 1.000000e+01 4.000000e+00 8.000000e+00 1.000000e+00
  4.000000e+00]
 [5.665090e+05 5.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000

In [113]:
# Display the forest-fires tune set and folded validation set for inspection.
print(f"Fire Data:\nTune Set:\n{fire_data.tune_set}\nValidate Set:\n{fire_data.validate_set}\n\n")

Fire Data:
Tune Set:
[[7.0000e+00 4.0000e+00 2.7000e+01 2.0000e+01 8.4700e+01 9.5000e+00
  5.8300e+01 4.1000e+00 7.5000e+00 7.1000e+01 6.3000e+00 0.0000e+00
  9.9600e+00]
 [4.0000e+00 5.0000e+00 2.1000e+01 1.8000e+01 9.1000e+01 2.7630e+02
  8.2510e+02 7.1000e+00 1.3800e+01 7.7000e+01 7.6000e+00 0.0000e+00
  0.0000e+00]
 [4.0000e+00 5.0000e+00 2.1000e+01 1.6000e+01 9.1100e+01 1.3230e+02
  8.1210e+02 1.2500e+01 1.5900e+01 3.8000e+01 5.4000e+00 0.0000e+00
  1.7500e+00]
 [4.0000e+00 6.0000e+00 2.1000e+01 2.6000e+01 9.3700e+01 8.0900e+01
  6.8520e+02 1.7900e+01 1.7600e+01 4.2000e+01 3.1000e+00 0.0000e+00
  0.0000e+00]
 [6.0000e+00 5.0000e+00 2.4000e+01 2.0000e+01 9.0400e+01 9.3300e+01
  2.9810e+02 7.5000e+00 2.0700e+01 2.5000e+01 4.9000e+00 0.0000e+00
  0.0000e+00]
 [3.0000e+00 4.0000e+00 2.1000e+01 2.0000e+01 9.1800e+01 7.8500e+01
  7.2430e+02 9.2000e+00 1.8900e+01 3.5000e+01 2.7000e+00 0.0000e+00
  0.0000e+00]
 [8.0000e+00 5.0000e+00 2.1000e+01 1.8000e+01 8.9700e+01 9.0000e+01
  7.0440e+0

In [114]:
# Display the glass tune set and folded validation set for inspection.
print(f"Glass Data:\nTune Set:\n{glass_data.tune_set}\nValidate Set:\n{glass_data.validate_set}\n\n")

Glass Data:
Tune Set:
[[1.03000e+02 1.51820e+00 1.26200e+01 2.76000e+00 8.30000e-01 7.38100e+01
  3.50000e-01 9.42000e+00 0.00000e+00 2.00000e-01 2.00000e+00]
 [2.07000e+02 1.51645e+00 1.49400e+01 0.00000e+00 1.87000e+00 7.31100e+01
  0.00000e+00 8.67000e+00 1.38000e+00 0.00000e+00 7.00000e+00]
 [7.70000e+01 1.51645e+00 1.34400e+01 3.61000e+00 1.54000e+00 7.23900e+01
  6.60000e-01 8.03000e+00 0.00000e+00 0.00000e+00 2.00000e+00]
 [1.14000e+02 1.51892e+00 1.34600e+01 3.83000e+00 1.26000e+00 7.25500e+01
  5.70000e-01 8.21000e+00 0.00000e+00 1.40000e-01 2.00000e+00]
 [1.75000e+02 1.52058e+00 1.28500e+01 1.61000e+00 2.17000e+00 7.21800e+01
  7.60000e-01 9.70000e+00 2.40000e-01 5.10000e-01 5.00000e+00]
 [3.60000e+01 1.51567e+00 1.32900e+01 3.45000e+00 1.21000e+00 7.27400e+01
  5.60000e-01 8.57000e+00 0.00000e+00 0.00000e+00 1.00000e+00]
 [4.30000e+01 1.51779e+00 1.32100e+01 3.39000e+00 1.33000e+00 7.27600e+01
  5.90000e-01 8.59000e+00 0.00000e+00 0.00000e+00 1.00000e+00]
 [1.05000e+02 1.524

In [115]:
# Display the machine tune set and folded validation set for inspection.
print(f"Machine Data:\nTune Set:\n{machine_data.tune_set}\nValidate Set:\n{machine_data.validate_set}\n\n")

Machine Data:
Tune Set:
[[8.700e+01 9.800e+01 1.400e+02 2.000e+03 3.200e+04 3.200e+01 1.000e+00
  5.400e+01 1.410e+02 1.810e+02]
 [1.850e+02 1.860e+02 7.000e+01 4.000e+03 1.200e+04 8.000e+00 6.000e+00
  8.000e+00 7.500e+01 6.700e+01]
 [1.300e+02 4.445e+03 5.000e+01 2.000e+03 8.000e+03 8.000e+00 1.000e+00
  6.000e+00 5.600e+01 4.400e+01]
 [2.800e+01 3.300e+01 3.200e+02 5.120e+02 5.000e+03 4.000e+00 1.000e+00
  5.000e+00 7.700e+01 2.800e+01]
 [8.600e+01 3.000e+02 3.000e+02 7.680e+02 3.000e+03 6.000e+00 6.000e+00
  2.400e+01 4.400e+01 2.500e+01]
 [7.400e+01 7.600e+01 7.500e+01 2.000e+03 1.600e+04 6.400e+01 1.000e+00
  3.800e+01 1.440e+02 1.130e+02]
 [1.900e+01 2.600e+01 1.430e+02 2.300e+03 6.200e+03 0.000e+00 6.000e+00
  6.400e+01 6.100e+01 4.000e+01]
 [1.010e+02 1.020e+02 5.700e+01 4.000e+03 1.600e+04 1.000e+00 6.000e+00
  1.200e+01 1.320e+02 8.200e+01]
 [1.010e+02 8.140e+03 8.000e+02 7.680e+02 2.000e+03 0.000e+00 0.000e+00
  0.000e+00 2.000e+01 2.000e+01]
 [8.600e+01 1.000e+02 3.000e+02

In [116]:
# Display the soybean tune set and folded validation set for inspection.
print(f"Soybean Data:\nTune Set:\n{soybean_data.tune_set}\nValidate Set:\n{soybean_data.validate_set}\n\n")

Soybean Data:
Tune Set:
[[3. 0. 2. 1. 0. 2. 1. 1. 0. 1. 1. 1. 0. 2. 2. 0. 0. 0. 1. 1. 3. 0. 1. 1.
  0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0.]
 [3. 0. 0. 1. 0. 1. 2. 1. 0. 0. 1. 1. 0. 2. 2. 0. 0. 0. 1. 0. 0. 3. 0. 0.
  0. 2. 1. 0. 4. 0. 0. 0. 0. 0. 0. 1.]
 [1. 1. 2. 0. 0. 3. 1. 1. 1. 2. 1. 1. 0. 2. 2. 0. 0. 0. 1. 0. 2. 2. 0. 0.
  0. 0. 0. 3. 4. 0. 0. 0. 0. 0. 1. 3.]
 [0. 1. 2. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 2. 2. 0. 0. 0. 1. 0. 1. 2. 0. 1.
  0. 0. 0. 3. 4. 0. 0. 0. 0. 0. 1. 3.]
 [4. 0. 2. 0. 1. 0. 1. 2. 0. 2. 1. 1. 0. 2. 2. 0. 0. 0. 1. 1. 1. 1. 0. 1.
  1. 0. 0. 3. 4. 0. 0. 0. 0. 0. 0. 2.]]
Validate Set:
[[[ 0.  1.  2. ...  0.  0.  2.]
  [ 4.  0.  2. ...  0.  0.  0.]
  [ 6.  0.  0. ...  0.  0.  1.]
  [ 2.  1.  2. ...  0.  1.  3.]
  [ 3.  1.  2. ...  0.  1.  3.]]

 [[ 6.  0.  2. ...  0.  0.  0.]
  [ 1.  1.  2. ...  0.  1.  3.]
  [ 3.  1.  1. ...  0.  1.  3.]
  [ 5.  0.  0. ...  0.  0.  1.]
  [ 0.  1.  2. ...  0.  0.  2.]]

 [[ 2.  1.  1. ...  0.  1.  3.]
  [nan nan nan ... nan nan nan]
  [ 3.  