In [1]:
import os
import numpy as np
import random
import csv

In [1]:
# All files were developed collaboratively

class dataset:
    '''
    Loads a comma-separated data file and carries it through pre-processing.

    The intended call order is: impute() while the data is still text, oh_encode() to make
    everything numeric, optionally remove_attribute() / normalize(), then shuffle(), sort(),
    split(), fold() and shuffle_splits(). save() / extract() write and read the resulting
    tune and validation sets as CSV files.
    '''

    def __init__(self, data_path: str, processed_flag: bool):
        '''
        Initializes the instance variables and, unless processed_flag is truthy, loads the raw file.

        data_path: path to a comma-separated text file with one example per line.
        processed_flag: pass False to read data_path now; pass a truthy value to skip loading
            (for example when the sets will be filled in later by extract()).
        '''
        # Raw examples, one row per example; the label is assumed to be the last column
        self.intake_data = []
        # Every tenth example, held out for tuning (filled by split())
        self.tune_set = []
        # Ten folds of the remaining examples (filled by fold())
        self.validate_set = []
        # The examples that were not placed in the tune set (filled by split())
        self.ninety_data = []

        if not processed_flag:
            with open(data_path, 'r') as file:
                # Blank lines (e.g. a trailing newline at the end of the file) are skipped so
                # that every row has the same number of fields and forms a rectangular array
                rows = [line.strip().split(',') for line in file if line.strip()]
            self.intake_data = np.array(rows)

    @staticmethod
    def _min_max(values, axis=None):
        '''
        Min-max scales values into [0, 1] along axis; constant inputs map to 0 instead of NaN.
        '''
        low = values.min(axis=axis)
        span = values.max(axis=axis) - low
        # A zero span means every value is identical; dividing by 1 leaves (values - low) == 0
        safe_span = np.where(span == 0, 1.0, span)
        return (values - low) / safe_span

    def normalize(self, prediction_type: str):
        '''
        Min-max normalizes every feature column (all columns except the last) into [0, 1].

        The last column is treated as the label. When prediction_type is "regression" the label
        is min-max normalized as well; for any other value the distinct labels are renumbered
        0, 1, 2, ... in sorted order. The data must already be numeric (see oh_encode()).
        '''
        data = np.asarray(self.intake_data)
        features = data[:, :-1].astype(float)
        labels = data[:, -1]

        normalized_features = self._min_max(features, axis=0)

        if prediction_type == "regression":
            normalized_labels = self._min_max(labels.astype(float))
        else:
            # return_inverse gives, for each label, its index in the sorted unique labels
            _, normalized_labels = np.unique(labels, return_inverse=True)

        self.intake_data = np.hstack((normalized_features, normalized_labels.reshape(-1, 1)))

    def oh_encode(self):
        '''
        Converts every item of the data array to a float.

        Items that parse as numbers are converted directly. Any other string is replaced by an
        integer code, assigned in order of first appearance and shared across all columns.
        Despite the method name this is a label encoding, not a one-hot encoding.
        Note that a '?' placeholder is a non-numeric string too, so impute() has to run first.
        '''
        string_to_int = {}
        next_int = 0

        # This function continuizes a single element so it can be vectorized
        def convert_to_num(value):
            nonlocal next_int
            try:
                return float(value)
            except ValueError:
                # Not a number: hand out the next unused code the first time a string is seen
                if value not in string_to_int:
                    string_to_int[value] = next_int
                    next_int += 1
                return string_to_int[value]

        # otypes is given explicitly so np.vectorize does not make an extra probing call,
        # which would otherwise assign a code out of order
        vectorization = np.vectorize(convert_to_num, otypes=[float])
        self.intake_data = vectorization(self.intake_data)

    def impute(self):
        '''
        Replaces every '?' in the dataset with a random integer between 1 and 10 (as a string).

        This only has an effect while the data is still text, so call it before oh_encode().
        '''
        data = np.asarray(self.intake_data)
        if data.dtype.kind not in 'UO':
            # Numeric arrays cannot contain the '?' placeholder
            return
        missing = data == '?'
        num_missing = int(np.count_nonzero(missing))
        if num_missing == 0:
            return
        # A one-character string dtype would silently truncate '10' to '1', so widen it first
        if data.dtype.kind == 'U' and data.dtype.itemsize < np.dtype('U2').itemsize:
            data = data.astype('U2')
        # Boolean-mask assignment fills the cells in row-major order, one random draw per cell
        data[missing] = [str(random.randint(1, 10)) for _ in range(num_missing)]
        self.intake_data = data

    def shuffle(self):
        '''
        Shuffles self.intake_data in place by examples (rows).
        '''
        np.random.shuffle(self.intake_data)

    def sort(self, prediction_type_flag):
        '''
        Sorts the examples by their class/target value, which is assumed to be the last column.

        For "regression" the last column is compared as floats; for any other flag it is
        compared using the array's own dtype.
        '''
        targets = self.intake_data[:, -1]
        if prediction_type_flag == "regression":
            order = targets.astype(np.float32).argsort()
        else:
            order = targets.argsort()
        self.intake_data = self.intake_data[order]

    def split(self):
        '''
        Puts every tenth example (indices 0, 10, 20, ...) into self.tune_set and the remaining
        examples into self.ninety_data. When the data has been sorted first, both arrays stay
        sorted and stratified. self.ninety_data still needs to be partitioned with fold().

        The method rebuilds both arrays from self.intake_data, so it is safe to call repeatedly.
        '''
        data = np.asarray(self.intake_data)
        is_tune = np.arange(len(data)) % 10 == 0
        # Boolean indexing copies, so shuffling the splits later does not touch intake_data
        self.tune_set = data[is_tune]
        self.ninety_data = data[~is_tune]

    def fold(self):
        '''
        Deals self.ninety_data round-robin into 10 partitions stored in self.validate_set.

        The result has shape (10, len(ninety_data) // 10 + 1, number of attributes); slots that
        receive no example are left as rows of NaN.
        '''
        data = np.asarray(self.ninety_data)
        num_folds = 10
        rows_per_fold = (len(data) // num_folds) + 1
        self.validate_set = np.full((num_folds, rows_per_fold, data.shape[1]), np.nan)

        for fold_index in range(num_folds):
            # Example i belongs to fold i % 10, so each fold is a strided slice of the data
            members = data[fold_index::num_folds]
            self.validate_set[fold_index, :len(members)] = members

    def shuffle_splits(self):
        '''
        Shuffles the tune set and each validation partition in place, row-wise.
        '''
        np.random.shuffle(self.tune_set)
        for partition in self.validate_set:
            np.random.shuffle(partition)

    def remove_attribute(self, indice=0):
        '''
        Removes the column at the given index from self.intake_data. This can be used to remove ID numbers.
        '''
        self.intake_data = np.delete(self.intake_data, indice, 1)

    def save(self, filename: str, folder_path: str = None):
        '''
        Saves the tune set and validation set to CSV files for inspection purposes.

        filename: prefix of the output files; '_tune_set.csv' and '_validate_set.csv' are appended.
        folder_path: destination folder, created if missing. Defaults to
            ~/CSCI_447/Project_3/Datasets/processed_data.

        The validation file starts with a "shape" header row. Each following row is one fold,
        and each cell holds one example with its attribute values joined by ';'.
        '''
        if folder_path is None:
            folder_path = os.path.expanduser("~/CSCI_447/Project_3/Datasets/processed_data")
        os.makedirs(folder_path, exist_ok=True)
        tune_file_path = os.path.join(folder_path, (filename+'_tune_set.csv'))
        validate_file_path = os.path.join(folder_path, (filename+'_validate_set.csv'))

        # save the tune set
        with open(tune_file_path, mode='w', newline='') as file:
            csv.writer(file).writerows(self.tune_set)

        # save the validation set
        validate_set = np.asarray(self.validate_set)
        # Join the individual values of an example, not the characters of its printed form
        packed_folds = [
            [';'.join(str(value) for value in example) for example in partition]
            for partition in validate_set
        ]
        with open(validate_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["shape"] + list(validate_set.shape))
            writer.writerows(packed_folds)

    def extract(self, file_path: str):
        '''
        Loads the tune and validation sets written by save() back into numpy arrays of strings.

        file_path: folder plus file prefix, i.e. the save() output path without the
            '_tune_set.csv' / '_validate_set.csv' suffix.
        '''
        tune_file_path = file_path+'_tune_set.csv'
        validate_file_path = file_path+'_validate_set.csv'

        # extract the tune set
        with open(tune_file_path, mode='r', newline='') as file:
            rows = list(csv.reader(file))
        self.tune_set = np.array(rows, dtype=str)

        # extract the validate set
        with open(validate_file_path, mode='r', newline='') as file:
            rows = list(csv.reader(file))
        # The first row is the "shape" header; everything after the label cell is a dimension
        shape_info = tuple(map(int, rows[0][1:]))
        reconstructed_data = [[cell.split(';') for cell in row] for row in rows[1:]]
        self.validate_set = np.array(reconstructed_data, dtype=str).reshape(shape_info)

IndentationError: expected an indented block after 'if' statement on line 32 (498359540.py, line 34)

In [3]:
# Account whose home directory holds the raw .data files
user = 'carlthedog3'
raw_dir = '/home/' + user + '/CSCI_447/Project_2/Datasets/'

# Passing False as processed_flag makes each constructor read its file immediately
abalone_data = dataset(raw_dir + 'abalone.data', False)
cancer_data = dataset(raw_dir + 'breast-cancer-wisconsin.data', False)
fire_data = dataset(raw_dir + 'forestfires.data', False)
glass_data = dataset(raw_dir + 'glass.data', False)
machine_data = dataset(raw_dir + 'machine.data', False)
soybean_data = dataset(raw_dir + 'soybean-small.data', False)

In [4]:
# Abalone (regression) pre-processing; the steps are order-dependent.
# Make every value numeric (non-numeric strings become integer codes)
abalone_data.oh_encode()
print(abalone_data.intake_data)
# Randomize the row order, then sort by the target in the last column
abalone_data.shuffle()
abalone_data.sort('regression')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
abalone_data.split()
abalone_data.fold()
# Shuffle rows inside the tune set and inside each fold
abalone_data.shuffle_splits()
print(abalone_data.validate_set)

[[ 0.      0.455   0.365  ...  0.101   0.15   15.    ]
 [ 0.      0.35    0.265  ...  0.0485  0.07    7.    ]
 [ 1.      0.53    0.42   ...  0.1415  0.21    9.    ]
 ...
 [ 0.      0.6     0.475  ...  0.2875  0.308   9.    ]
 [ 1.      0.625   0.485  ...  0.261   0.296  10.    ]
 [ 0.      0.71    0.555  ...  0.3765  0.495  12.    ]]
[[[ 2.      0.325   0.25   ...  0.0345  0.049   7.    ]
  [ 0.      0.595   0.48   ...  0.1825  0.289   9.    ]
  [ 2.      0.435   0.335  ...  0.058   0.115   7.    ]
  ...
  [ 2.      0.475   0.35   ...  0.099   0.14    7.    ]
  [ 1.      0.49    0.38   ...  0.1075  0.174  10.    ]
  [ 2.      0.4     0.3    ...  0.071   0.075   6.    ]]

 [[ 2.      0.4     0.315  ...  0.0735  0.091   6.    ]
  [ 1.      0.435   0.35   ...  0.1005  0.13    7.    ]
  [ 1.      0.585   0.42   ...  0.2155  0.2875 13.    ]
  ...
  [ 0.      0.595   0.47   ...  0.2325  0.3345  9.    ]
  [ 2.      0.465   0.345  ...  0.0345  0.109   6.    ]
  [ 2.      0.375   0.285  ...  0.

In [5]:
# Breast-cancer (classification) pre-processing; the steps are order-dependent.
# Missing values ('?') have to be imputed while the data is still text: oh_encode()
# would otherwise turn '?' into an ordinary category code and impute() would find nothing.
cancer_data.impute()
# Make every value numeric
cancer_data.oh_encode()
# NOTE(review): the first column of the printed tune set looks like a sample ID;
# consider cancer_data.remove_attribute(0) before splitting — confirm against the data description.
# Randomize the row order, then sort by the class label in the last column
cancer_data.shuffle()
cancer_data.sort('classification')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
cancer_data.split()
cancer_data.fold()
# Shuffle rows inside the tune set and inside each fold
cancer_data.shuffle_splits()
print(cancer_data.tune_set)

[[8.322260e+05 3.000000e+00 4.000000e+00 4.000000e+00 1.000000e+01
  5.000000e+00 1.000000e+00 3.000000e+00 3.000000e+00 1.000000e+00
  4.000000e+00]
 [1.182404e+06 4.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [7.308810e+05 7.000000e+00 6.000000e+00 3.000000e+00 2.000000e+00
  5.000000e+00 1.000000e+01 7.000000e+00 4.000000e+00 6.000000e+00
  4.000000e+00]
 [1.115282e+06 5.000000e+00 3.000000e+00 5.000000e+00 5.000000e+00
  3.000000e+00 3.000000e+00 4.000000e+00 1.000000e+01 1.000000e+00
  4.000000e+00]
 [8.508310e+05 2.000000e+00 7.000000e+00 1.000000e+01 1.000000e+01
  7.000000e+00 1.000000e+01 4.000000e+00 9.000000e+00 4.000000e+00
  4.000000e+00]
 [1.017023e+06 4.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00
  2.000000e+00 1.000000e+00 3.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [7.984290e+05 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00 1.000000e+00 3.000

In [6]:
# Forest-fires (regression) pre-processing; the steps are order-dependent.
# Make every value numeric (non-numeric strings become integer codes)
fire_data.oh_encode()
# Randomize the row order, then sort by the target in the last column
fire_data.shuffle()
fire_data.sort('regression')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
fire_data.split()
fire_data.fold()
# Shuffle rows inside the tune set and inside each fold
fire_data.shuffle_splits()

In [7]:
# Glass (classification) pre-processing; the steps are order-dependent.
# NOTE(review): the first column of the printed tune set looks like a row ID;
# consider glass_data.remove_attribute(0) — confirm against the data description.
# Make every value numeric
glass_data.oh_encode()
# Randomize the row order, then sort by the class label in the last column
glass_data.shuffle()
glass_data.sort('classification')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
glass_data.split()
glass_data.fold()
# Shuffle rows inside the tune set and inside each fold
glass_data.shuffle_splits()

In [8]:
# Machine (regression) pre-processing; the steps are order-dependent.
# Make every value numeric (non-numeric strings become integer codes)
machine_data.oh_encode()
# Randomize the row order, then sort by the target in the last column
machine_data.shuffle()
machine_data.sort('regression')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
machine_data.split()
machine_data.fold()
# Shuffle rows inside the tune set and inside each fold
machine_data.shuffle_splits()

In [9]:
# Soybean (classification) pre-processing; the steps are order-dependent.
# Make every value numeric (non-numeric strings become integer codes)
soybean_data.oh_encode()
# Randomize the row order, then sort by the class label in the last column
soybean_data.shuffle()
soybean_data.sort('classification')
# Every tenth sorted row goes to the tune set; the rest are dealt into 10 folds
soybean_data.split()
soybean_data.fold()
# Shuffle rows inside the tune set and inside each fold
soybean_data.shuffle_splits()

In [10]:
# Write each dataset's tune and validation sets to CSV under its own file prefix
for prefix, processed in (
    ('abalone', abalone_data),
    ('cancer', cancer_data),
    ('fire', fire_data),
    ('glass', glass_data),
    ('machine', machine_data),
    ('soybean', soybean_data),
):
    processed.save(prefix)

In [11]:
# Inspect the abalone tune set and the folded validation set
print(f"Abalone Data:\nTune Set: {abalone_data.tune_set}\nValidate Set: {abalone_data.validate_set}\n\n")

Abalone Data:
Tune Set: [[ 0.      0.63    0.505  ...  0.252   0.34   12.    ]
 [ 2.      0.375   0.275  ...  0.05    0.0605  7.    ]
 [ 1.      0.53    0.395  ...  0.1375  0.161   9.    ]
 ...
 [ 1.      0.68    0.55   ...  0.3655  0.515  11.    ]
 [ 0.      0.645   0.5    ...  0.278   0.395  17.    ]
 [ 2.      0.44    0.305  ...  0.091   0.11    9.    ]]
Validate Set: [[[ 2.      0.325   0.25   ...  0.0345  0.049   7.    ]
  [ 0.      0.595   0.48   ...  0.1825  0.289   9.    ]
  [ 2.      0.435   0.335  ...  0.058   0.115   7.    ]
  ...
  [ 2.      0.475   0.35   ...  0.099   0.14    7.    ]
  [ 1.      0.49    0.38   ...  0.1075  0.174  10.    ]
  [ 2.      0.4     0.3    ...  0.071   0.075   6.    ]]

 [[ 2.      0.4     0.315  ...  0.0735  0.091   6.    ]
  [ 1.      0.435   0.35   ...  0.1005  0.13    7.    ]
  [ 1.      0.585   0.42   ...  0.2155  0.2875 13.    ]
  ...
  [ 0.      0.595   0.47   ...  0.2325  0.3345  9.    ]
  [ 2.      0.465   0.345  ...  0.0345  0.109   6.  

In [12]:
# Inspect the cancer tune set and the folded validation set
print(f"Cancer Data:\nTune Set:\n{cancer_data.tune_set}\nValidate Set:\n{cancer_data.validate_set}\n\n")

Cancer Data:
Tune Set:
[[8.322260e+05 3.000000e+00 4.000000e+00 4.000000e+00 1.000000e+01
  5.000000e+00 1.000000e+00 3.000000e+00 3.000000e+00 1.000000e+00
  4.000000e+00]
 [1.182404e+06 4.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [7.308810e+05 7.000000e+00 6.000000e+00 3.000000e+00 2.000000e+00
  5.000000e+00 1.000000e+01 7.000000e+00 4.000000e+00 6.000000e+00
  4.000000e+00]
 [1.115282e+06 5.000000e+00 3.000000e+00 5.000000e+00 5.000000e+00
  3.000000e+00 3.000000e+00 4.000000e+00 1.000000e+01 1.000000e+00
  4.000000e+00]
 [8.508310e+05 2.000000e+00 7.000000e+00 1.000000e+01 1.000000e+01
  7.000000e+00 1.000000e+01 4.000000e+00 9.000000e+00 4.000000e+00
  4.000000e+00]
 [1.017023e+06 4.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00
  2.000000e+00 1.000000e+00 3.000000e+00 1.000000e+00 1.000000e+00
  2.000000e+00]
 [7.984290e+05 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
  2.000000

In [13]:
# Inspect the forest-fires tune set and the folded validation set
print(f"Fire Data:\nTune Set:\n{fire_data.tune_set}\nValidate Set:\n{fire_data.validate_set}\n\n")

Fire Data:
Tune Set:
[[3.0000e+00 4.0000e+00 2.5000e+01 2.0000e+01 9.4600e+01 1.6000e+02
  5.6720e+02 1.6700e+01 1.7900e+01 4.8000e+01 2.7000e+00 0.0000e+00
  0.0000e+00]
 [8.0000e+00 6.0000e+00 2.1000e+01 2.0000e+01 9.1900e+01 1.1170e+02
  7.7030e+02 6.5000e+00 1.5700e+01 5.1000e+01 2.2000e+00 0.0000e+00
  0.0000e+00]
 [5.0000e+00 4.0000e+00 2.1000e+01 1.4000e+01 9.2100e+01 9.9000e+01
  7.4530e+02 9.6000e+00 1.0100e+01 7.5000e+01 3.6000e+00 0.0000e+00
  0.0000e+00]
 [4.0000e+00 5.0000e+00 2.1000e+01 1.4000e+01 9.4300e+01 8.5100e+01
  6.9230e+02 1.5900e+01 1.7700e+01 3.7000e+01 3.6000e+00 0.0000e+00
  0.0000e+00]
 [6.0000e+00 5.0000e+00 1.9000e+01 2.2000e+01 9.2100e+01 1.1120e+02
  6.5410e+02 9.6000e+00 1.6600e+01 4.7000e+01 9.0000e-01 0.0000e+00
  2.2900e+00]
 [8.0000e+00 6.0000e+00 2.5000e+01 1.8000e+01 8.8900e+01 2.6310e+02
  7.9590e+02 5.2000e+00 2.9300e+01 2.7000e+01 3.6000e+00 0.0000e+00
  6.3000e+00]
 [6.0000e+00 5.0000e+00 2.3000e+01 2.6000e+01 8.1500e+01 9.1000e+00
  5.5200e+0

In [14]:
# Inspect the glass tune set and the folded validation set
print(f"Glass Data:\nTune Set:\n{glass_data.tune_set}\nValidate Set:\n{glass_data.validate_set}\n\n")

Glass Data:
Tune Set:
[[1.57000e+02 1.51655e+00 1.34100e+01 3.39000e+00 1.28000e+00 7.26400e+01
  5.20000e-01 8.65000e+00 0.00000e+00 0.00000e+00 3.00000e+00]
 [1.81000e+02 1.51299e+00 1.44000e+01 1.74000e+00 1.54000e+00 7.45500e+01
  0.00000e+00 7.59000e+00 0.00000e+00 0.00000e+00 6.00000e+00]
 [5.20000e+01 1.51926e+00 1.32000e+01 3.33000e+00 1.28000e+00 7.23600e+01
  6.00000e-01 9.14000e+00 0.00000e+00 1.10000e-01 1.00000e+00]
 [8.00000e+00 1.51756e+00 1.31500e+01 3.61000e+00 1.05000e+00 7.32400e+01
  5.70000e-01 8.24000e+00 0.00000e+00 0.00000e+00 1.00000e+00]
 [1.46000e+02 1.51839e+00 1.28500e+01 3.67000e+00 1.24000e+00 7.25700e+01
  6.20000e-01 8.68000e+00 0.00000e+00 3.50000e-01 2.00000e+00]
 [3.30000e+01 1.51775e+00 1.28500e+01 3.48000e+00 1.23000e+00 7.29700e+01
  6.10000e-01 8.56000e+00 9.00000e-02 2.20000e-01 1.00000e+00]
 [1.42000e+02 1.51851e+00 1.32000e+01 3.63000e+00 1.07000e+00 7.28300e+01
  5.70000e-01 8.41000e+00 9.00000e-02 1.70000e-01 2.00000e+00]
 [3.10000e+01 1.517

In [15]:
# Inspect the machine tune set and the folded validation set
print(f"Machine Data:\nTune Set:\n{machine_data.tune_set}\nValidate Set:\n{machine_data.validate_set}\n\n")

Machine Data:
Tune Set:
[[5.100e+01 5.200e+01 1.330e+02 1.000e+03 1.200e+04 9.000e+00 3.000e+00
  1.200e+01 7.200e+01 5.400e+01]
 [1.400e+02 1.540e+02 4.000e+01 8.000e+03 3.200e+04 6.400e+01 8.000e+00
  2.400e+01 2.770e+02 2.660e+02]
 [1.740e+02 1.770e+02 2.000e+02 2.000e+03 8.000e+03 6.400e+01 1.000e+00
  5.000e+00 4.100e+01 6.700e+01]
 [2.000e+00 6.000e+00 2.900e+01 8.000e+03 1.600e+04 3.200e+01 8.000e+00
  1.600e+01 1.320e+02 1.320e+02]
 [5.100e+01 5.500e+01 8.100e+02 1.000e+03 5.000e+03 0.000e+00 1.000e+00
  1.000e+00 2.000e+01 2.800e+01]
 [1.800e+01 5.000e+03 3.500e+02 6.400e+01 6.400e+01 0.000e+00 1.000e+00
  4.000e+00 1.000e+01 1.500e+01]
 [2.000e+00 9.000e+00 2.300e+01 1.600e+04 3.200e+04 6.400e+01 1.600e+01
  3.200e+01 4.890e+02 3.810e+02]
 [7.800e+01 8.300e+01 7.500e+01 2.000e+03 8.000e+03 8.000e+00 3.000e+00
  2.400e+01 6.200e+01 4.700e+01]
 [1.300e+02 4.445e+03 5.000e+01 2.000e+03 8.000e+03 8.000e+00 1.000e+00
  6.000e+00 5.600e+01 4.400e+01]
 [1.300e+02 4.446e+03 5.000e+01

In [16]:
# Inspect the soybean tune set and the folded validation set
print(f"Soybean Data:\nTune Set:\n{soybean_data.tune_set}\nValidate Set:\n{soybean_data.validate_set}\n\n")

Soybean Data:
Tune Set:
[[1. 1. 2. 1. 1. 3. 1. 2. 0. 1. 1. 1. 0. 2. 2. 0. 0. 0. 1. 1. 1. 2. 0. 1.
  0. 0. 0. 3. 4. 0. 0. 0. 0. 0. 1. 3.]
 [6. 0. 2. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 2. 2. 0. 0. 0. 1. 1. 3. 1. 1. 1.
  0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 1. 0. 0. 3. 1. 2. 0. 2. 1. 1. 0. 2. 2. 0. 0. 0. 1. 0. 1. 2. 0. 0.
  0. 0. 0. 3. 4. 0. 0. 0. 0. 0. 1. 3.]
 [0. 1. 2. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 2. 2. 0. 0. 0. 1. 0. 1. 1. 0. 1.
  0. 0. 0. 3. 4. 0. 0. 0. 0. 0. 1. 2.]
 [4. 0. 0. 1. 1. 1. 3. 1. 1. 1. 1. 1. 0. 2. 2. 0. 0. 0. 1. 1. 0. 3. 0. 0.
  0. 2. 1. 0. 4. 0. 0. 0. 0. 0. 0. 1.]]
Validate Set:
[[[ 3.  1.  1. ...  0.  1.  3.]
  [ 3.  0.  2. ...  0.  0.  2.]
  [ 0.  1.  2. ...  0.  1.  3.]
  [ 3.  0.  0. ...  0.  0.  1.]
  [ 5.  0.  2. ...  0.  0.  0.]]

 [[ 1.  1.  2. ...  0.  1.  3.]
  [ 5.  0.  2. ...  0.  0.  0.]
  [ 0.  1.  2. ...  0.  0.  2.]
  [ 4.  0.  0. ...  0.  0.  1.]
  [ 3.  1.  2. ...  0.  1.  3.]]

 [[ 6.  0.  2. ...  0.  0.  0.]
  [ 0.  1.  2. ...  0.  0.  2.]
  [ 0.  