In [101]:
import random
import os
import numpy as np

In [102]:
import os
import numpy as np
import random
import csv

class dataset:
    def __init__(self, data_path: str, processed_flag: str):
        '''
        - INSTANTIATE ALL self VARIABLES IN THE INIT
        - take in the .data file, process it where we get a numpy array of strings where dimensions are as follows: self.intake_data[example][features]
        - MAKE SURE TO ADD EXTRACT FUNCTIONALITY FOR BOTH THE TUNING SET AND VALIDATION SET
        '''
        # FINN ADDS UP HERE
        self.intake_data = []
        self.tune_set = []
        self.validate_set = []
        self.ninety_data = []
        # CARLOS ADDS DOWN HERE

        # Data is being read in from original .DATA file
        if (processed_flag == False):
            # Separating the .data file into lines, and shuffling the lines
            with open(data_path, 'r') as file:
                lines = file.readlines()

            # Deliminate strings into lists
            for i in range(len(lines)):
                lines[i] = lines[i].strip()
                lines[i] = lines[i].split(',')
            
            # Make the list into a numpy array
            self.intake_data = np.array(lines)

        '''
        # Data is being extracted from a saved CSV File
        else:
            #extract_data()
        '''

    def continuize(self):
        '''
        This method takes in the indices that need to be continuized. This will look like replacing values that are strings with numbers.
        We want to make sure we call this method BEFORE we shuffle so that we do not have to keep track of which number corresponds to which
        original value. We can figure this out later
        '''
        string_to_int = {}
        next_int = 0
        # This function continuizes a single element so it can be vectorized
        def convert_to_num(value):
            nonlocal next_int
            try:
                # Try to convert to float
                return float(value)
            except ValueError:
                # If conversion fails, map the string a number
                if value not in string_to_int:
                    string_to_int[value] = next_int
                    next_int += 1
                return string_to_int[value]

        # Apply convert_to_num to each element in the array
        vectorization = np.vectorize(convert_to_num, otypes=[float])
        self.intake_data = vectorization(self.intake_data)
        return
    def impute(self):
        # Replaces question marks in a dataset with a random value between the min/max of an attribute value
        # Breast cancer has a range of 1-10 for the attribute that is missing values
        for ex_idx in range(len(self.intake_data)):
            for att_idx in range(len(self.intake_data[ex_idx])):
                # if this statement is entered that means there is a missing piece of attribute data, so imputation needs to occur at this location
                if (self.intake_data[ex_idx][att_idx] == '?'):
                    # This will be the imputation method using range 1-10
                        self.intake_data[ex_idx][att_idx] = str(random.randint(1,10))
        return
    def shuffle(self):
        '''
        ONLY CALLED AFTER CONTINUIZING AND IMPUTING
        - This method will shuffle the self.intake_data by examples
        - Consider adding a flag where this can shuffle higher dimensional array (not explicitly necessary)
        '''
        np.random.shuffle(self.intake_data)
        return
    def sort(self, prediction_type_flag):
        '''
        - Sorts the data by its class/target value. We can assume all labels are the last indice of an example.
        - The prediction_type_flag essentially tells us if the last indice can be converted to a float or not. Regression datasets are sorted by value
        '''
        if prediction_type_flag == "regression":
            #print('REGRESSION')
            sorted_data = self.intake_data[self.intake_data[:, -1].astype(np.float32).argsort()]
        else:
            #print("CLASSIFICATION")
            sorted_data = self.intake_data[self.intake_data[:, -1].argsort()]

        self.intake_data = sorted_data
        return
    def split(self):
        '''
        Puts the first 10% of the data into its own array (self.tune_set), then the remaining data (self.validate_set) into its own array.
        We should end up with two arrays, both are sorted and stratified. The validation still will need to be separated into partitions.
        '''
        tune_data = []

        for i, example in enumerate(self.intake_data):
            if(i % 10) == 0:
                tune_data.append(example)
            else:
                self.ninety_data.append(example)

        self.tune_set = np.array(tune_data)
        self.ninety_data = np.array(self.ninety_data)
        
        return
    def fold(self):
        '''
        This method folds self.validate_set into stratified partitions
        '''
        shape = (10, (len(self.ninety_data) // 10) + 1, len(self.ninety_data[0]))
        null_string = "null"
        self.validate_set = np.full(shape, null_string)
        fold_counts = np.zeros(10)

        for i, example in enumerate(self.ninety_data):
            fold_index = i % 10
            
            example_position = fold_counts[fold_index]  #This finds the next null example
            self.validate_set[fold_index, int(example_position)] = example

        
            fold_counts[fold_index] += 1
        return
    def shuffle_splits(self):
        '''
        Shuffles the tune set and validate set after they are complete and stratified
        '''
        np.random.shuffle(self.tune_set)
        for partition_idx, partition in enumerate(self.validate_set):
            np.random.shuffle(partition)
        return
    
    def remove_attribute(self, indice=0):
        # Takes in an attribute indice, and removes that entire indice from the dataset. This can be used to remove ID numbers
        self.intake_data = np.delete(self.intake_data, indice, 1)

    
    def save(self, filename: str):
        """
        Saves a 2D or 3D numpy array (full of strings) to a CSV file.
        """
        folder_path = os.path.expanduser(f"~/CSCI_447/Project_2/Datasets/processed_data")  
        os.makedirs(folder_path, exist_ok=True)
        #get/create the path to the folder that the file should be saved to
        tune_file_path = os.path.join(folder_path, (filename+'_tune_set'))
        validate_file_path = os.path.join(folder_path, (filename+'_validate_set'))

        shape_info = None
        with open(tune_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            # Write shape information if it is a 3D array
            if shape_info:
                writer.writerow(["shape"] + list(shape_info))
            # Write data
            writer.writerows(self.tune_set)

        reshaped_array = np.array([[';'.join(row) for row in batch] for batch in self.validate_set])
        shape_info = self.validate_set.shape
        with open(validate_file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            # Write shape information if it is a 3D array
            if shape_info:
                writer.writerow(["shape"] + list(shape_info))
            # Write data
            writer.writerows(reshaped_array)

    def extract(self, file_path: str):
        """
        Loads data from a CSV file and converts it back to a numpy array in the original format.
        """
        tune_file_path = file_path+'_tune_set.csv'
        validate_file_path = file_path+'_validate_set.csv'

        with open(tune_file_path, mode='r') as file:
            reader = csv.reader(file)
            rows = list(reader)
        self.tune_set = np.array(rows, dtype=str)


        with open(validate_file_path, mode='r') as file:
            reader = csv.reader(file)
            rows = list(reader)
        shape_info = tuple(map(int, rows[0][1:]))
        data = rows[1:]
        # Split semicolon-delimited strings back into lists for the third dimension
        reconstructed_data = [[cell.split(';') for cell in row] for row in data]
        self.validate_set = np.array(reconstructed_data, dtype=str).reshape(shape_info)


In [103]:
user = 'carlthedog3'

abalone_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/abalone.data', False)
cancer_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/breast-cancer-wisconsin.data', False)
fire_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/forestfires.data', False)
glass_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/glass.data', False)
machine_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/machine.data', False)
soybean_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/soybean-small.data', False)

In [104]:
abalone_data.continuize()
print(abalone_data.intake_data)
abalone_data.shuffle()
abalone_data.sort('regression')
abalone_data.split()
abalone_data.fold()
abalone_data.shuffle_splits()

[[ 0.      0.455   0.365  ...  0.101   0.15   15.    ]
 [ 0.      0.35    0.265  ...  0.0485  0.07    7.    ]
 [ 1.      0.53    0.42   ...  0.1415  0.21    9.    ]
 ...
 [ 0.      0.6     0.475  ...  0.2875  0.308   9.    ]
 [ 1.      0.625   0.485  ...  0.261   0.296  10.    ]
 [ 0.      0.71    0.555  ...  0.3765  0.495  12.    ]]


In [105]:
cancer_data.impute()
cancer_data.shuffle()
cancer_data.sort('classification')
cancer_data.split()
cancer_data.fold()
cancer_data.shuffle_splits()

In [106]:
fire_data.continuize()
fire_data.shuffle()
fire_data.sort('regression')
fire_data.split()
fire_data.fold()
fire_data.shuffle_splits()

In [107]:
glass_data.shuffle()
glass_data.sort('classification')
glass_data.split()
glass_data.fold()
glass_data.shuffle_splits()

In [108]:
machine_data.continuize()
machine_data.shuffle()
machine_data.sort('regression')
machine_data.split()
machine_data.fold()
machine_data.shuffle_splits()

In [109]:
soybean_data.shuffle()
soybean_data.sort('classification')
soybean_data.split()
soybean_data.fold()
soybean_data.shuffle_splits()

In [110]:
abalone_data.save('abalone')
cancer_data.save('cancer')
fire_data.save('fire')
glass_data.save('glass')
machine_data.save('machine')
soybean_data.save('soybean')

In [111]:
print(f"Abalone Data:\nTune Set: {abalone_data.tune_set}\nValidate Set: {abalone_data.validate_set}\n\n")

Abalone Data:
Tune Set: [[0.     0.49   0.395  ... 0.1235 0.185  9.    ]
 [0.     0.415  0.315  ... 0.087  0.097  8.    ]
 [0.     0.275  0.2    ... 0.024  0.03   5.    ]
 ...
 [0.     0.64   0.5    ... 0.3575 0.354  9.    ]
 [0.     0.625  0.48   ... 0.2625 0.2785 9.    ]
 [2.     0.375  0.28   ... 0.06   0.055  6.    ]]
Validate Set: [[['2.0' '0.52' '0.4' ... '0.16' '0.25' '10.0']
  ['0.0' '0.56' '0.44' ... '0.21' '0.24' '9.0']
  ['2.0' '0.43' '0.32' ... '0.08' '0.09' '6.0']
  ...
  ['0.0' '0.61' '0.49' ... '0.20' '0.36' '23.0']
  ['1.0' '0.55' '0.44' ... '0.19' '0.18' '8.0']
  ['0.0' '0.70' '0.55' ... '0.32' '0.46' '11.0']]

 [['2.0' '0.3' '0.23' ... '0.02' '0.04' '8.0']
  ['0.0' '0.65' '0.49' ... '0.31' '0.37' '9.0']
  ['2.0' '0.33' '0.25' ... '0.03' '0.04' '7.0']
  ...
  ['0.0' '0.47' '0.36' ... '0.09' '0.11' '7.0']
  ['0.0' '0.57' '0.45' ... '0.16' '0.25' '8.0']
  ['1.0' '0.65' '0.5' ... '0.31' '0.28' '12.0']]

 [['0.0' '0.57' '0.41' ... '0.19' '0.23' '13.0']
  ['0.0' '0.62' '0.5

In [112]:
print(f"Cancer Data:\nTune Set:\n{cancer_data.tune_set}\nValidate Set:\n{cancer_data.validate_set}\n\n")

Cancer Data:
Tune Set:
[['529329' '10' '10' '10' '10' '10' '10' '4' '10' '10' '4']
 ['1186936' '2' '1' '3' '2' '2' '1' '2' '1' '1' '2']
 ['1201834' '2' '1' '1' '1' '2' '1' '3' '1' '1' '2']
 ['616240' '5' '3' '4' '3' '4' '5' '4' '7' '1' '2']
 ['1140597' '7' '1' '2' '3' '2' '1' '2' '1' '1' '2']
 ['814265' '2' '1' '1' '1' '2' '1' '1' '1' '1' '2']
 ['1180523' '3' '1' '1' '1' '2' '1' '2' '2' '1' '2']
 ['1239967' '1' '1' '1' '1' '2' '1' '2' '1' '1' '2']
 ['1182404' '4' '2' '1' '1' '2' '1' '1' '1' '1' '2']
 ['492268' '10' '4' '6' '1' '2' '10' '5' '3' '1' '4']
 ['1219525' '8' '10' '10' '10' '5' '10' '8' '10' '6' '4']
 ['1073836' '5' '1' '1' '1' '2' '1' '2' '1' '1' '2']
 ['855524' '1' '1' '1' '1' '2' '1' '2' '1' '1' '2']
 ['785208' '5' '4' '6' '6' '4' '10' '4' '3' '1' '4']
 ['896404' '2' '1' '1' '1' '2' '1' '3' '1' '1' '2']
 ['1216947' '1' '1' '1' '1' '2' '1' '3' '1' '1' '2']
 ['1321942' '5' '1' '1' '1' '2' '1' '3' '1' '1' '2']
 ['1114570' '2' '1' '1' '1' '2' '1' '2' '2' '1' '2']
 ['672113' '7'

In [113]:
print(f"Fire Data:\nTune Set:\n{fire_data.tune_set}\nValidate Set:\n{fire_data.validate_set}\n\n")

Fire Data:
Tune Set:
[[9.0000e+00 4.0000e+00 2.4000e+01 1.7000e+01 9.0500e+01 6.1100e+01
  2.5260e+02 9.4000e+00 2.4500e+01 5.0000e+01 3.1000e+00 0.0000e+00
  7.0320e+01]
 [9.0000e+00 5.0000e+00 2.4000e+01 2.2000e+01 9.3300e+01 4.9500e+01
  2.9770e+02 1.4000e+01 2.8000e+01 3.4000e+01 4.5000e+00 0.0000e+00
  0.0000e+00]
 [1.0000e+00 4.0000e+00 2.1000e+01 1.8000e+01 9.1000e+01 2.7630e+02
  8.2510e+02 7.1000e+00 1.4500e+01 7.6000e+01 7.6000e+00 0.0000e+00
  3.7100e+00]
 [3.0000e+00 4.0000e+00 2.1000e+01 1.4000e+01 9.2100e+01 9.9000e+01
  7.4530e+02 9.6000e+00 1.7400e+01 5.7000e+01 4.5000e+00 0.0000e+00
  0.0000e+00]
 [8.0000e+00 5.0000e+00 1.9000e+01 2.2000e+01 9.3100e+01 1.5730e+02
  6.6670e+02 1.3500e+01 2.4000e+01 3.6000e+01 3.1000e+00 0.0000e+00
  2.4000e-01]
 [1.0000e+00 4.0000e+00 2.1000e+01 2.0000e+01 9.1500e+01 1.3010e+02
  8.0710e+02 7.5000e+00 2.1300e+01 3.5000e+01 2.2000e+00 0.0000e+00
  2.8190e+01]
 [1.0000e+00 4.0000e+00 2.5000e+01 1.6000e+01 9.2300e+01 9.6200e+01
  4.5020e+0

In [114]:
print(f"Glass Data:\nTune Set:\n{glass_data.tune_set}\nValidate Set:\n{glass_data.validate_set}\n\n")

Glass Data:
Tune Set:
[['67' '1.52152' '13.05' '3.65' '0.87' '72.22' '0.19' '9.85' '0.00'
  '0.17' '1']
 ['90' '1.51640' '12.55' '3.48' '1.87' '73.23' '0.63' '8.08' '0.00'
  '0.09' '2']
 ['164' '1.51514' '14.01' '2.68' '3.50' '69.89' '1.68' '5.87' '2.20'
  '0.00' '5']
 ['148' '1.51610' '13.33' '3.53' '1.34' '72.67' '0.56' '8.33' '0.00'
  '0.00' '3']
 ['137' '1.51806' '13.00' '3.80' '1.08' '73.07' '0.56' '8.38' '0.00'
  '0.12' '2']
 ['19' '1.51911' '13.90' '3.73' '1.18' '72.12' '0.06' '8.89' '0.00'
  '0.00' '1']
 ['73' '1.51593' '13.09' '3.59' '1.52' '73.10' '0.67' '7.83' '0.00'
  '0.00' '2']
 ['89' '1.51618' '13.01' '3.50' '1.48' '72.89' '0.60' '8.12' '0.00'
  '0.00' '2']
 ['211' '1.51685' '14.92' '0.00' '1.99' '73.06' '0.00' '8.40' '1.59'
  '0.00' '7']
 ['87' '1.51569' '13.24' '3.49' '1.47' '73.25' '0.38' '8.03' '0.00'
  '0.00' '2']
 ['187' '1.51838' '14.32' '3.26' '2.22' '71.25' '1.46' '5.79' '1.63'
  '0.00' '7']
 ['131' '1.52177' '13.75' '1.01' '1.36' '72.19' '0.33' '11.14' '0.00'
 

In [115]:
print(f"Machine Data:\nTune Set:\n{machine_data.tune_set}\nValidate Set:\n{machine_data.validate_set}\n\n")

Machine Data:
Tune Set:
[[3.500e+01 4.400e+01 5.000e+01 2.000e+03 8.000e+03 8.000e+00 1.000e+00
  5.000e+00 7.100e+01 4.400e+01]
 [1.010e+02 1.190e+02 2.250e+02 2.000e+03 4.000e+03 8.000e+00 3.000e+00
  6.000e+00 3.400e+01 3.100e+01]
 [1.010e+02 1.060e+02 2.600e+01 8.000e+03 1.600e+04 0.000e+00 8.000e+00
  1.600e+01 1.850e+02 1.130e+02]
 [3.500e+01 4.200e+01 5.000e+01 5.000e+02 2.000e+03 8.000e+00 1.000e+00
  4.000e+00 2.000e+01 2.300e+01]
 [8.600e+01 3.000e+02 3.000e+02 7.680e+02 3.000e+03 6.000e+00 6.000e+00
  2.400e+01 4.400e+01 2.500e+01]
 [1.900e+01 2.600e+01 1.430e+02 2.300e+03 6.200e+03 0.000e+00 6.000e+00
  6.400e+01 6.100e+01 4.000e+01]
 [1.800e+01 5.000e+03 3.500e+02 6.400e+01 6.400e+01 0.000e+00 1.000e+00
  4.000e+00 1.000e+01 1.500e+01]
 [5.100e+01 5.200e+01 1.330e+02 1.000e+03 1.200e+04 9.000e+00 3.000e+00
  1.200e+01 7.200e+01 5.400e+01]
 [1.010e+02 1.290e+02 1.500e+03 7.680e+02 2.000e+03 0.000e+00 0.000e+00
  0.000e+00 1.800e+01 2.000e+01]
 [2.000e+00 6.000e+00 2.900e+01

In [116]:
print(f"Soybean Data:\nTune Set:\n{soybean_data.tune_set}\nValidate Set:\n{soybean_data.validate_set}\n\n")

Soybean Data:
Tune Set:
[['0' '1' '2' '0' '0' '1' '1' '1' '1' '1' '1' '0' '0' '2' '2' '0' '0' '0'
  '1' '0' '1' '1' '0' '1' '1' '0' '0' '3' '4' '0' '0' '0' '0' '0' '0'
  'D3']
 ['0' '1' '2' '1' '0' '3' '1' '1' '0' '0' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '1' '2' '0' '0' '0' '0' '0' '3' '4' '0' '0' '0' '0' '0' '1'
  'D4']
 ['4' '0' '0' '1' '0' '2' '3' '1' '1' '1' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '0' '3' '0' '0' '0' '2' '1' '0' '4' '0' '0' '0' '0' '0' '0'
  'D2']
 ['3' '1' '1' '0' '0' '2' '1' '2' '1' '2' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '2' '2' '0' '0' '0' '0' '0' '3' '4' '0' '0' '0' '0' '0' '1'
  'D4']
 ['4' '0' '2' '1' '0' '3' '0' '2' '0' '2' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '3' '1' '1' '1' '0' '0' '0' '0' '4' '0' '0' '0' '0' '0' '0'
  'D1']]
Validate Set:
[[['0' '1' '2' ... '0' '0' 'D3']
  ['1' '1' '2' ... '0' '1' 'D4']
  ['2' '1' '2' ... '0' '1' 'D4']
  ['6' '0' '2' ... '0' '0' 'D1']
  ['6' '0' '0' ... '0' '0' 'D2']]

 [['2' '1' '2' ... '0' '0' 'D3']
 