In [500]:
import random
import os
import numpy as np

In [501]:
class dataset:
    def __init__(self, data_path: str, processed_flag: str):
        '''
        - INSTANTIATE ALL self VARIABLES IN THE INIT
        - take in the .data file, process it where we get a numpy array of strings where dimensions are as follows: self.intake_data[example][features]
        - MAKE SURE TO ADD EXTRACT FUNCTIONALITY FOR BOTH THE TUNING SET AND VALIDATION SET
        '''
        # FINN ADDS UP HERE
        self.intake_data = []
        self.tune_set = []
        self.validate_set = []
        self.ninety_data = []
        # CARLOS ADDS DOWN HERE

        # Data is being read in from original .DATA file
        if (processed_flag == False):
            # Separating the .data file into lines, and shuffling the lines
            with open(data_path, 'r') as file:
                lines = file.readlines()

            # Deliminate strings into lists
            for i in range(len(lines)):
                lines[i] = lines[i].strip()
                lines[i] = lines[i].split(',')
            
            # Make the list into a numpy array
            self.intake_data = np.array(lines)

        '''
        # Data is being extracted from a saved CSV File
        else:
            #extract_data()
        '''

    def continuize(self):
        '''
        This method takes in the indices that need to be continuized. This will look like replacing values that are strings with numbers.
        We want to make sure we call this method BEFORE we shuffle so that we do not have to keep track of which number corresponds to which
        original value. We can figure this out later
        '''
        string_to_int = {}
        next_int = 0
        # This function continuizes a single element so it can be vectorized
        def convert_to_num(value):
            nonlocal next_int
            try:
                # Try to convert to float
                return float(value)
            except ValueError:
                # If conversion fails, map the string a number
                if value not in string_to_int:
                    string_to_int[value] = next_int
                    next_int += 1
                return string_to_int[value]

        # Apply convert_to_num to each element in the array
        vectorization = np.vectorize(convert_to_num)
        self.intake_data = vectorization(self.intake_data)
        return
    def impute(self):
        # Replaces question marks in a dataset with a random value between the min/max of an attribute value
        # Breast cancer has a range of 1-10 for the attribute that is missing values
        for ex_idx in range(len(self.intake_data)):
            for att_idx in range(len(self.intake_data[ex_idx])):
                # if this statement is entered that means there is a missing piece of attribute data, so imputation needs to occur at this location
                if (self.intake_data[ex_idx][att_idx] == '?'):
                    # This will be the imputation method using range 1-10
                        self.intake_data[ex_idx][att_idx] = str(random.randint(1,10))
        return
    def shuffle(self):
        '''
        ONLY CALLED AFTER CONTINUIZING AND IMPUTING
        - This method will shuffle the self.intake_data by examples
        - Consider adding a flag where this can shuffle higher dimensional array (not explicitly necessary)
        '''
        np.random.shuffle(self.intake_data)
        return
    def sort(self, prediction_type_flag):
        '''
        - Sorts the data by its class/target value. We can assume all labels are the last indice of an example.
        - The prediction_type_flag essentially tells us if the last indice can be converted to a float or not. Regression datasets are sorted by value
        '''
        if prediction_type_flag == "regression":
            print('REGRESSION')
            sorted_data = self.intake_data[self.intake_data[:, -1].astype(float).argsort()]
        else:
            print("CLASSIFICATION")
            sorted_data = self.intake_data[self.intake_data[:, -1].argsort()]

        self.intake_data = sorted_data
        return
    def split(self):
        '''
        Puts the first 10% of the data into its own array (self.tune_set), then the remaining data (self.validate_set) into its own array.
        We should end up with two arrays, both are sorted and stratified. The validation still will need to be separated into partitions.
        '''
        tune_data = []

        for i, example in enumerate(self.intake_data):
            if(i % 10) == 0:
                tune_data.append(example)
            else:
                self.ninety_data.append(example)

        self.tune_set = np.array(tune_data)
        self.ninety_data = np.array(self.ninety_data)
        
        return
    def fold(self):
        '''
        This method folds self.validate_set into stratified partitions
        '''
        shape = (10, (len(self.ninety_data) // 10) + 1, len(self.ninety_data[0]))
        null_string = "null"
        self.validate_set = np.full(shape, null_string)
        fold_counts = np.zeros(10)

        for i, example in enumerate(self.ninety_data):
            fold_index = i % 10
            
            example_position = fold_counts[fold_index]  #This finds the next null example
            self.validate_set[fold_index, int(example_position)] = example

        
            fold_counts[fold_index] += 1
        return
    def shuffle_splits(self):
        '''
        Shuffles the tune set and validate set after they are complete and stratified
        '''
        np.random.shuffle(self.tune_set)
        for partition_idx, partition in enumerate(self.validate_set):
            np.random.shuffle(partition)
        return
    

    ## Don't worry about saving for now
    def save_validate_set(self, save_file_name, save_folder):
        # Saves the data based on our convention: Each line is a partition, semicolons separate examples, commas separate attributes/labels
        folder_path = os.path.expanduser(f"{save_folder}/processed_data_new")  
        os.makedirs(folder_path, exist_ok=True)
        #get/create the path to the folder that the file should be saved to
        file_path = os.path.join(folder_path, save_file_name)
        #create the file path
        with open(f"{file_path}.csv", "w") as file:
            #open a csv file in the desired location
            for line in self.partitions:
                partition_lines = ";".join([",".join(map(str, sub_array)) for sub_array in line])
                #for each partition, join each example by a semi colon and each attribute by a comma
                file.write(partition_lines + "\n")
                #write each partition into the file with each 
        #print(f"CSV file saved to {file_path}")
        return

In [502]:
user = 'carlthedog3'

abalone_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/abalone.data', False)
cancer_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/breast-cancer-wisconsin.data', False)
fire_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/forestfires.data', False)
glass_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/glass.data', False)
machine_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/machine.data', False)
soybean_data = dataset('/home/'+user+'/CSCI_447/Project_2/Datasets/soybean-small.data', False)

In [503]:
abalone_data.continuize()
abalone_data.shuffle()
abalone_data.sort('regression')
abalone_data.split()
abalone_data.fold()
abalone_data.shuffle_splits()

REGRESSION


In [504]:
cancer_data.impute()
cancer_data.shuffle()
cancer_data.sort('classification')
cancer_data.split()
cancer_data.fold()
cancer_data.shuffle_splits()

CLASSIFICATION


In [505]:
fire_data.continuize()
fire_data.shuffle()
fire_data.sort('regression')
fire_data.split()
fire_data.fold()
fire_data.shuffle_splits()

REGRESSION


In [506]:
glass_data.shuffle()
glass_data.sort('classification')
glass_data.split()
glass_data.fold()
glass_data.shuffle_splits()

CLASSIFICATION


In [507]:
machine_data.continuize()
machine_data.shuffle()
machine_data.sort('regression')
machine_data.split()
machine_data.fold()
machine_data.shuffle_splits()

REGRESSION


In [508]:
soybean_data.shuffle()
soybean_data.sort('classification')
soybean_data.split()
soybean_data.fold()
soybean_data.shuffle_splits()

CLASSIFICATION


In [509]:
print(f"Abalone Data:\nTune Set: {abalone_data.tune_set}\nValidate Set: {abalone_data.validate_set}\n\n")

Abalone Data:
Tune Set: [[ 2  0  0 ...  0  0  7]
 [ 0  0  0 ...  0  0  9]
 [ 1  0  0 ...  0  0 10]
 ...
 [ 1  0  0 ...  0  0 13]
 [ 2  0  0 ...  0  0  4]
 [ 1  0  0 ...  0  0 10]]
Validate Set: [[['2' '0' '0' ... '0' '0' '9']
  ['2' '0' '0' ... '0' '0' '7']
  ['2' '0' '0' ... '0' '0' '8']
  ...
  ['2' '0' '0' ... '0' '0' '8']
  ['1' '0' '0' ... '0' '0' '11']
  ['0' '0' '0' ... '0' '0' '11']]

 [['0' '0' '0' ... '0' '0' '10']
  ['2' '0' '0' ... '0' '0' '11']
  ['1' '0' '0' ... '0' '0' '9']
  ...
  ['0' '0' '0' ... '0' '0' '17']
  ['0' '0' '0' ... '0' '0' '9']
  ['1' '0' '0' ... '0' '0' '16']]

 [['1' '0' '0' ... '0' '0' '9']
  ['2' '0' '0' ... '0' '0' '7']
  ['1' '0' '0' ... '0' '0' '20']
  ...
  ['0' '0' '0' ... '0' '0' '11']
  ['1' '0' '0' ... '0' '0' '8']
  ['0' '0' '0' ... '0' '0' '15']]

 ...

 [['1' '0' '0' ... '0' '0' '10']
  ['0' '0' '0' ... '0' '0' '9']
  ['2' '0' '0' ... '0' '0' '8']
  ...
  ['1' '0' '0' ... '0' '0' '16']
  ['0' '0' '0' ... '0' '0' '10']
  ['2' '0' '0' ... '0'

In [510]:
print(f"Cancer Data:\nTune Set:\n{cancer_data.tune_set}\nValidate Set:\n{cancer_data.validate_set}\n\n")

Cancer Data:
Tune Set:
[['672113' '7' '5' '6' '10' '4' '10' '5' '3' '1' '4']
 ['1210963' '10' '10' '10' '8' '6' '8' '7' '10' '1' '4']
 ['1174009' '5' '1' '1' '2' '1' '1' '2' '1' '1' '2']
 ['1334015' '7' '8' '8' '7' '3' '10' '7' '2' '3' '4']
 ['557583' '5' '10' '10' '10' '10' '10' '10' '1' '1' '4']
 ['1155967' '5' '1' '2' '10' '4' '5' '2' '1' '1' '2']
 ['1071084' '3' '3' '2' '2' '3' '1' '1' '2' '3' '2']
 ['488173' '1' '4' '3' '10' '4' '10' '5' '6' '1' '4']
 ['1075123' '3' '1' '2' '1' '2' '1' '2' '1' '1' '2']
 ['1033078' '4' '2' '1' '1' '2' '1' '2' '1' '1' '2']
 ['1236043' '3' '3' '2' '1' '3' '1' '3' '6' '1' '2']
 ['1049837' '1' '1' '1' '1' '2' '1' '1' '1' '1' '2']
 ['1313325' '4' '10' '4' '7' '3' '10' '9' '10' '1' '4']
 ['1315506' '4' '8' '6' '3' '4' '10' '7' '1' '1' '4']
 ['826923' '1' '1' '1' '1' '2' '1' '1' '1' '1' '2']
 ['1293966' '4' '1' '1' '1' '2' '1' '1' '1' '1' '2']
 ['846423' '10' '6' '3' '6' '4' '10' '7' '8' '4' '4']
 ['1219406' '5' '1' '1' '1' '1' '1' '3' '1' '1' '2']
 ['111

In [511]:
print(f"Fire Data:\nTune Set:\n{fire_data.tune_set}\nValidate Set:\n{fire_data.validate_set}\n\n")

Fire Data:
Tune Set:
[[  3   3  21  17  92 102 751   8  24  27   3   0   6]
 [  5   4  21  14  92  99 745   9  12  64   3   0   1]
 [  2   5  25  17  91 104 474   9  18  53   1   0   0]
 [  1   3  13  20  87  52 103   5   8  72   3   0   0]
 [  7   4  19  18  91 142 601  10  20  39   5   0   2]
 [  1   4  21  26  92 119 783   7  16  28   4   0   7]
 [  4   4  19  14  94 167 684  13  21  53   3   0   6]
 [  5   4  21  14  94  85 692  15  20  47   4   0   1]
 [  4   4  29  20  85  25 349   2   4  21   8   0  22]
 [  2   4  19  18  93 235 723  10  20  66   4   0  15]
 [  6   5  13  18  90  37  83   7  12  54   3   0  12]
 [  4   3  21  26  92 137 706   9  27  24   2   0   0]
 [  1   2  21  18  93 149 728   8  25  36   3   0   0]
 [  4   4  19  16  95 141 605  17  19  71   7   0  46]
 [  2   5  21  14  90 290 855   7  16  58   3   0   9]
 [  6   3  21  26  92 119 783   7  18  34   7   0  34]
 [  8   6  19  20  91 103 638   5  23  22   2   0   0]
 [  9   4  24  17  90  61 252   9  24  50   

In [512]:
print(f"Glass Data:\nTune Set:\n{glass_data.tune_set}\nValidate Set:\n{glass_data.validate_set}\n\n")

Glass Data:
Tune Set:
[['9' '1.51918' '14.04' '3.58' '1.37' '72.08' '0.56' '8.30' '0.00' '0.00'
  '1']
 ['158' '1.52121' '14.03' '3.76' '0.58' '71.79' '0.11' '9.65' '0.00'
  '0.00' '3']
 ['70' '1.52300' '13.31' '3.58' '0.82' '71.99' '0.12' '10.17' '0.00'
  '0.03' '1']
 ['96' '1.51860' '13.36' '3.43' '1.43' '72.26' '0.51' '8.60' '0.00'
  '0.00' '2']
 ['115' '1.51847' '13.10' '3.97' '1.19' '72.44' '0.60' '8.43' '0.00'
  '0.00' '2']
 ['33' '1.51775' '12.85' '3.48' '1.23' '72.97' '0.61' '8.56' '0.09'
  '0.22' '1']
 ['176' '1.52119' '12.97' '0.33' '1.51' '73.39' '0.13' '11.27' '0.00'
  '0.28' '5']
 ['3' '1.51618' '13.53' '3.55' '1.54' '72.99' '0.39' '7.78' '0.00' '0.00'
  '1']
 ['94' '1.51590' '13.24' '3.34' '1.47' '73.10' '0.39' '8.22' '0.00'
  '0.00' '2']
 ['118' '1.51708' '13.72' '3.68' '1.81' '72.06' '0.64' '7.88' '0.00'
  '0.00' '2']
 ['134' '1.51800' '13.71' '3.93' '1.54' '71.81' '0.54' '8.21' '0.00'
  '0.15' '2']
 ['108' '1.53393' '12.30' '0.00' '1.00' '70.16' '0.12' '16.19' '0.00'
 

In [513]:
print(f"Machine Data:\nTune Set:\n{machine_data.tune_set}\nValidate Set:\n{machine_data.validate_set}\n\n")

Machine Data:
Tune Set:
[[  130  4446    50  2000 16000    24     1     6    70    82]
 [  140   154    40  8000 32000    64     8    24   277   266]
 [   87    88   330  1000  3000     0     2     4    16    23]
 [   72    73   125   512  1000     0     8    20    36    19]
 [  185     7   105  2000  8000    16     4    14    58    47]
 [   19    26   143  2300  6200     0     6    64    61    40]
 [  130  4443    50  2000  8000     8     3     6    45    44]
 [   86   300   300   768  3000     6     6    24    44    25]
 [   66    68   800   256  8000     0     1     4    14    34]
 [   18  5000   350    64    64     0     1     4    10    15]
 [   74    75    75  2000  8000    64     1    38   144    75]
 [   74    76    75  2000 16000    64     1    38   144   113]
 [   28    33   320   512  5000     4     1     5    77    28]
 [  101   119   225  2000  4000     8     3     6    34    31]
 [    2     6    29  8000 16000    32     8    16   132   132]
 [  174   177   200  2000  8000

In [514]:
print(f"Soybean Data:\nTune Set:\n{soybean_data.tune_set}\nValidate Set:\n{soybean_data.validate_set}\n\n")

Soybean Data:
Tune Set:
[['6' '0' '2' '1' '0' '3' '0' '1' '1' '1' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '3' '1' '1' '1' '0' '0' '0' '0' '4' '0' '0' '0' '0' '0' '0'
  'D1']
 ['0' '1' '2' '0' '0' '0' '1' '1' '0' '1' '1' '0' '0' '2' '2' '0' '0' '0'
  '1' '0' '1' '1' '0' '1' '0' '0' '0' '3' '4' '0' '0' '0' '0' '0' '1'
  'D3']
 ['1' '1' '2' '1' '0' '0' '1' '2' '1' '1' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '2' '2' '0' '0' '0' '0' '0' '3' '4' '0' '0' '0' '0' '0' '1'
  'D4']
 ['2' '1' '2' '0' '0' '1' '1' '2' '0' '0' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '0' '1' '2' '0' '0' '0' '0' '0' '3' '4' '0' '0' '0' '0' '0' '1'
  'D4']
 ['6' '0' '0' '2' '1' '0' '2' '1' '0' '0' '1' '1' '0' '2' '2' '0' '0' '0'
  '1' '1' '0' '3' '0' '0' '0' '2' '1' '0' '4' '0' '0' '0' '0' '0' '0'
  'D2']]
Validate Set:
[[['3' '0' '2' ... '0' '0' 'D1']
  ['1' '1' '2' ... '0' '1' 'D4']
  ['4' '0' '0' ... '0' '0' 'D2']
  ['2' '1' '2' ... '0' '0' 'D3']
  ['3' '1' '2' ... '0' '1' 'D4']]

 [['4' '0' '0' ... '0' '0' 'D2']
 