In [16]:
import numpy as np
import pandas as pd

class Preprocessor():
    """Imports a data file, cleans its missing values and encodes its worded
       (non-numeric) columns.

       The final column of the data is treated as the dependent variable: it is
       never encoded and is always kept as the last column of the result."""

    def __init__(self, file_directory: str, not_number: list, filetype: str = "csv", 
                 encodetype: str = "label", replaceNAN: bool = True,
                 drop_columns: tuple = ("Compound",)):
        """Init function
            Inputs: 
            file_directory: directory of the file relative to script.
            not_number: list of bools, one per column (in column order, counted
            after drop_columns have been removed). True marks a column whose values
            are not numbers, False marks a numeric column.
            filetype: type of file to be imported (defaulted as csv, the only
            type ImportAsDf can read).
            encodetype: type of encoder to be used (ONLY onehot or label). Defaulted to label
            replaceNAN: true if all N/A values in numeric columns should be replaced by
            the mean of the column, false if rows containing N/A should be omitted
            (defaulted to true).
            drop_columns: labels of columns removed before cleaning (defaulted to
            the "Compound" column). A single label may be given as a plain string.
            
            Returns: None
            Raises: AssertionError if encodetype is neither onehot nor label."""
        
        # add attributes
        self.dir = file_directory
        self.boolmap = not_number
        self.filetype = filetype
        self.replaceNAN = replaceNAN
        # a bare string would otherwise be split into single characters by list()
        self.drop_columns = [drop_columns] if isinstance(drop_columns, str) else list(drop_columns)
        
        # make sure encodetype is either onehot or label. Raised explicitly (rather than
        # with an assert statement) so the check still runs under "python -O".
        if encodetype not in ("onehot", "label"):
            raise AssertionError(
                f"Unexpected encode type '{encodetype}'. Preprocessor only takes onehot or label encoding")
        
        #add encoder attribute
        self.encodetype = encodetype
        

    def Process(self) -> pd.DataFrame:
        """Calls all other functions within class and runs until a 
            clean, encoded Dataframe is returned to the user
            Inputs: 
            self: contains attributes (refer to notes at __init__)
            
            Returns: pandas dataframe of fully preprocessed data
            Raises: ValueError if the file type cannot be imported."""
        
        # first we import the file:
        raw_data = self.ImportAsDf()
        # ImportAsDf signals an unsupported file type with None; fail here with a clear message
        if raw_data is None:
            raise ValueError(
                f"Unsupported file type '{self.filetype}'. Preprocessor can only import csv files")
        # we then clean the data:
        clean_data, worded_columns = self.CleanData(raw_data)
        # finally, we encode the clean data
        return self.Encode(clean_data, worded_columns)
    
    
    def ImportAsDf(self) -> pd.DataFrame:
        """Grabs file and imports it as a pandas Dataframe.
            Inputs: 
            self: contains attributes (refer to notes at __init__).

            Returns: pandas dataframe of the raw data, or None when
            self.filetype is not "csv"."""

        if self.filetype == "csv":
            return pd.read_csv(self.dir)
        return None

    
    def CleanData(self, data: pd.DataFrame) -> tuple:
        """Goes through the data and removes/replaces any N/A values, as well as
           returning any worded columns to be encoded.
            Inputs: 
            self: contains attributes (refer to notes at __init__).
            data: data to be cleaned (not modified; a cleaned copy is returned).

            Returns: tuple: 
                   0: pandas dataframe of the cleaned data (index reset to 0..n-1,
                      numeric columns converted to numeric dtype)
                   1: list of all worded columns to be encoded
            Raises: 
            KeyError if a column in drop_columns is missing from the data.
            IndexError if not_number has fewer entries than there are columns.
            ValueError if a column marked numeric holds a non-numeric value."""

        #reset index of data 
        data = data.reset_index(drop=True)
        #remove unnecessary columns
        data = data.drop(self.drop_columns, axis=1)
        #replace all "-" with NAN
        data = data.replace("-", np.nan)

        #get two lists of columns split on whether their values are numbers or not:
        columns = list(data.columns)
        if len(self.boolmap) < len(columns):
            raise IndexError(
                f"not_number has {len(self.boolmap)} entries but the data has {len(columns)} columns")
        numerated_columns = [column for column, worded in zip(columns, self.boolmap) if not worded]
        worded_columns = [column for column, worded in zip(columns, self.boolmap) if worded]

        # rows with N/A in a worded column are removed, since the mean method will not work.
        # All worded columns are checked in a single pass so that row positions can never
        # drift out of step with index labels between columns.
        if worded_columns:
            data = data.dropna(subset=worded_columns).reset_index(drop=True)

        # numeric columns may have been read as text (e.g. because of "-" placeholders),
        # so convert them to real numbers, then optionally fill N/A with the column mean
        for column in numerated_columns:
            numeric = pd.to_numeric(data[column])
            if self.replaceNAN:
                numeric = numeric.fillna(numeric.mean())
            data[column] = numeric

        # remove rows with remaining N/A values if replace is set to false:
        if not self.replaceNAN:
            data = data.dropna().reset_index(drop=True)

        return (data, worded_columns)

    
    def Encode(self, data: pd.DataFrame, encode_columns: list) -> pd.DataFrame:
        """Encodes columns either using a label encoder or a onehot encoder.
            The final column of data is the dependent variable: it is never encoded
            and remains the final column of the result.
            Inputs:
            self: contains attributes (refer to notes at __init__)
            data: data to be encoded (modified in place).
            encode_columns: list of columns by label to be encoded
            
            Returns:
            Pandas dataframe of the encoded data (the same object as data)"""
        
        # exclude final column from being encoded since it is the dependant variable
        target = data.columns[-1] if len(data.columns) > 0 else None
        encode_columns = [column for column in encode_columns if column != target]
        
        #check if onehot or label encoder is being used
        if self.encodetype == "onehot":
            for column in encode_columns:
                # go through each distinct category and make a new column with binary values 0 or 1
                for category in sorted(set(data[column])):
                    data[f"{column}_{category}"] = (data[column] == category).astype("int64")
                
                #drop column
                data.drop(column, axis=1, inplace=True)
            
            # the new indicator columns were appended after the dependent variable;
            # move it back to the end so it is still the final column
            if target is not None and data.columns[-1] != target:
                data[target] = data.pop(target)
                
            return data
        
        # must be label due to the check in __init__
        for column in encode_columns:
            # map the sorted distinct categories to integers 0..k-1:
            categories = sorted(set(data[column]))
            category_map = {category: code for code, category in enumerate(categories)}
            
            # apply value using the map
            data[column] = data[column].map(category_map)
        
        return data
    

    
    
def SplitData(data: pd.DataFrame, train_ratio: float = 0.1) -> tuple:
    """Cuts the data into a leading train part and a trailing test part,
        separating the final column (dependent variable) from the rest.
        Rows are taken in their existing order; nothing is shuffled.
        Inputs:
        data: data to be split
        train_ratio: fraction of the rows placed in the train part; the row
        count is truncated to an integer (defaulted to 10% train with 90% test data)
        
        Returns: tuple:
                 0: pandas DataFrame of train data for independent variables.
                 1: pandas DataFrame of test data for independent variables.
                 2: pandas Series of train data for dependent variable.
                 3: pandas Series of test data for dependent variable.
        Each returned object has its index reset to start at 0."""
    
    # number of leading rows that belong to the train part
    cut = int(train_ratio * len(data.index))
    
    # every column but the last is an independent variable; the last is the dependent one
    names = list(data.columns)
    target_name = names[-1]
    feature_names = names[:-1]
    
    features = data[feature_names]
    target = data[target_name]
    
    # slice each part at the cut and renumber its rows from 0
    return (
        features[:cut].reset_index(drop=True),
        features[cut:].reset_index(drop=True),
        target[:cut].reset_index(drop=True),
        target[cut:].reset_index(drop=True),
    )



# One flag per column left after the "Compound" column is dropped:
# True marks a non-numeric (worded) column, False marks a numeric column.
# Layout: three worded columns, thirteen numeric columns, one worded final column.
classmap = [True] * 3 + [False] * 13 + [True]

file_directory = "Crystal_structure.csv"

# import, clean and encode the file in one go
clean = Preprocessor(file_directory=file_directory, not_number=classmap, replaceNAN=True).Process()

# split into train/test parts using SplitData's default train_ratio
X_train, X_test, y_train, y_test = SplitData(data=clean)

# bare expression: shows the training features when run as a notebook cell
X_train

Unnamed: 0,A,B,In literature,v(A),v(B),r(AXII)(Å),r(AVI)(Å),r(BVI)(Å),EN(A),EN(B),l(A-O)(Å),l(B-O)(Å),ΔENR,tG,τ,μ
0,0,0,0,0,0,1.12,1.12,1.12,1.10,1.10,0.000000,0.000000,-3.248000,0.707107,2.503861,0.800000
1,0,1,0,0,0,1.12,1.12,0.95,1.10,1.93,0.000000,2.488353,-2.565071,0.758259,2.503861,0.678571
2,0,2,0,0,0,1.12,1.12,0.54,1.10,1.61,0.000000,1.892894,-1.846714,0.918510,2.503861,0.385714
3,0,3,0,0,0,1.12,1.12,0.52,1.10,2.18,0.000000,1.932227,-1.577429,0.928078,2.503861,0.371429
4,0,4,0,0,0,1.12,1.12,0.93,1.10,2.54,0.000000,2.313698,-2.279786,0.764768,2.503861,0.664286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,7,11,0,2,4,0.45,0.45,0.87,1.57,1.12,2.178342,2.268642,-2.022750,0.576276,-3.959996282,0.621429
523,7,12,0,2,4,0.45,0.45,0.53,1.57,1.88,2.178342,1.930311,-1.255821,0.677797,-11.73628918,0.378571
524,7,13,0,2,4,0.45,0.45,0.55,1.57,1.66,2.178342,1.960053,-1.329107,0.670845,-9.609017798,0.392857
525,7,14,0,1.609165,2.266492,0.45,0.45,1.67,1.57,0.79,2.178342,3.009747,-3.592929,0.426107,2.503861,1.192857
