In [52]:
import numpy as np
import pandas as pd

class Preprocessor():
    """Import a data file, clean its missing values and encode its worded columns.

    The pipeline is: ImportAsDf -> CleanData -> Encode, and Process() runs all
    three in that order. CleanData always drops a column named "Compound" and
    treats the string "-" as a missing value.
    """

    def __init__(self, file_directory: str, not_number: list, filetype: str = "csv",
                 encodetype: str = "label", replaceNAN: bool = True):
        """Init function
            Inputs:
            file_directory: directory of the file relative to script.
            not_number: list of bools, one per column (in column order, after the
            "Compound" column has been dropped); True if the values of the column
            are not numbers.
            filetype: type of file to be imported (defaulted as csv).
            encodetype: type of encoder to be used (ONLY onehot or label). Defaulted to label
            replaceNAN: true if all N/A values in numbered columns should be replaced by
            the mean of the column, false if rows containing N/A should be omitted
            (defaulted to true).

            Returns: None

            Raises: AssertionError if encodetype is neither "onehot" nor "label"."""

        # make sure encodetype is either onehot or label. Raised explicitly (rather than
        # with an assert statement) so the check still runs under `python -O`.
        if encodetype not in ("onehot", "label"):
            raise AssertionError(
                f"Unexpected encode type '{encodetype}'. Preprocessor only takes onehot or label encoding"
            )

        # add attributes
        self.dir = file_directory
        self.boolmap = not_number
        self.filetype = filetype
        self.replaceNAN = replaceNAN
        self.encodetype = encodetype

    def Process(self) -> pd.DataFrame:
        """Calls all other functions within class and runs until a
            clean, encoded Dataframe is returned to the user
            Inputs:
            self: contains attributes (refer to notes at __init__)

            Returns: pandas dataframe of fully preprocessed data

            Raises: ValueError if the configured filetype cannot be imported."""

        # first we import the file:
        raw_data = self.ImportAsDf()
        if raw_data is None:
            # fail here with a clear message instead of crashing inside CleanData
            raise ValueError(
                f"Unsupported file type '{self.filetype}'. Preprocessor can only import csv files"
            )

        # we then clean the data:
        clean_data, worded_columns = self.CleanData(raw_data)

        # finally, we encode the clean data
        return self.Encode(clean_data, worded_columns)

    def ImportAsDf(self) -> pd.DataFrame:
        """Grabs file and imports it as a pandas Dataframe.
            Inputs:
            self: contains attributes (refer to notes at __init__).

            Returns: pandas dataframe of the raw data, or None when the
            filetype is anything other than "csv"."""

        if self.filetype == "csv":
            return pd.read_csv(self.dir)
        return None

    def CleanData(self, data: pd.DataFrame) -> tuple:
        """Goes through the data and removes/replaces any N/A values, as well as
           returning any worded columns to be encoded.
            Inputs:
            self: contains attributes (refer to notes at __init__).
            data: data to be cleaned (not modified; a cleaned copy is returned).

            Returns:
            Tuple: 0: pandas dataframe of the cleaned data
                   1: list of all worded columns to be encoded

            Raises:
            KeyError if data has no "Compound" column.
            IndexError if not_number has fewer entries than there are columns.
            ValueError if a numbered column holds a value that is not a number."""

        # reset index of data (this also gives us a new frame, leaving the input untouched)
        data = data.reset_index(drop=True)
        # remove unnecessary columns
        data = data.drop("Compound", axis=1)
        # replace all "-" with NAN
        data = data.replace("-", np.nan)

        columns = list(data.columns)
        if len(self.boolmap) < len(columns):
            raise IndexError(
                f"not_number has {len(self.boolmap)} entries but the data has {len(columns)} columns"
            )

        # get two lists of columns split on whether their values are numbers or not:
        numerated_columns = [name for name, worded in zip(columns, self.boolmap) if not worded]
        worded_columns = [name for name, worded in zip(columns, self.boolmap) if worded]

        # rows with NA in a worded column are removed, since the mean method will not work.
        # This is done in one label-based pass: dropping per column using positional
        # indices goes wrong as soon as an earlier drop has left gaps in the index.
        if worded_columns:
            data = data.dropna(subset=worded_columns)

        # numbered columns may have been read as text because of the "-" placeholders,
        # so convert every value to a real number (not only the ones being replaced)
        for column in numerated_columns:
            data[column] = pd.to_numeric(data[column])

        # we then either replace NA in numbered columns with the column mean or remove the rows
        if self.replaceNAN:
            for column in numerated_columns:
                data[column] = data[column].fillna(data[column].mean(skipna=True))
        else:
            data = data.dropna()

        return (data.reset_index(drop=True), worded_columns)

    def Encode(self, data: pd.DataFrame, encode_columns: list) -> pd.DataFrame:
        """Encodes columns either using a label encoder or a onehot encoder.
            Inputs:
            self: contains attributes (refer to notes at __init__)
            data: data to be encoded (not modified; an encoded copy is returned).
            encode_columns: list of columns by label to be encoded. The final entry
            is skipped, since it is treated as the dependent variable.

            Returns:
            Pandas dataframe of the encoded data"""

        # exclude final column from being encoded since it is the dependant variable
        encode_columns = encode_columns[:-1]
        print(encode_columns)

        # work on a copy so the frame handed in by the caller is left as it was
        data = data.copy()

        # check if onehot or label encoder is being used
        if self.encodetype == "onehot":
            for column in encode_columns:
                # one new 0/1 column per distinct category, in sorted category order
                for category in sorted(set(data[column])):
                    data[f"{column}_{category}"] = (data[column] == category).astype(int)

                # the original column is fully represented by the new ones, so drop it
                data = data.drop(column, axis=1)

            return data

        # must be label due to the check in __init__
        for column in encode_columns:
            # map the sorted distinct categories to the integers 0..n-1
            categories = sorted(set(data[column]))
            category_map = {category: code for code, category in enumerate(categories)}

            # apply value using the map
            data[column] = data[column].map(category_map)

        return data
    

    
# Boolean map of the columns (True marks a non-numeric column), excluding items
# that are ignored: three leading non-numeric columns, thirteen numeric columns,
# and one trailing non-numeric column.
classmap = [True] * 3 + [False] * 13 + [True]

file_directory = "Crystal_structure.csv"

# Run the whole import -> clean -> encode pipeline, replacing missing numeric
# values with the column mean.
clean = Preprocessor(
    file_directory=file_directory,
    not_number=classmap,
    replaceNAN=True,
).Process()
clean

['A', 'B', 'In literature']


Unnamed: 0,A,B,In literature,v(A),v(B),r(AXII)(Å),r(AVI)(Å),r(BVI)(Å),EN(A),EN(B),l(A-O)(Å),l(B-O)(Å),ΔENR,tG,τ,μ,Lowest distortion
0,0,0,0,0,0,1.12,1.12,1.12,1.10,1.10,0.00000,0.000000,-3.248000,0.707107,2.503861,0.800000,cubic
1,0,1,0,0,0,1.12,1.12,0.95,1.10,1.93,0.00000,2.488353,-2.565071,0.758259,2.503861,0.678571,orthorhombic
2,0,2,0,0,0,1.12,1.12,0.54,1.10,1.61,0.00000,1.892894,-1.846714,0.918510,2.503861,0.385714,cubic
3,0,3,0,0,0,1.12,1.12,0.52,1.10,2.18,0.00000,1.932227,-1.577429,0.928078,2.503861,0.371429,orthorhombic
4,0,4,0,0,0,1.12,1.12,0.93,1.10,2.54,0.00000,2.313698,-2.279786,0.764768,2.503861,0.664286,orthorhombic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5271,72,68,0,1,5,0.89,0.72,0.62,1.33,2.36,2.38342,1.745600,-1.572214,0.801621,5.228952455,0.442857,cubic
5272,72,69,0,1.609165,2.266492,0.89,0.72,0.90,1.33,1.22,2.38342,2.235124,-2.489571,0.704032,2.503861,0.642857,cubic
5273,72,70,0,1.609165,2.266492,0.89,0.72,0.95,1.33,1.10,2.38342,2.223981,-2.626821,0.689053,2.503861,0.678571,orthorhombic
5274,72,71,0,1.609165,2.266492,0.89,0.72,0.74,1.33,1.65,2.38342,2.096141,-2.035750,0.756670,2.503861,0.528571,cubic
