In [88]:
import numpy as np
import pandas as pd

def ImportAsDf(file_directory: str, filetype: str = "csv") -> pd.DataFrame:
    """Loads a data file into a pandas DataFrame.
        Inputs: file_directory: location of the file, handed straight to
        pandas.read_csv,
        filetype: format of the file (defaulted as csv); "csv" is the only
        format that is read

        Returns: pandas dataframe of the raw data, or None when filetype is
        anything other than "csv" """

    # Guard clause: every format except csv is unsupported and yields None.
    if filetype != "csv":
        return None
    return pd.read_csv(file_directory)

def CleanData(data: pd.DataFrame, isclass: list, replace=True) -> pd.DataFrame:
    """Goes through the data and removes/replaces any N/A values ("-" or NaN).
    Inputs:
    data: data to be cleaned (a cleaned copy is returned; the dataframe passed
    in is not modified),
    isclass: one flag per column, in column order: truthy if the values of the
    column are worded (categorical), falsy if they are numbers,
    replace: true if N/A values in number columns should be replaced by the
    mean of the column, false if every row containing an N/A should be
    omitted (defaulted to true).

    Returns: pandas dataframe of the cleaned data. Rows are renumbered from 0
    and the row numbers from before cleaning are kept in the column that
    reset_index() adds (named "index").

    Raises: ValueError if replace is true and a number column holds a value
    that cannot be parsed as a number; IndexError if isclass has fewer entries
    than data has columns."""

    # Renumber the rows 0..n-1 on a fresh frame and treat the "-" placeholder
    # as a missing value.
    data = data.reset_index(drop=True).replace("-", np.nan)

    # Split the columns on whether their values are numbers or categories.
    numerated_columns = [col for i, col in enumerate(data.columns) if not isclass[i]]
    worded_columns = [col for i, col in enumerate(data.columns) if isclass[i]]

    # A mean cannot stand in for a missing category, so rows missing any worded
    # value are always removed. This is done in one label-based pass: dropping
    # column by column using enumerate() positions removes the wrong rows (or
    # raises KeyError) once an earlier drop has left gaps in the index.
    if worded_columns:
        data = data.dropna(subset=worded_columns)

    if replace:
        for column in numerated_columns:
            # Columns that contained "-" are read as text; convert them so the
            # result is numeric throughout rather than text mixed with a
            # float mean.
            numeric = pd.to_numeric(data[column])
            # Series.mean() skips NaN by default.
            data[column] = numeric.fillna(numeric.mean())
        return data.reset_index()

    # replace is false: omit every row that still contains an N/A value.
    return data.dropna().reset_index()


    
    
# Boolean map over the CSV columns, in column order: True marks a non-numeric
# (worded) column. The first four columns and the last one are worded; the
# thirteen in between are numeric.
classmap = [True] * 4 + [False] * 13 + [True]
file_directory = "Crystal_structure.csv"
data = ImportAsDf(file_directory)

# Clean the raw table, filling missing numeric values with the column mean.
clean = CleanData(data, isclass=classmap, replace=True)
# Notebook display: show a slice of the cleaned rows.
clean.iloc[1030:1060]

#for column in clean.columns:
    #clean[column] = clean[column].apply(lambda x: 1 if pd.isna(x) == True else 0)
#clean = clean.sum(axis=0)
#clean

Unnamed: 0,index,Compound,A,B,In literature,v(A),v(B),r(AXII)(Å),r(AVI)(Å),r(BVI)(Å),EN(A),EN(B),l(A-O)(Å),l(B-O)(Å),ΔENR,tG,τ,μ,Lowest distortion
1030,1036,Cs2O3,Cs,Cs,False,1.609165,2.266492,1.88,1.67,1.67,0.79,0.79,3.300176,3.009747,-5.411536,0.755476,2.503861,1.192857,cubic
1031,1037,CsCuO3,Cs,Cu,False,1.609165,2.266492,1.88,1.67,0.68,0.79,1.9,3.300176,2.005574,-2.513,1.115053,2.503861,0.485714,cubic
1032,1038,CsDyO3,Cs,Dy,False,1.609165,2.266492,1.88,1.67,0.99,0.79,1.22,3.300176,2.255581,-3.643821,0.970423,2.503861,0.707143,cubic
1033,1039,CsErO3,Cs,Er,False,1.609165,2.266492,1.88,1.67,0.89,0.79,1.24,3.300176,2.250677,-3.412893,1.012799,2.503861,0.635714,cubic
1034,1040,CsEuO3,Cs,Eu,False,1.609165,2.266492,1.88,1.67,1.06,0.79,1.2,3.300176,2.354367,-3.8095,0.942809,2.503861,0.757143,cubic
1035,1041,CsFeO3,Cs,Fe,False,1.609165,2.266492,1.88,1.67,0.67,0.79,1.83,3.300176,1.999522,-2.53825,1.12044,2.503861,0.478571,tetragonal
1036,1042,CsGaO3,Cs,Ga,False,1.609165,2.266492,1.88,1.67,0.62,0.79,1.81,3.300176,1.974644,-2.442929,1.148173,2.503861,0.442857,cubic
1037,1043,CsGdO3,Cs,Gd,False,1.609165,2.266492,1.88,1.67,0.94,0.79,1.2,3.300176,2.284156,-3.5485,0.991158,2.503861,0.671429,rhombohedral
1038,1044,CsGeO3,Cs,Ge,False,1.609165,2.266492,1.88,1.67,0.63,0.79,2.01,3.300176,1.899964,-2.330393,1.142517,2.503861,0.45,tetragonal
1039,1045,CsHfO3,Cs,Hf,False,1.609165,2.266492,1.88,1.67,0.71,0.79,1.3,3.300176,2.027412,-2.981107,1.099199,2.503861,0.507143,cubic
