In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from typing import Union


class DataLoader:
    """Load a CSV dataset, strip its constant columns and split features/target.

    Attributes set by ``__init__``:
        df: the loaded frame, without an ``'Unnamed: 0'`` column and without
            the columns reported by ``get_constant_columns``.
        constant_columns: ``{column name: its single value}`` for the removed
            columns, or ``None`` when no column was removed.

    Attributes set by ``feature_space``:
        target: name of the target column.
        y: the target column of ``df``.
        X: the remaining columns (standardised when ``scaling`` is true).
        fspace_keys: names of the feature columns, in ``df`` order.
        scaler: the fitted ``StandardScaler``, or ``None`` without scaling.
    """

    def __init__(self, file_path: str) -> None:
        """Read ``file_path`` and remove its constant columns.

        Args:
            file_path: path of the CSV file, handed to ``pandas.read_csv``.
        """
        self._file_path = file_path
        self.constant_columns = None

        print("Loading DataFrame ...\n")
        # errors='ignore' turns the drop into a no-op for files that do not
        # carry an 'Unnamed: 0' column instead of raising KeyError.
        self.df = pd.read_csv(file_path).drop(columns='Unnamed: 0', errors='ignore')

        self._constant_columns = get_constant_columns(df=self.df)
        if len(self._constant_columns) > 0:
            print(f'Removing constant columns\n{self._constant_columns}\n')
            # Read each value from its own column: a constant column holds a
            # single distinct value, so its first entry is that value. Pooling
            # the values of all columns would lose the column/value pairing.
            self.constant_columns = {
                name: self.df[name].iloc[0] for name in self._constant_columns
            }
            # Drop by exact label. remove_columns() matches by substring and
            # would also discard a varying column whose name merely contains
            # the name of a constant one.
            self.df = self.df.drop(columns=self._constant_columns)

    def feature_space(self, target: str, scaling: bool = True) -> None:
        """Split ``df`` into the target column and the feature matrix.

        Args:
            target: name of the column of ``df`` used as target.
            scaling: when true, standardise the features with a freshly
                fitted ``StandardScaler`` that is kept in ``self.scaler``;
                when false, ``X`` is left unscaled and ``self.scaler`` is
                ``None``.

        Raises:
            KeyError: if ``target`` is not a column of ``df``.
        """
        # target definition
        self.target = target
        self.y = self.df[self.target]

        # creating the feature space
        self.X = self.df.drop(columns=self.target)
        # feature space labels keys
        self.fspace_keys = list(self.X.columns)

        print(f'Feature space: {self.fspace_keys},\nTarget property: {target}')

        if scaling:
            self.scaler = StandardScaler()
            scaled = self.scaler.fit_transform(X=self.X)
            # Re-attach df's index so the rows of X stay aligned with y.
            self.X = pd.DataFrame(data=scaled,
                                  columns=self.fspace_keys,
                                  index=self.df.index)
        else:
            # Never leave the attribute missing, or pointing at a scaler
            # fitted by an earlier call.
            self.scaler = None


def get_constant_columns(df: pd.DataFrame) -> list[str]:
    """Return the labels of the columns of ``df`` holding one distinct value.

    Args:
        df: frame whose columns are inspected one by one.

    Returns:
        The labels of every column whose ``unique()`` result has exactly one
        entry, in the frame's column order. A column without rows has no
        distinct value and is therefore not reported.
    """
    return [
        label
        for label, column in df.items()
        if len(column.unique()) == 1
    ]


def remove_columns(df: pd.DataFrame,
                   key: Union[list[str], str]) -> pd.DataFrame:
    """Return ``df`` without the columns whose name contains ``key``.

    Matching is by substring, not by exact label: a column is dropped as soon
    as one of the given keys occurs anywhere in its name.

    Args:
        df: source frame; it is not modified.
        key: one substring, or a list of substrings, to look for in the
            column names.

    Returns:
        A new frame holding only the columns that matched none of the keys.

    Raises:
        TypeError: if ``key`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(key, str):
        patterns = [key]
    elif isinstance(key, list):
        patterns = key
    else:
        raise TypeError("Key parameters can only be `str` or `list`.")

    # Collect the matches key by key; a column hit by several keys is listed
    # more than once, which the drop below tolerates.
    matching = []
    for pattern in patterns:
        matching.extend(name for name in df.columns if pattern in name)

    return df.drop(labels=matching, axis=1)

In [2]:
# Directory the dataset CSV files are read from. The trailing '/' matters:
# the file name is appended to it with plain string concatenation below.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# on another machine as-is; consider making it configurable.
data_dir = '/home/andreag/Work/1.main_project/git_repo/CoacervsOpti/1.dataset_manipulation/0.sliced_dataframe/'

In [3]:
# CSV file names (plain strings, not DataFrames) to be appended to data_dir.
# NOTE(review): the 2d/3d suffixes presumably give the number of varying
# features in each file — confirm. df_2d is not used in the cells shown here.
df_2d = 'lys_asp_MWca_38.0_13.2.csv'
df_3d = 'lys_asp_MWca_16.525_37.0_v2.csv'

In [4]:
# Load the df_3d file (DataLoader removes its constant columns on load), then
# split it into features and the 'Phase' target. `scaling` is left at its
# default (True), so data.X holds standardised features and data.scaler the
# fitted scaler.
data = DataLoader(file_path=data_dir+df_3d)
data.feature_space(target='Phase')

Loading DataFrame ...

Removing constant columns
['MW_Anion_C(C(C(=O)O)N)C(=O)O', 'MW_Cation_C(CCN)CC(C(=O)O)N']

Feature space: ['Conc_Anion_C(C(C(=O)O)N)C(=O)O', 'Conc_Cation_C(CCN)CC(C(=O)O)N', 'Conc_NaCl'],
Target property: Phase


In [5]:
# Display the (scaled) feature matrix built by feature_space.
data.X

Unnamed: 0,Conc_Anion_C(C(C(=O)O)N)C(=O)O,Conc_Cation_C(CCN)CC(C(=O)O)N,Conc_NaCl
0,1.004048,-0.298957,-1.556378
1,0.678273,0.026792,-1.556378
2,0.352498,0.352541,-1.556378
3,0.026723,0.678291,-1.556378
4,-0.299052,1.004040,-1.556378
...,...,...,...
6878,1.818485,1.818414,0.175892
6879,1.818485,1.818414,0.368366
6880,1.818485,1.818414,0.560841
6881,1.818485,1.818414,1.138264


In [6]:
# Map the scaled features back through the fitted scaler to recover the
# values in their original units.
data.scaler.inverse_transform(data.X)

array([[   5.6,    2.4,  100. ],
       [   4.8,    3.2,  100. ],
       [   4. ,    4. ,  100. ],
       ...,
       [   7.6,    7.6, 1200. ],
       [   7.6,    7.6, 1500. ],
       [   7.6,    7.6, 2000. ]])