# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from math import log

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

In [2]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides numpy RuntimeWarnings (divide-by-zero, invalid
# value) that the ratio / power / log cells below could raise.
warnings.simplefilter("ignore")

# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [3]:
# Relative directory path; presumably where this notebook's results are meant to go.
# NOTE(review): no cell visible here reads `path_to_save` (the final export
# writes under ./../data) — confirm whether it is still needed.
path_to_save = './../results/02_clustering'

# Loading data

In [4]:
# Relative path to the Excel workbook that is read into `initial_data` below.
path_to_colloidal_data = './../data/cp_features_and_description.xlsx'

In [5]:
# Read the workbook; the first spreadsheet column becomes the row index.
initial_data = pd.read_excel(path_to_colloidal_data, index_col=0)

# Quick sanity check: print the dimensions, then display the first three rows.
print(initial_data.shape)
initial_data.head(3)

(25, 7)


Unnamed: 0,$PDI$,"$d_{av}, nm$","$ζ, mV$",Description,Type,Class,Label
1,0.496,1417.0,-8.2,"MFGO, разб. 1:20",$GO$,1,Train
2,0.301,352.9,-28.0,"HGO, 0.0116 г в 25 мл H2O",$GO$,1,Train
3,0.916,3286.3,-33.5,"GO, RusGraphene, 2 г в 1000 мл H2O",$GO$,1,Train


# Dataset wrangling

In [6]:
# Work on a copy so that `initial_data` keeps the values exactly as loaded.
data = initial_data.copy()

# Print the dimensions, then display the first three rows of the working copy.
print(data.shape)
data.head(3)

(25, 7)


Unnamed: 0,$PDI$,"$d_{av}, nm$","$ζ, mV$",Description,Type,Class,Label
1,0.496,1417.0,-8.2,"MFGO, разб. 1:20",$GO$,1,Train
2,0.301,352.9,-28.0,"HGO, 0.0116 г в 25 мл H2O",$GO$,1,Train
3,0.916,3286.3,-33.5,"GO, RusGraphene, 2 г в 1000 мл H2O",$GO$,1,Train


## Features data type validation

In [7]:
# Show the dtype of every column as a single-row table (one column per feature).
pd.DataFrame(data.dtypes).T

Unnamed: 0,$PDI$,"$d_{av}, nm$","$ζ, mV$",Description,Type,Class,Label
0,float64,float64,float64,object,object,int64,object


## Additional mapping for the `Type` column

In [8]:
# NOTE: this whole cell is commented out, so it has no effect when the notebook runs.
# If enabled, it would replace each `Type` value ('$rGO$', '$GO$', '$GO_{d}$')
# with the Russian-language label given in `type_dict`.
# type_dict = {
#     '$rGO$': '$вОГ$',
#     '$GO$': '$ОГ$',
#     '$GO_{d}$': '$ОГ_{диализ}$',
# }

# data['Type'] = data['Type'].apply(lambda x: type_dict[x])

# print(data.shape)
# data.head(3)

## Selecting the feature columns

In [9]:
# Labels given to the selected feature columns (the `n` subscript is part of the name).
feature_labels = ['$PDI_{n}$', '$d_{av, n}$', '$ζ_{n}$']

# Take as many leading columns of `data` as there are labels (the first three),
# then relabel them.
base_features_df = data.iloc[:, :len(feature_labels)]
base_features_df.columns = feature_labels

print(base_features_df.shape)
base_features_df.head(3)

(25, 3)


Unnamed: 0,$PDI_{n}$,"$d_{av, n}$",$ζ_{n}$
1,0.496,1417.0,-8.2
2,0.301,352.9,-28.0
3,0.916,3286.3,-33.5


In [10]:
# Start the engineered-feature table as a copy of the base features.
# The cells below append derived columns to `engineered_data`, while always
# reading their inputs from the unchanged `base_features_df`.
engineered_data = base_features_df.copy()

# Print the dimensions, then display the first three rows.
print(engineered_data.shape)
engineered_data.head(3)

(25, 3)


Unnamed: 0,$PDI_{n}$,"$d_{av, n}$",$ζ_{n}$
1,0.496,1417.0,-8.2
2,0.301,352.9,-28.0
3,0.916,3286.3,-33.5


# Feature Engineering

## Extracting base features

In [11]:
# Snapshot the current column labels as a plain list. At this point the frame
# holds only the base features, so this list drives all derived-feature loops.
base_features = list(engineered_data.columns)

In [12]:
# Map each base-feature column label to the bare LaTeX symbol that is embedded
# in the names of the derived features. Pairing is positional.
dict_initial_raw = {
    column: symbol
    for column, symbol in zip(base_features, ['PDI', 'd_{av}', 'ζ'])
}

## Creating pairwise ratio features

In [13]:
# Build one ratio column per unordered pair of base features: each feature is
# divided by every feature that comes after it in `base_features`.
for position, numerator in enumerate(base_features):
    numerator_raw = dict_initial_raw[numerator]

    # Only later features act as denominators, so each pair appears once.
    for denominator in base_features[position + 1:]:
        denominator_raw = dict_initial_raw[denominator]

        ratio_name = r'$(\frac{' + numerator_raw + '}{' + denominator_raw + '})_{n}$'
        engineered_data[ratio_name] = base_features_df[numerator] / base_features_df[denominator]

## Raising features to the powers *n* and –*n*

In [14]:
# Exponents in the order their columns are appended: -1, then +k / -k for k = 2..5.
exponents = [-1]
for magnitude in range(2, 6):
    exponents.extend((magnitude, -magnitude))

for feature in base_features:
    symbol = dict_initial_raw[feature]
    values = base_features_df[feature]

    for exponent in exponents:
        # Negative exponents are wrapped in braces for LaTeX; positive ones are not.
        if exponent < 0:
            exponent_label = '{' + str(exponent) + '}'
        else:
            exponent_label = str(exponent)

        # The trailing `_{n}` is a literal subscript in the column name, not a variable.
        power_name = '$(' + symbol + '^' + exponent_label + ')_{n}$'
        engineered_data[power_name] = values ** exponent

## *Logarithm* of features

In [15]:
# Add the natural logarithm of each base feature's absolute value.
# The absolute value is taken first, so negative inputs are accepted.
for feature in base_features:
    symbol = dict_initial_raw[feature]

    log_name = '$(ln|' + symbol + '|)_{n}$'
    engineered_data[log_name] = np.log(base_features_df[feature].abs())

## Scaling features

In [16]:
# Keep the labels: the scaler returns a bare array that has to be re-wrapped.
columns, indexes = engineered_data.columns, engineered_data.index

# Min-max scale every column (the output below confirms a [0, 1] range).
# NOTE(review): the scaler is fit on all rows at once; if the Train/Test split
# in the `Label` column matters downstream, confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(engineered_data)

engineered_data = pd.DataFrame(data=scaled_values, index=indexes, columns=columns)

print(engineered_data.shape)
engineered_data.head(3)

(25, 36)


Unnamed: 0,$PDI_{n}$,"$d_{av, n}$",$ζ_{n}$,$(\frac{PDI}{d_{av}})_{n}$,$(\frac{PDI}{ζ})_{n}$,$(\frac{d_{av}}{ζ})_{n}$,$(PDI^{-1})_{n}$,$(PDI^2)_{n}$,$(PDI^{-2})_{n}$,$(PDI^3)_{n}$,$(PDI^{-3})_{n}$,$(PDI^4)_{n}$,$(PDI^{-4})_{n}$,$(PDI^5)_{n}$,$(PDI^{-5})_{n}$,$(d_{av}^{-1})_{n}$,$(d_{av}^2)_{n}$,$(d_{av}^{-2})_{n}$,$(d_{av}^3)_{n}$,$(d_{av}^{-3})_{n}$,$(d_{av}^4)_{n}$,$(d_{av}^{-4})_{n}$,$(d_{av}^5)_{n}$,$(d_{av}^{-5})_{n}$,$(ζ^{-1})_{n}$,$(ζ^2)_{n}$,$(ζ^{-2})_{n}$,$(ζ^3)_{n}$,$(ζ^{-3})_{n}$,$(ζ^4)_{n}$,$(ζ^{-4})_{n}$,$(ζ^5)_{n}$,$(ζ^{-5})_{n}$,$(ln|PDI|)_{n}$,$(ln|d_{av}|)_{n}$,$(ln|ζ|)_{n}$
1,0.317073,0.290777,1.0,0.107421,0.0,0.102205,0.414437,0.207648,0.291802,0.127819,0.194922,0.075187,0.125428,0.042884,0.078775,0.140393,0.110436,0.034858,0.038417,0.007459,0.013073,0.001515,0.004428,0.000303,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.448794,0.599089,0.0
2,0.0,0.018524,0.333333,0.352948,0.979169,0.977305,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.780119,0.002625,0.630116,0.000298,0.502012,3e-05,0.399128,3e-06,0.317252,0.902381,0.5235,0.040867,0.602873,0.984858,0.296361,0.005176,0.780283,0.998319,0.0,0.084927,0.802232
3,1.0,0.769042,0.148148,0.07261,0.652504,0.510288,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.019713,0.613719,0.002814,0.482671,0.000322,0.378768,3.3e-05,0.297155,3e-06,0.963737,0.770547,0.013748,0.312581,0.995416,0.609555,0.001402,0.460673,0.999595,1.0,0.910232,0.919386


In [17]:
# Per-column maximum and minimum, to check the range of the scaled features.
engineered_data.agg(['max', 'min'])

Unnamed: 0,$PDI_{n}$,"$d_{av, n}$",$ζ_{n}$,$(\frac{PDI}{d_{av}})_{n}$,$(\frac{PDI}{ζ})_{n}$,$(\frac{d_{av}}{ζ})_{n}$,$(PDI^{-1})_{n}$,$(PDI^2)_{n}$,$(PDI^{-2})_{n}$,$(PDI^3)_{n}$,$(PDI^{-3})_{n}$,$(PDI^4)_{n}$,$(PDI^{-4})_{n}$,$(PDI^5)_{n}$,$(PDI^{-5})_{n}$,$(d_{av}^{-1})_{n}$,$(d_{av}^2)_{n}$,$(d_{av}^{-2})_{n}$,$(d_{av}^3)_{n}$,$(d_{av}^{-3})_{n}$,$(d_{av}^4)_{n}$,$(d_{av}^{-4})_{n}$,$(d_{av}^5)_{n}$,$(d_{av}^{-5})_{n}$,$(ζ^{-1})_{n}$,$(ζ^2)_{n}$,$(ζ^{-2})_{n}$,$(ζ^3)_{n}$,$(ζ^{-3})_{n}$,$(ζ^4)_{n}$,$(ζ^{-4})_{n}$,$(ζ^5)_{n}$,$(ζ^{-5})_{n}$,$(ln|PDI|)_{n}$,$(ln|d_{av}|)_{n}$,$(ln|ζ|)_{n}$
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Saving data

In [18]:
# Write the scaled feature table to an Excel file (row index included by pandas default).
# Presumably this file is the input of later modelling steps — verify against its consumers.
engineered_data.to_excel('./../data/data_for_modeling.xlsx')