# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import json

import re

from tqdm import tqdm

from itertools import combinations

In [2]:
# Silence every warning for the rest of the session so cell outputs stay readable.
# NOTE(review): this also hides deprecation/performance warnings from pandas and
# numpy - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# pandas display limits. This configuration used to be duplicated in two
# consecutive cells with conflicting `display.max_columns` values (500, then 100);
# the second cell always won, so only the effective values are kept here.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [4]:
# Presumably the output directory for this notebook's results; no other visible
# cell references it - TODO confirm it is still needed.
path_to_save = './../results/03a_features'

# Loading data

In [5]:
# Relative locations of the two ';'-separated CSV inputs read by the loading cells.
path_to_ftir_data= './../data/processed_data/ftir_extracted_features.csv'
path_to_samples_description = './../data/parsed_data/samples_description.csv'

## FTIR-features

In [6]:
# Read the extracted FTIR features; the first CSV column becomes the row index.
initial_features_df = pd.read_csv(
    path_to_ftir_data,
    sep=';',
    index_col=0,
)

# Quick sanity check: table dimensions followed by a three-row preview.
print(initial_features_df.shape)
initial_features_df.head(3)

(75, 40)


Unnamed: 0,$\overline{I}_{3800\text{–}2500}$,$I_{3733}$,$\overline{I}_{3733 \pm 32}$,$I_{3383}$,$\overline{I}_{3383 \pm 32}$,$I_{3297}$,$\overline{I}_{3297 \pm 32}$,$I_{2930}$,$\overline{I}_{2930 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1724}$,$\overline{I}_{1724 \pm 32}$,$I_{1641}$,$\overline{I}_{1641 \pm 32}$,$I_{1596}$,$\overline{I}_{1596 \pm 32}$,$I_{1544}$,$\overline{I}_{1544 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1430}$,$\overline{I}_{1430 \pm 32}$,$I_{1391}$,$\overline{I}_{1391 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1243}$,$\overline{I}_{1243 \pm 32}$,$I_{1182}$,$\overline{I}_{1182 \pm 32}$,$I_{1117}$,$\overline{I}_{1117 \pm 32}$,$I_{1067}$,$\overline{I}_{1067 \pm 32}$,$I_{876}$,$\overline{I}_{876 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2500}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$"
division_1_size_bulk,0.396756,0.050403,0.040558,1.0,0.970571,0.945211,0.938813,0.265624,0.26642,0.550025,0.550451,0.423149,0.417009,1.0,0.877687,0.873005,0.845403,0.431357,0.460425,0.633387,0.677517,0.661219,0.815626,0.785468,0.508124,0.53737,0.528082,0.468417,0.484329,0.67056,0.6615,0.733507,0.721275,0.078995,0.079059,2.261238,6.630507,2.727511,1.493143,2.409853
division_1_size_5,0.430882,0.006416,0.005691,1.0,0.995591,0.908319,0.905618,0.317784,0.311382,0.494561,0.532219,0.415541,0.409133,1.0,0.883698,0.85691,0.827012,0.386789,0.41698,0.568436,0.621224,0.605414,0.739353,0.712254,0.429493,0.497679,0.493277,0.476609,0.489566,0.633977,0.614535,0.547669,0.54026,0.077024,0.079152,2.232519,6.175752,2.659241,1.360577,2.155934
division_1_size_3,0.407828,0.041028,0.029982,1.0,0.98195,0.901208,0.898619,0.430246,0.405804,0.555338,0.562808,0.377704,0.38113,1.0,0.89399,0.903661,0.882435,0.518637,0.539162,0.577782,0.648574,0.631076,0.744396,0.716405,0.538545,0.469872,0.469877,0.57469,0.591524,0.875649,0.846835,0.749219,0.74455,0.138402,0.140829,2.372481,6.862403,2.800002,1.39297,2.669431


## Samples description

In [7]:
# Read the per-sample metadata; the first CSV column becomes the row index.
# The path variable is passed directly - wrapping it in an f-string added nothing.
samples_description = pd.read_csv(path_to_samples_description, sep=';', index_col=0)

# Quick sanity check: table dimensions followed by a three-row preview.
print(samples_description.shape)
samples_description.head(3)

(75, 6)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,1,1,bulk,$> 0$,$d > 5$,2
division_1_size_5,2,1,5,$< 5$,$2 < d \leq 5$,1
division_1_size_3,3,1,3,$< 3$,$2 < d \leq 5$,1


# Feature engineering

## Function for processing

In [8]:
def generate_transformed_features(data, feature_list, tolerance=50, zero_substitute=1e-8):
    r"""Derive log, reciprocal and pairwise-ratio features from selected columns.

    For every column in ``feature_list`` two columns are produced: its natural
    logarithm and its reciprocal (power ``-1``). For every unordered pair of
    columns ``(f1, f2)``, taken in ``feature_list`` order, the ratio
    ``f1 / f2`` is produced unless the pair is skipped:

    * the first integer found in each name differs by at most ``tolerance``
      and at least one of the two names marks an averaged feature (contains
      ``\overline{I}`` or ``±``);
    * the two names fall into different interval tags, where the first
      integer in the name maps to ``"high"`` for 2500..3800, ``"low"`` for
      800..1900 and ``None`` otherwise (two ``None`` tags count as equal).

    Exact zeros are replaced by ``zero_substitute`` before each transform and
    infinite results are converted to NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; must contain every column named in ``feature_list``.
    feature_list : iterable of str
        Column names to transform. Any iterable is accepted, including
        one-shot generators.
    tolerance : int, default 50
        Maximum distance between the leading integers of two names for the
        pair to be considered a near duplicate.
    zero_substitute : float, default 1e-8
        Value substituted for exact zeros.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``data`` whose columns are LaTeX-formatted
        names of the derived features. ``data`` itself is not modified.
    """

    def clean(col):
        # Strip one pair of surrounding '$' so names can be nested in a new formula.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def wrap_log(col):
        return fr"$\log {clean(col)}$"

    def wrap_pow(col, power):
        return fr"$({clean(col)})^{{{power}}}$"

    def wrap_div(col1, col2):
        return fr"$\frac{{{clean(col1)}}}{{{clean(col2)}}}$"

    def extract_center(name):
        # The first run of digits in the name is used as its position.
        nums = re.findall(r'\d+', name)
        return int(nums[0]) if nums else None

    def get_interval_tag(center):
        if center is None:
            return None
        if 2500 <= center <= 3800:
            return "high"
        elif 800 <= center <= 1900:
            return "low"
        return None

    def extract_center_and_type(name):
        center = extract_center(name)
        is_avg = r'\overline{I}' in name or '±' in name
        return center, is_avg

    def is_near_duplicate(f1, f2):
        c1, avg1 = extract_center_and_type(f1)
        c2, avg2 = extract_center_and_type(f2)
        if c1 is None or c2 is None:
            return False
        return abs(c1 - c2) <= tolerance and (avg1 or avg2)

    # Materialise once: the names are iterated twice (single-column transforms
    # and pairwise ratios), which would silently exhaust a generator.
    feature_list = list(feature_list)

    # Collect the derived columns and build the frame in one go; inserting
    # hundreds of columns one at a time fragments the DataFrame.
    derived = {}

    # Scope the floating-point error suppression to this computation instead of
    # changing NumPy's global error state for the rest of the session.
    with np.errstate(divide='ignore', invalid='ignore'):
        for col in feature_list:
            base_safe = data[col].replace(0, zero_substitute)

            derived[wrap_log(col)] = np.log(base_safe)

            for power in (-1,):
                powered = np.power(base_safe, power)
                derived[wrap_pow(col, power)] = powered.replace([np.inf, -np.inf], np.nan)

        for f1, f2 in combinations(feature_list, 2):
            if is_near_duplicate(f1, f2):
                continue
            # Only combine features that fall into the same interval tag.
            if get_interval_tag(extract_center(f1)) != get_interval_tag(extract_center(f2)):
                continue

            numerator = data[f1].replace(0, zero_substitute)
            denominator = data[f2].replace(0, zero_substitute)
            ratio = numerator / denominator

            derived[wrap_div(f1, f2)] = ratio.replace([np.inf, -np.inf], np.nan)

    return pd.DataFrame(derived, index=data.index)

## Calculation process

In [9]:
# Derive log / reciprocal / ratio features from every column of the loaded table.
engineered_features_df = generate_transformed_features(
    data=initial_features_df,
    feature_list=initial_features_df.columns,
    tolerance=50,
)

# Table dimensions followed by a three-row preview.
print(engineered_features_df.shape)
engineered_features_df.head(3)

(75, 525)


Unnamed: 0,$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,$\log I_{3733}$,$(I_{3733})^{-1}$,$\log \overline{I}_{3733 \pm 32}$,$(\overline{I}_{3733 \pm 32})^{-1}$,$\log I_{3383}$,$(I_{3383})^{-1}$,$\log \overline{I}_{3383 \pm 32}$,$(\overline{I}_{3383 \pm 32})^{-1}$,$\log I_{3297}$,$(I_{3297})^{-1}$,$\log \overline{I}_{3297 \pm 32}$,$(\overline{I}_{3297 \pm 32})^{-1}$,$\log I_{2930}$,$(I_{2930})^{-1}$,$\log \overline{I}_{2930 \pm 32}$,$(\overline{I}_{2930 \pm 32})^{-1}$,$\log \overline{I}_{1800\text{–}900}$,$(\overline{I}_{1800\text{–}900})^{-1}$,$\log \overline{I}_{1800\text{–}1500}$,$(\overline{I}_{1800\text{–}1500})^{-1}$,$\log I_{1724}$,$(I_{1724})^{-1}$,$\log \overline{I}_{1724 \pm 32}$,$(\overline{I}_{1724 \pm 32})^{-1}$,$\log I_{1641}$,$(I_{1641})^{-1}$,$\log \overline{I}_{1641 \pm 32}$,$(\overline{I}_{1641 \pm 32})^{-1}$,$\log I_{1596}$,$(I_{1596})^{-1}$,$\log \overline{I}_{1596 \pm 32}$,$(\overline{I}_{1596 \pm 32})^{-1}$,$\log I_{1544}$,$(I_{1544})^{-1}$,$\log \overline{I}_{1544 \pm 32}$,$(\overline{I}_{1544 \pm 32})^{-1}$,$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,$\log I_{1430}$,$(I_{1430})^{-1}$,$\log \overline{I}_{1430 \pm 32}$,$(\overline{I}_{1430 \pm 32})^{-1}$,$\log I_{1391}$,$(I_{1391})^{-1}$,$\log \overline{I}_{1391 \pm 32}$,$(\overline{I}_{1391 \pm 32})^{-1}$,$\log \overline{I}_{1300\text{–}900}$,$(\overline{I}_{1300\text{–}900})^{-1}$,...,"$\frac{I_{1182}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1182 \pm 32}}{I_{1117}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{1117 \pm 32}}$,$\frac{\overline{I}_{1182 \pm 32}}{I_{1067}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{1067 \pm 32}}$,$\frac{\overline{I}_{1182 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1182 
\pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1117}}{I_{1067}}$,$\frac{I_{1117}}{I_{876}}$,$\frac{I_{1117}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1117}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1117 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1117 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1067}}{I_{876}}$,$\frac{I_{1067}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1067}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1067 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1067 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 
1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,-0.924433,2.520439,-2.987704,19.84008,-3.205016,24.655887,0.0,1.0,-0.029871,1.030321,-0.056347,1.057965,-0.063139,1.065174,-1.325672,3.764716,-1.322682,3.753476,-0.597792,1.8181,-0.597017,1.816691,-0.86003,2.363231,-0.874648,2.39803,0.0,1.0,-0.130466,1.139359,-0.135814,1.145468,-0.167942,1.182868,-0.84082,2.318268,-0.775605,2.171905,-0.456674,1.578815,-0.38932,1.475977,-0.41367,1.512358,-0.2038,1.226053,-0.241475,1.273126,-0.677029,1.968022,...,0.194376,0.722276,0.732168,0.660291,0.67149,6.13117,6.126205,0.073046,0.177572,0.324369,0.200979,0.914182,8.488686,8.481811,0.101132,0.24585,0.449093,0.278257,8.373996,8.367214,0.099766,0.242529,0.443025,0.274498,9.285551,9.278031,0.110626,0.268929,0.491251,0.304378,9.130697,9.123303,0.108781,0.264444,0.483058,0.299302,0.011914,0.028962,0.052905,0.03278,0.011923,0.028986,0.052948,0.032806,2.430973,4.440638,2.751415,1.826691,1.131816,0.619599
division_1_size_5,-0.841921,2.320821,-5.048926,155.855059,-5.168913,175.723662,0.0,1.0,-0.004419,1.004429,-0.09616,1.100935,-0.099137,1.104218,-1.146384,3.146794,-1.166735,3.211491,-0.704084,2.021994,-0.630701,1.878927,-0.878173,2.406499,-0.893714,2.444191,0.0,1.0,-0.12364,1.131608,-0.154422,1.166984,-0.189936,1.209172,-0.949875,2.585386,-0.874716,2.398194,-0.564867,1.759214,-0.476064,1.609725,-0.501842,1.651762,-0.30198,1.352534,-0.33932,1.403993,-0.845149,2.328326,...,0.221068,0.772214,0.796645,0.893907,0.906166,6.356036,6.185116,0.079272,0.1841,0.359822,0.227078,1.157591,8.230929,8.009591,0.102656,0.238405,0.465962,0.294061,7.978509,7.763959,0.099508,0.231094,0.451672,0.285043,7.110396,6.919191,0.088681,0.205949,0.402527,0.254029,7.014204,6.825585,0.087481,0.203163,0.397082,0.250592,0.012472,0.028965,0.056611,0.035726,0.012817,0.029765,0.058175,0.036714,2.322374,4.539069,2.864536,1.954495,1.233452,0.631085
division_1_size_3,-0.89691,2.452015,-3.193496,24.373499,-3.507158,33.353355,0.0,1.0,-0.018215,1.018382,-0.10402,1.109622,-0.106896,1.112819,-0.843399,2.324253,-0.901884,2.464242,-0.588179,1.800706,-0.574817,1.776806,-0.973645,2.647577,-0.964613,2.623773,0.0,1.0,-0.112061,1.118581,-0.101301,1.10661,-0.12507,1.133227,-0.65655,1.928129,-0.61774,1.854732,-0.548558,1.730755,-0.432979,1.541844,-0.460329,1.584595,-0.295182,1.343371,-0.33351,1.395859,-0.618885,1.856856,...,0.215286,0.675527,0.698512,0.789521,0.794472,4.273951,4.200315,0.086198,0.211258,0.42465,0.221592,1.168748,6.326843,6.217837,0.127601,0.312732,0.62862,0.328028,6.118654,6.013234,0.123402,0.302441,0.607935,0.317234,5.413349,5.320082,0.109177,0.267578,0.537858,0.280666,5.379613,5.286926,0.108497,0.265911,0.534506,0.278917,0.020168,0.049429,0.099358,0.051847,0.020522,0.050296,0.1011,0.052756,2.450856,4.926455,2.570737,2.010095,1.048914,0.521823


*NaN-values analysis*

In [10]:
# Count missing values per engineered column and report only the affected columns.
nan_df = engineered_features_df.isna().sum()
has_missing = nan_df > 0
nan_summary = nan_df[has_missing]
print(nan_summary)

Series([], dtype: int64)


## Merging process

In [11]:
# Combine base and engineered features side by side, matching rows on the index;
# only index labels present in both tables are kept (inner join).
features_df = initial_features_df.merge(
    engineered_features_df,
    how='inner',
    left_index=True,
    right_index=True,
)

# Table dimensions followed by a three-row preview.
print(features_df.shape)
features_df.head(3)

(75, 565)


Unnamed: 0,$\overline{I}_{3800\text{–}2500}$,$I_{3733}$,$\overline{I}_{3733 \pm 32}$,$I_{3383}$,$\overline{I}_{3383 \pm 32}$,$I_{3297}$,$\overline{I}_{3297 \pm 32}$,$I_{2930}$,$\overline{I}_{2930 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1724}$,$\overline{I}_{1724 \pm 32}$,$I_{1641}$,$\overline{I}_{1641 \pm 32}$,$I_{1596}$,$\overline{I}_{1596 \pm 32}$,$I_{1544}$,$\overline{I}_{1544 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1430}$,$\overline{I}_{1430 \pm 32}$,$I_{1391}$,$\overline{I}_{1391 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1243}$,$\overline{I}_{1243 \pm 32}$,$I_{1182}$,$\overline{I}_{1182 \pm 32}$,$I_{1117}$,$\overline{I}_{1117 \pm 32}$,$I_{1067}$,$\overline{I}_{1067 \pm 32}$,$I_{876}$,$\overline{I}_{876 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2500}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$",$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,$\log I_{3733}$,$(I_{3733})^{-1}$,$\log \overline{I}_{3733 \pm 32}$,$(\overline{I}_{3733 \pm 32})^{-1}$,$\log I_{3383}$,$(I_{3383})^{-1}$,$\log \overline{I}_{3383 \pm 32}$,$(\overline{I}_{3383 \pm 32})^{-1}$,...,"$\frac{I_{1182}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1182 \pm 32}}{I_{1117}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{1117 \pm 32}}$,$\frac{\overline{I}_{1182 \pm 32}}{I_{1067}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{1067 \pm 32}}$,$\frac{\overline{I}_{1182 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1182 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1182 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1117}}{I_{1067}}$,$\frac{I_{1117}}{I_{876}}$,$\frac{I_{1117}}{\overline{I}_{876 
\pm 32}}$,"$\frac{I_{1117}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1117}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1117 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1117 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1117 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1067}}{I_{876}}$,$\frac{I_{1067}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1067}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1067}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1067 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1067 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1067 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 
1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,0.396756,0.050403,0.040558,1.0,0.970571,0.945211,0.938813,0.265624,0.26642,0.550025,0.550451,0.423149,0.417009,1.0,0.877687,0.873005,0.845403,0.431357,0.460425,0.633387,0.677517,0.661219,0.815626,0.785468,0.508124,0.53737,0.528082,0.468417,0.484329,0.67056,0.6615,0.733507,0.721275,0.078995,0.079059,2.261238,6.630507,2.727511,1.493143,2.409853,-0.924433,2.520439,-2.987704,19.84008,-3.205016,24.655887,0.0,1.0,-0.029871,1.030321,...,0.194376,0.722276,0.732168,0.660291,0.67149,6.13117,6.126205,0.073046,0.177572,0.324369,0.200979,0.914182,8.488686,8.481811,0.101132,0.24585,0.449093,0.278257,8.373996,8.367214,0.099766,0.242529,0.443025,0.274498,9.285551,9.278031,0.110626,0.268929,0.491251,0.304378,9.130697,9.123303,0.108781,0.264444,0.483058,0.299302,0.011914,0.028962,0.052905,0.03278,0.011923,0.028986,0.052948,0.032806,2.430973,4.440638,2.751415,1.826691,1.131816,0.619599
division_1_size_5,0.430882,0.006416,0.005691,1.0,0.995591,0.908319,0.905618,0.317784,0.311382,0.494561,0.532219,0.415541,0.409133,1.0,0.883698,0.85691,0.827012,0.386789,0.41698,0.568436,0.621224,0.605414,0.739353,0.712254,0.429493,0.497679,0.493277,0.476609,0.489566,0.633977,0.614535,0.547669,0.54026,0.077024,0.079152,2.232519,6.175752,2.659241,1.360577,2.155934,-0.841921,2.320821,-5.048926,155.855059,-5.168913,175.723662,0.0,1.0,-0.004419,1.004429,...,0.221068,0.772214,0.796645,0.893907,0.906166,6.356036,6.185116,0.079272,0.1841,0.359822,0.227078,1.157591,8.230929,8.009591,0.102656,0.238405,0.465962,0.294061,7.978509,7.763959,0.099508,0.231094,0.451672,0.285043,7.110396,6.919191,0.088681,0.205949,0.402527,0.254029,7.014204,6.825585,0.087481,0.203163,0.397082,0.250592,0.012472,0.028965,0.056611,0.035726,0.012817,0.029765,0.058175,0.036714,2.322374,4.539069,2.864536,1.954495,1.233452,0.631085
division_1_size_3,0.407828,0.041028,0.029982,1.0,0.98195,0.901208,0.898619,0.430246,0.405804,0.555338,0.562808,0.377704,0.38113,1.0,0.89399,0.903661,0.882435,0.518637,0.539162,0.577782,0.648574,0.631076,0.744396,0.716405,0.538545,0.469872,0.469877,0.57469,0.591524,0.875649,0.846835,0.749219,0.74455,0.138402,0.140829,2.372481,6.862403,2.800002,1.39297,2.669431,-0.89691,2.452015,-3.193496,24.373499,-3.507158,33.353355,0.0,1.0,-0.018215,1.018382,...,0.215286,0.675527,0.698512,0.789521,0.794472,4.273951,4.200315,0.086198,0.211258,0.42465,0.221592,1.168748,6.326843,6.217837,0.127601,0.312732,0.62862,0.328028,6.118654,6.013234,0.123402,0.302441,0.607935,0.317234,5.413349,5.320082,0.109177,0.267578,0.537858,0.280666,5.379613,5.286926,0.108497,0.265911,0.534506,0.278917,0.020168,0.049429,0.099358,0.051847,0.020522,0.050296,0.1011,0.052756,2.450856,4.926455,2.570737,2.010095,1.048914,0.521823


# Creating features description

## Functions for processing

In [12]:
def classify_feature_type(base_columns, columns):
    r"""Group feature names by origin, source type and spectral region.

    Names containing ``"Unnamed"`` are dropped from both inputs first. Every
    remaining name is assigned to a group keyed
    ``"{kind}_features_{source}_{region}"`` where

    * ``kind`` is ``"base"`` for names in ``base_columns`` and
      ``"engineered"`` for names in ``columns`` that are not base names;
    * ``source`` is ``"intervals"`` when the name contains ``\overline{I}_{``
      or ``\Sigma I_{p,`` and ``"peaks"`` otherwise;
    * ``region`` is ``"3800_2500"`` when any 3-4 digit number in the name
      lies in 2500..3800, else ``"1900_800"`` when any lies in 800..1900,
      else ``"unknown"``.

    Parameters
    ----------
    base_columns : list of str
        Names of the original (non-engineered) features.
    columns : list of str
        Names of all features, base and engineered.

    Returns
    -------
    dict[str, list[str]]
        The per-group name lists (in input order) plus two summary entries,
        ``"base_features"`` and ``"engineered_features"``.
    """
    base_columns = [col for col in base_columns if "Unnamed" not in col]
    columns = [col for col in columns if "Unnamed" not in col]

    # Constant-time membership test, and the engineered list is computed once
    # and reused for both the grouping loop and the summary entry.
    base_lookup = set(base_columns)
    engineered_columns = [col for col in columns if col not in base_lookup]

    def clean(col):
        # Strip one pair of surrounding '$' and normalise the sigma glyph.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def is_interval(col):
        col = clean(col)
        return r"\overline{I}_{" in col or r"\Sigma I_{p," in col

    def extract_numbers(col):
        # Only 3-4 digit runs are considered, so window widths such as "32" are ignored.
        return list(map(int, re.findall(r"\d{3,4}", col)))

    def is_in_range(col, low, high):
        return any(low <= num <= high for num in extract_numbers(clean(col)))

    def classify(col, base=True):
        col_clean = clean(col)
        source = "intervals" if is_interval(col_clean) else "peaks"

        # The high-wavenumber region is checked first and wins when a name
        # mentions numbers from both regions.
        if is_in_range(col_clean, 2500, 3800):
            region = "3800_2500"
        elif is_in_range(col_clean, 800, 1900):
            region = "1900_800"
        else:
            region = "unknown"

        kind = "base" if base else "engineered"
        return f"{kind}_features_{source}_{region}"

    groups = {}

    for col in base_columns:
        groups.setdefault(classify(col, base=True), []).append(col)

    for col in engineered_columns:
        groups.setdefault(classify(col, base=False), []).append(col)

    groups["base_features"] = base_columns
    groups["engineered_features"] = engineered_columns

    return groups

## Feature groups selection

In [13]:
# Plain Python lists of column names: the original features and the full merged table.
base_columns = list(initial_features_df.columns)
columns = list(features_df.columns)

In [14]:
# Map group name -> list of column names for every column of the merged table.
features_description_dict = classify_feature_type(base_columns=base_columns, columns=columns)

In [15]:
# Report how many feature names ended up in each group.
for feature_type in features_description_dict:
    feature_lst = features_description_dict[feature_type]
    print(f'(*) {feature_type}: {len(feature_lst)}')

(*) base_features_intervals_3800_2500: 6
(*) base_features_peaks_3800_2500: 4
(*) base_features_intervals_1900_800: 19
(*) base_features_peaks_1900_800: 11
(*) engineered_features_intervals_3800_2500: 46
(*) engineered_features_peaks_3800_2500: 14
(*) engineered_features_intervals_1900_800: 388
(*) engineered_features_peaks_1900_800: 77
(*) base_features: 40
(*) engineered_features: 525


# Saving data

In [16]:
# Persist the merged base + engineered feature table as a ';'-separated CSV.
features_df.to_csv('./../data/processed_data/ftir_features_long_list.csv', sep=';')

In [17]:
# Store the feature-group mapping as indented UTF-8 JSON; non-ASCII characters
# are written as-is rather than escaped.
with open('./../data/processed_data/features_description.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(features_description_dict, ensure_ascii=False, indent=2))