# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import json

import re

from tqdm import tqdm

from itertools import combinations

In [2]:
# Notebook-wide configuration, kept in a single cell so every option is set exactly once.

# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and performance
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# pandas display limits used when frames are rendered below.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [4]:
# Presumably the output directory for this notebook's results.
# NOTE(review): no later cell visible here reads this variable — confirm it is still needed.
path_to_save = './../results/03a_features'

# Loading data

In [5]:
# Relative locations of the two input CSV files read in the cells below.
path_to_samples_description = './../data/parsed_data/samples_description.csv'
path_to_ftir_data = './../data/processed_data/ftir_extracted_features.csv'

## FTIR-features

In [6]:
# Read the FTIR feature table; the file is ';'-separated and its first
# column is used as the row index.
initial_features_df = pd.read_csv(
    filepath_or_buffer=path_to_ftir_data,
    index_col=0,
    sep=';',
)

print(initial_features_df.shape)
initial_features_df.head(3)

(75, 32)


Unnamed: 0,$\overline{I}_{3800\text{–}2500}$,$I_{3375}$,$\overline{I}_{3375 \pm 32}$,$I_{3293}$,$\overline{I}_{3293 \pm 32}$,$I_{2928}$,$\overline{I}_{2928 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1722}$,$\overline{I}_{1722 \pm 32}$,$I_{1626}$,$\overline{I}_{1626 \pm 32}$,$I_{1550}$,$\overline{I}_{1550 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1396}$,$\overline{I}_{1396 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1240}$,$\overline{I}_{1240 \pm 32}$,$I_{1155}$,$\overline{I}_{1155 \pm 32}$,$I_{1074}$,$\overline{I}_{1074 \pm 32}$,$I_{877}$,$\overline{I}_{877 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2500}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$"
division_1_size_bulk,0.526201,0.981207,0.97598,1.0,0.963685,0.49172,0.492079,0.504366,0.537705,0.42653,0.422144,1.0,0.925096,0.421988,0.453781,0.571735,0.760582,0.726142,0.44578,0.452765,0.443662,0.472792,0.479528,0.681294,0.665771,0.048779,0.049258,2.472926,4.21595,1.848518,0.760582,1.606851
division_1_size_5,0.477909,1.0,0.995136,0.92509,0.923226,0.406937,0.401135,0.451487,0.497036,0.387542,0.383206,1.0,0.922786,0.357085,0.390761,0.502368,0.687458,0.65602,0.391974,0.426756,0.423141,0.497397,0.498701,0.606138,0.536587,0.093475,0.095869,2.332027,3.962376,1.744626,0.687458,1.530291
division_1_size_3,0.535789,1.0,0.985816,0.937538,0.935769,0.618445,0.59893,0.517003,0.549005,0.375077,0.381174,1.0,0.938225,0.522055,0.542602,0.523513,0.696553,0.668677,0.489773,0.400246,0.402681,0.683292,0.676252,0.836176,0.731684,0.117973,0.120394,2.555983,4.513399,1.897132,0.696553,1.919714


## Samples description

In [7]:
# Read the samples description table; the file is ';'-separated and its
# first column is used as the row index.
samples_description = pd.read_csv(
    path_to_samples_description,
    index_col=0,
    sep=';',
)

print(samples_description.shape)
samples_description.head(3)

(75, 6)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,1,1,bulk,$> 0$,$d > 5$,2
division_1_size_5,2,1,5,$< 5$,$2 < d \leq 5$,1
division_1_size_3,3,1,3,$< 3$,$2 < d \leq 5$,1


# Feature engineering

## Function for processing

In [8]:
def generate_transformed_features(data, feature_list, tolerance=50, zero_substitute=1e-8):
    """Derive log, reciprocal and pairwise-ratio features from the given columns.

    For every column in ``feature_list`` two features are produced: the natural
    logarithm and the power ``-1``. For every unordered pair of columns a ratio
    ``f1 / f2`` is produced unless the pair is filtered out (see below). Exact
    zeros are replaced by ``zero_substitute`` before any transformation, and
    infinite results of the power/ratio steps are replaced by NaN.

    A pair is skipped when

    * both names contain a number, those numbers differ by at most
      ``tolerance`` and at least one of the two names is an averaged feature
      (its name contains ``\\overline{I}`` or ``±``), or
    * the two columns fall into different spectral regions, where the region
      is decided from the first integer found in the name
      (2500–3800 -> "high", 800–1900 -> "low", anything else -> no region).

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; must contain every column named in ``feature_list``.
    feature_list : iterable of str
        Column names to transform. Names are expected to be LaTeX-like labels;
        a surrounding ``$...$`` pair is stripped when building new labels.
    tolerance : int, default 50
        Maximum distance between the first numbers of two names for the pair
        to be treated as a near-duplicate.
    zero_substitute : float, default 1e-8
        Value that replaces exact zeros before log / power / division.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``data`` whose columns are the generated
        features, in generation order (log and power per column first, then
        the ratios). ``data`` itself is not modified.
    """
    def clean(col):
        # Drop the surrounding '$' so the label can be embedded in a new formula.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def wrap_log(col):
        return fr"$\log {clean(col)}$"

    def wrap_pow(col, power):
        return fr"$({clean(col)})^{{{power}}}$"

    def wrap_div(col1, col2):
        return fr"$\frac{{{clean(col1)}}}{{{clean(col2)}}}$"

    def extract_center(name):
        # The first integer in the label is used as the feature's position.
        nums = re.findall(r'\d+', name)
        return int(nums[0]) if nums else None

    def get_interval_tag(center):
        if center is None:
            return None
        if 2500 <= center <= 3800:
            return "high"
        elif 800 <= center <= 1900:
            return "low"
        return None

    def extract_center_and_type(name):
        center = extract_center(name)
        is_avg = r'\overline{I}' in name or '±' in name
        return center, is_avg

    def is_near_duplicate(f1, f2):
        c1, avg1 = extract_center_and_type(f1)
        c2, avg2 = extract_center_and_type(f2)
        if c1 is None or c2 is None:
            return False
        return abs(c1 - c2) <= tolerance and (avg1 or avg2)

    # Materialise once: the list is traversed twice (single-column transforms,
    # then pair combinations), which would exhaust a one-shot iterable.
    feature_list = list(feature_list)

    # Collect columns in a dict and build the frame in one go; inserting
    # hundreds of columns one by one fragments the DataFrame.
    new_columns = {}

    # Scope the NumPy error-state change to this computation instead of
    # altering the process-wide setting with np.seterr.
    with np.errstate(divide='ignore', invalid='ignore'):
        for col in feature_list:
            base_safe = data[col].replace(0, zero_substitute)

            new_columns[wrap_log(col)] = np.log(base_safe)

            for power in [-1]:
                transformed = np.power(base_safe, power).replace([np.inf, -np.inf], np.nan)
                new_columns[wrap_pow(col, power)] = transformed

        for f1, f2 in combinations(feature_list, 2):
            if is_near_duplicate(f1, f2):
                continue
            # Ratios are only built inside one spectral region.
            if get_interval_tag(extract_center(f1)) != get_interval_tag(extract_center(f2)):
                continue

            numerator = data[f1].replace(0, zero_substitute)
            denominator = data[f2].replace(0, zero_substitute)
            ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan)

            new_columns[wrap_div(f1, f2)] = ratio

    return pd.DataFrame(new_columns, index=data.index)

## Calculation process

In [9]:
# Apply the feature generator to every column of the initial feature table.
engineered_features_df = generate_transformed_features(
    data=initial_features_df,
    feature_list=initial_features_df.columns,
    tolerance=50,
)

print(engineered_features_df.shape)
engineered_features_df.head(3)

(75, 346)


Unnamed: 0,$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,$\log I_{3375}$,$(I_{3375})^{-1}$,$\log \overline{I}_{3375 \pm 32}$,$(\overline{I}_{3375 \pm 32})^{-1}$,$\log I_{3293}$,$(I_{3293})^{-1}$,$\log \overline{I}_{3293 \pm 32}$,$(\overline{I}_{3293 \pm 32})^{-1}$,$\log I_{2928}$,$(I_{2928})^{-1}$,$\log \overline{I}_{2928 \pm 32}$,$(\overline{I}_{2928 \pm 32})^{-1}$,$\log \overline{I}_{1800\text{–}900}$,$(\overline{I}_{1800\text{–}900})^{-1}$,$\log \overline{I}_{1800\text{–}1500}$,$(\overline{I}_{1800\text{–}1500})^{-1}$,$\log I_{1722}$,$(I_{1722})^{-1}$,$\log \overline{I}_{1722 \pm 32}$,$(\overline{I}_{1722 \pm 32})^{-1}$,$\log I_{1626}$,$(I_{1626})^{-1}$,$\log \overline{I}_{1626 \pm 32}$,$(\overline{I}_{1626 \pm 32})^{-1}$,$\log I_{1550}$,$(I_{1550})^{-1}$,$\log \overline{I}_{1550 \pm 32}$,$(\overline{I}_{1550 \pm 32})^{-1}$,$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,$\log I_{1396}$,$(I_{1396})^{-1}$,$\log \overline{I}_{1396 \pm 32}$,$(\overline{I}_{1396 \pm 32})^{-1}$,$\log \overline{I}_{1300\text{–}900}$,$(\overline{I}_{1300\text{–}900})^{-1}$,$\log I_{1240}$,$(I_{1240})^{-1}$,$\log \overline{I}_{1240 \pm 32}$,$(\overline{I}_{1240 \pm 32})^{-1}$,$\log I_{1155}$,$(I_{1155})^{-1}$,$\log \overline{I}_{1155 \pm 32}$,$(\overline{I}_{1155 \pm 32})^{-1}$,$\log I_{1074}$,$(I_{1074})^{-1}$,$\log \overline{I}_{1074 \pm 32}$,$(\overline{I}_{1074 \pm 32})^{-1}$,...,$\frac{\overline{I}_{1240 \pm 32}}{I_{1074}}$,$\frac{\overline{I}_{1240 \pm 32}}{\overline{I}_{1074 \pm 32}}$,$\frac{\overline{I}_{1240 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1240 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 
1300\text{–}900}}$",$\frac{I_{1155}}{I_{1074}}$,$\frac{I_{1155}}{\overline{I}_{1074 \pm 32}}$,$\frac{I_{1155}}{I_{877}}$,$\frac{I_{1155}}{\overline{I}_{877 \pm 32}}$,"$\frac{I_{1155}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1155 \pm 32}}{I_{1074}}$,$\frac{\overline{I}_{1155 \pm 32}}{\overline{I}_{1074 \pm 32}}$,$\frac{\overline{I}_{1155 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1155 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1074}}{I_{877}}$,$\frac{I_{1074}}{\overline{I}_{877 \pm 32}}$,"$\frac{I_{1074}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1074 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1074 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{877 \pm 
32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,-0.642072,1.900414,-0.018972,1.019153,-0.024313,1.024611,0.0,1.0,-0.036991,1.037683,-0.709847,2.033679,-0.709117,2.032196,-0.684452,1.982686,-0.620446,1.859757,-0.852073,2.344501,-0.862408,2.368859,0.0,1.0,-0.077857,1.080968,-0.862779,2.369737,-0.790141,2.203707,-0.55908,1.749062,-0.273672,1.314783,-0.320009,1.377141,-0.80793,2.24326,-0.792382,2.208651,-0.812693,2.25397,-0.7491,2.115096,-0.734954,2.085386,-0.383761,1.467794,-0.40681,1.502018,...,0.651204,0.666388,9.095343,9.006882,0.105234,0.240009,0.583319,0.276106,0.693961,0.710142,9.692527,9.598258,0.112144,0.255768,0.621619,0.294235,0.703848,0.720259,9.830617,9.735005,0.113741,0.259412,0.630475,0.298427,13.966961,13.831119,0.161599,0.368562,0.895754,0.423993,13.648724,13.515977,0.157917,0.360165,0.875345,0.414333,0.01157,0.026388,0.064134,0.030357,0.011684,0.026647,0.064764,0.030655,2.280719,5.543061,2.623735,2.430401,1.150398,0.473337
division_1_size_5,-0.738335,2.092448,0.0,1.0,-0.004876,1.004888,-0.077864,1.080976,-0.079881,1.083158,-0.899098,2.457385,-0.913456,2.492924,-0.795208,2.214902,-0.699093,2.011927,-0.947932,2.580368,-0.959183,2.609562,0.0,1.0,-0.080358,1.083675,-1.029782,2.800456,-0.939659,2.559109,-0.688422,1.990572,-0.374754,1.454634,-0.421564,1.524343,-0.936559,2.551188,-0.851543,2.343259,-0.860049,2.363276,-0.698366,2.010466,-0.695749,2.00521,-0.500648,1.64979,-0.622526,1.86363,...,0.698094,0.788579,4.526795,4.41373,0.10679,0.24254,0.615516,0.27651,0.820601,0.926965,5.321188,5.188282,0.12553,0.285102,0.723531,0.325034,0.822752,0.929394,5.335134,5.201879,0.125859,0.28585,0.725427,0.325886,6.484502,6.32254,0.152973,0.347431,0.881708,0.396093,5.740444,5.597066,0.135421,0.307566,0.780538,0.350644,0.023591,0.053579,0.135972,0.061083,0.024195,0.054951,0.139455,0.062648,2.271189,5.763805,2.589295,2.537792,1.140062,0.449234
division_1_size_3,-0.624015,1.866407,0.0,1.0,-0.014285,1.014388,-0.064498,1.066623,-0.066386,1.06864,-0.480547,1.616959,-0.512611,1.669645,-0.659707,1.934225,-0.599647,1.821476,-0.980625,2.666122,-0.964499,2.623473,0.0,1.0,-0.063766,1.065843,-0.649981,1.915505,-0.611378,1.84297,-0.647194,1.910173,-0.361611,1.43564,-0.402454,1.495491,-0.713813,2.041762,-0.915676,2.498463,-0.90961,2.483353,-0.380833,1.463503,-0.391189,1.478739,-0.178917,1.195921,-0.312406,1.36671,...,0.481575,0.550349,3.413345,3.344686,0.089219,0.212258,0.578105,0.209761,0.817163,0.933862,5.791953,5.675449,0.151392,0.360171,0.980961,0.355934,0.808744,0.92424,5.732277,5.616974,0.149832,0.35646,0.970854,0.352267,7.087876,6.945305,0.185265,0.440758,1.200447,0.435573,6.20215,6.077395,0.162114,0.385679,1.050435,0.381142,0.026138,0.062185,0.169366,0.061453,0.026675,0.063461,0.172843,0.062715,2.379064,6.479617,2.351079,2.723599,0.988237,0.362842


*NaN-values analysis*

In [10]:
# Count missing values per engineered column and show only the columns that have any.
nan_df = engineered_features_df.isna().sum()
nan_summary = nan_df.loc[nan_df.gt(0)]
print(nan_summary)

Series([], dtype: int64)


## Merging process

In [11]:
# Join initial and engineered features on the shared row index
# (inner join: only rows present in both frames are kept).
features_df = initial_features_df.merge(
    engineered_features_df,
    how='inner',
    left_index=True,
    right_index=True,
)

print(features_df.shape)
features_df.head(3)

(75, 378)


Unnamed: 0,$\overline{I}_{3800\text{–}2500}$,$I_{3375}$,$\overline{I}_{3375 \pm 32}$,$I_{3293}$,$\overline{I}_{3293 \pm 32}$,$I_{2928}$,$\overline{I}_{2928 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1722}$,$\overline{I}_{1722 \pm 32}$,$I_{1626}$,$\overline{I}_{1626 \pm 32}$,$I_{1550}$,$\overline{I}_{1550 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1396}$,$\overline{I}_{1396 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1240}$,$\overline{I}_{1240 \pm 32}$,$I_{1155}$,$\overline{I}_{1155 \pm 32}$,$I_{1074}$,$\overline{I}_{1074 \pm 32}$,$I_{877}$,$\overline{I}_{877 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2500}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$",$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,$\log I_{3375}$,$(I_{3375})^{-1}$,$\log \overline{I}_{3375 \pm 32}$,$(\overline{I}_{3375 \pm 32})^{-1}$,$\log I_{3293}$,$(I_{3293})^{-1}$,$\log \overline{I}_{3293 \pm 32}$,$(\overline{I}_{3293 \pm 32})^{-1}$,$\log I_{2928}$,$(I_{2928})^{-1}$,$\log \overline{I}_{2928 \pm 32}$,$(\overline{I}_{2928 \pm 32})^{-1}$,$\log \overline{I}_{1800\text{–}900}$,$(\overline{I}_{1800\text{–}900})^{-1}$,$\log \overline{I}_{1800\text{–}1500}$,$(\overline{I}_{1800\text{–}1500})^{-1}$,...,$\frac{\overline{I}_{1240 \pm 32}}{I_{1074}}$,$\frac{\overline{I}_{1240 \pm 32}}{\overline{I}_{1074 \pm 32}}$,$\frac{\overline{I}_{1240 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1240 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1240 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1155}}{I_{1074}}$,$\frac{I_{1155}}{\overline{I}_{1074 \pm 32}}$,$\frac{I_{1155}}{I_{877}}$,$\frac{I_{1155}}{\overline{I}_{877 \pm 
32}}$,"$\frac{I_{1155}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1155}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1155 \pm 32}}{I_{1074}}$,$\frac{\overline{I}_{1155 \pm 32}}{\overline{I}_{1074 \pm 32}}$,$\frac{\overline{I}_{1155 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1155 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1155 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1074}}{I_{877}}$,$\frac{I_{1074}}{\overline{I}_{877 \pm 32}}$,"$\frac{I_{1074}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1074}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1074 \pm 32}}{I_{877}}$,$\frac{\overline{I}_{1074 \pm 32}}{\overline{I}_{877 \pm 32}}$,"$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1074 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{877}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{877 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma 
I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,0.526201,0.981207,0.97598,1.0,0.963685,0.49172,0.492079,0.504366,0.537705,0.42653,0.422144,1.0,0.925096,0.421988,0.453781,0.571735,0.760582,0.726142,0.44578,0.452765,0.443662,0.472792,0.479528,0.681294,0.665771,0.048779,0.049258,2.472926,4.21595,1.848518,0.760582,1.606851,-0.642072,1.900414,-0.018972,1.019153,-0.024313,1.024611,0.0,1.0,-0.036991,1.037683,-0.709847,2.033679,-0.709117,2.032196,-0.684452,1.982686,-0.620446,1.859757,...,0.651204,0.666388,9.095343,9.006882,0.105234,0.240009,0.583319,0.276106,0.693961,0.710142,9.692527,9.598258,0.112144,0.255768,0.621619,0.294235,0.703848,0.720259,9.830617,9.735005,0.113741,0.259412,0.630475,0.298427,13.966961,13.831119,0.161599,0.368562,0.895754,0.423993,13.648724,13.515977,0.157917,0.360165,0.875345,0.414333,0.01157,0.026388,0.064134,0.030357,0.011684,0.026647,0.064764,0.030655,2.280719,5.543061,2.623735,2.430401,1.150398,0.473337
division_1_size_5,0.477909,1.0,0.995136,0.92509,0.923226,0.406937,0.401135,0.451487,0.497036,0.387542,0.383206,1.0,0.922786,0.357085,0.390761,0.502368,0.687458,0.65602,0.391974,0.426756,0.423141,0.497397,0.498701,0.606138,0.536587,0.093475,0.095869,2.332027,3.962376,1.744626,0.687458,1.530291,-0.738335,2.092448,0.0,1.0,-0.004876,1.004888,-0.077864,1.080976,-0.079881,1.083158,-0.899098,2.457385,-0.913456,2.492924,-0.795208,2.214902,-0.699093,2.011927,...,0.698094,0.788579,4.526795,4.41373,0.10679,0.24254,0.615516,0.27651,0.820601,0.926965,5.321188,5.188282,0.12553,0.285102,0.723531,0.325034,0.822752,0.929394,5.335134,5.201879,0.125859,0.28585,0.725427,0.325886,6.484502,6.32254,0.152973,0.347431,0.881708,0.396093,5.740444,5.597066,0.135421,0.307566,0.780538,0.350644,0.023591,0.053579,0.135972,0.061083,0.024195,0.054951,0.139455,0.062648,2.271189,5.763805,2.589295,2.537792,1.140062,0.449234
division_1_size_3,0.535789,1.0,0.985816,0.937538,0.935769,0.618445,0.59893,0.517003,0.549005,0.375077,0.381174,1.0,0.938225,0.522055,0.542602,0.523513,0.696553,0.668677,0.489773,0.400246,0.402681,0.683292,0.676252,0.836176,0.731684,0.117973,0.120394,2.555983,4.513399,1.897132,0.696553,1.919714,-0.624015,1.866407,0.0,1.0,-0.014285,1.014388,-0.064498,1.066623,-0.066386,1.06864,-0.480547,1.616959,-0.512611,1.669645,-0.659707,1.934225,-0.599647,1.821476,...,0.481575,0.550349,3.413345,3.344686,0.089219,0.212258,0.578105,0.209761,0.817163,0.933862,5.791953,5.675449,0.151392,0.360171,0.980961,0.355934,0.808744,0.92424,5.732277,5.616974,0.149832,0.35646,0.970854,0.352267,7.087876,6.945305,0.185265,0.440758,1.200447,0.435573,6.20215,6.077395,0.162114,0.385679,1.050435,0.381142,0.026138,0.062185,0.169366,0.061453,0.026675,0.063461,0.172843,0.062715,2.379064,6.479617,2.351079,2.723599,0.988237,0.362842


# Creating features description

## Functions for processing

In [12]:
def classify_feature_type(base_columns, columns):
    """Group feature names by origin, source type and spectral region.

    Names containing ``"Unnamed"`` are dropped from both inputs first. Every
    remaining name receives a key ``"{kind}_features_{source}_{region}"``:

    * ``kind`` is ``"base"`` for names in ``base_columns`` and ``"engineered"``
      for names of ``columns`` that are not base names;
    * ``source`` is ``"intervals"`` when the label contains ``\\overline{I}_{``
      or ``\\Sigma I_{p,`` and ``"peaks"`` otherwise;
    * ``region`` is ``"3800_2500"`` when any 3–4 digit number in the label lies
      in 2500–3800, else ``"1900_800"`` when any lies in 800–1900, else
      ``"unknown"``.

    Parameters
    ----------
    base_columns : list of str
        Names of the original (non-engineered) features.
    columns : list of str
        Names of all features, base and engineered.

    Returns
    -------
    dict
        Mapping from group key to the list of names in that group, plus the two
        summary keys ``"base_features"`` and ``"engineered_features"``.
    """
    interval_markers = (r"\overline{I}_{", r"\Sigma I_{p,")
    # Checked in this order; the first matching region wins.
    region_bounds = (("3800_2500", 2500, 3800), ("1900_800", 800, 1900))

    def normalise(name):
        # Strip a surrounding '$...$' pair and spell out a literal sigma.
        stripped = name[1:-1] if name.startswith('$') and name.endswith('$') else name
        return stripped.replace("Σ", r"\Sigma")

    def group_key(name, kind):
        label = normalise(name)
        source = "intervals" if any(marker in label for marker in interval_markers) else "peaks"

        wavenumbers = [int(token) for token in re.findall(r"\d{3,4}", label)]
        region = "unknown"
        for tag, low, high in region_bounds:
            if any(low <= value <= high for value in wavenumbers):
                region = tag
                break

        return f"{kind}_features_{source}_{region}"

    base_kept = [name for name in base_columns if "Unnamed" not in name]
    all_kept = [name for name in columns if "Unnamed" not in name]

    base_lookup = set(base_kept)
    engineered_kept = [name for name in all_kept if name not in base_lookup]

    groups = {}
    for kind, names in (("base", base_kept), ("engineered", engineered_kept)):
        for name in names:
            groups.setdefault(group_key(name, kind), []).append(name)

    groups["base_features"] = base_kept
    groups["engineered_features"] = engineered_kept

    return groups

## Feature groups selection

In [13]:
# Plain Python lists of column names: all features, and the initial (base) ones.
columns = list(features_df.columns)
base_columns = list(initial_features_df.columns)

In [14]:
# Group the column names into base/engineered × peaks/intervals × region buckets.
features_description_dict = classify_feature_type(
    base_columns=base_columns,
    columns=columns,
)

In [15]:
# Report how many features ended up in each group.
for feature_type in features_description_dict:
    feature_lst = features_description_dict[feature_type]
    print(f'(*) {feature_type}: {len(feature_lst)}')

(*) base_features_intervals_3800_2500: 5
(*) base_features_peaks_3800_2500: 3
(*) base_features_intervals_1900_800: 16
(*) base_features_peaks_1900_800: 8
(*) engineered_features_intervals_3800_2500: 31
(*) engineered_features_peaks_3800_2500: 9
(*) engineered_features_intervals_1900_800: 262
(*) engineered_features_peaks_1900_800: 44
(*) base_features: 32
(*) engineered_features: 346


# Saving data

In [16]:
# Write the merged feature table (index included) as a ';'-separated CSV.
features_df.to_csv(
    path_or_buf='./../data/processed_data/ftir_features_long_list.csv',
    sep=';',
)

In [17]:
# Store the feature grouping as UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters in the labels unescaped.
with open('./../data/processed_data/features_description.json', mode='w', encoding='utf-8') as f:
    f.write(json.dumps(features_description_dict, ensure_ascii=False, indent=2))