# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import json

import re

from tqdm import tqdm

from itertools import combinations

In [2]:
# Notebook-wide configuration (single cell).
# The original notebook contained this cell twice; the second copy silently
# overrode display.max_columns (500 -> 100). The values below are the ones that
# were actually in effect after both cells had run.

# NOTE: this silences *every* warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Display limits for wide feature tables; max_colwidth=None disables truncation
# of long cell text.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [4]:
# Results directory for this notebook.
# NOTE(review): no cell visible in this notebook reads this variable (the save
# cells use hard-coded paths) - confirm whether it is still needed.
path_to_save = './../results/03a_features'

# Loading data

In [5]:
# Input CSV locations, given relative to the notebook's working directory.
path_to_samples_description = './../data/parsed_data/samples_description.csv'
path_to_ftir_data = './../data/processed_data/ftir_extracted_features.csv'

## FTIR-features

In [6]:
# Read the ';'-separated FTIR feature table; the first CSV column becomes the row index.
initial_features_df = pd.read_csv(
    filepath_or_buffer=path_to_ftir_data,
    index_col=0,
    sep=';',
)

print(initial_features_df.shape)
initial_features_df.head(n=3)

(75, 40)


Unnamed: 0,$\overline{I}_{3800\text{–}2200}$,$I_{3733}$,$\overline{I}_{3733 \pm 32}$,$I_{3383}$,$\overline{I}_{3383 \pm 32}$,$I_{3298}$,$\overline{I}_{3298 \pm 32}$,$I_{2930}$,$\overline{I}_{2930 \pm 32}$,$I_{2348}$,$\overline{I}_{2348 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1722}$,$\overline{I}_{1722 \pm 32}$,$I_{1635}$,$\overline{I}_{1635 \pm 32}$,$I_{1588}$,$\overline{I}_{1588 \pm 32}$,$I_{1547}$,$\overline{I}_{1547 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1397}$,$\overline{I}_{1397 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1243}$,$\overline{I}_{1243 \pm 32}$,$I_{1187}$,$\overline{I}_{1187 \pm 32}$,$I_{1132}$,$\overline{I}_{1132 \pm 32}$,$I_{1075}$,$\overline{I}_{1075 \pm 32}$,$I_{876}$,$\overline{I}_{876 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2200}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$"
division_1_size_bulk,0.335219,0.050403,0.040558,1.0,0.970571,0.946997,0.941083,0.265624,0.26642,0.179887,0.155429,0.550025,0.550451,0.425727,0.421626,1.0,0.904861,0.81037,0.787646,0.447818,0.476634,0.633387,0.815626,0.780998,0.508124,0.53737,0.528082,0.460874,0.477456,0.633975,0.623632,0.733507,0.719963,0.078995,0.079059,2.442911,5.865267,2.683915,0.815626,2.365726
division_1_size_5,0.364626,0.006416,0.005691,1.0,0.995591,0.909946,0.907866,0.317784,0.311382,0.124856,0.114118,0.494561,0.532219,0.417861,0.413922,1.0,0.909491,0.784825,0.764213,0.404502,0.433921,0.568436,0.739353,0.709564,0.429493,0.497679,0.493277,0.468835,0.482559,0.606951,0.596917,0.633977,0.564463,0.077024,0.079152,2.359002,5.553983,2.607188,0.739353,2.207442
division_1_size_3,0.34887,0.041669,0.03063,1.0,0.981962,0.90354,0.901479,0.430626,0.406201,0.254556,0.218397,0.555338,0.562808,0.382219,0.388092,1.0,0.918714,0.858514,0.836568,0.537376,0.556364,0.577782,0.744396,0.717214,0.538545,0.469872,0.469877,0.550605,0.568517,0.843303,0.820172,0.875649,0.774647,0.138402,0.140829,2.630391,6.261932,2.778108,0.744396,2.739428


## Samples description

In [7]:
# Read the ';'-separated sample description table; the first CSV column becomes
# the row index. The path variable is passed directly (it was previously wrapped
# in a redundant f-string), matching the FTIR-features load above.
# NOTE(review): no later cell visible in this notebook uses this frame - confirm
# whether the load is still required here.
samples_description = pd.read_csv(path_to_samples_description, sep=';', index_col=0)

print(samples_description.shape)
samples_description.head(3)

(75, 6)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,1,1,bulk,$> 0$,$d > 5$,2
division_1_size_5,2,1,5,$< 5$,$2 < d \leq 5$,1
division_1_size_3,3,1,3,$< 3$,$2 < d \leq 5$,1


# Feature engineering

## Function for processing

In [8]:
def generate_transformed_features(data, feature_list, tolerance=50, zero_substitute=1e-8):
    """Derive log, reciprocal and pairwise-ratio features from the given columns.

    For every column in ``feature_list`` two columns are produced: the natural
    logarithm and the power ``-1``. For every unordered pair of columns a ratio
    ``f1 / f2`` is produced unless the pair is filtered out (see below). New
    column names are LaTeX strings built from the source column names.

    Pair filtering:
      * The first integer found in a column name is used as its position.
      * A pair is skipped as a near-duplicate when both positions are known,
        they differ by at most ``tolerance``, and at least one of the two names
        is an averaged quantity (contains ``\\overline{I}`` or ``±``).
      * A pair is skipped when the two positions fall into different regions
        (2200-3800 -> "high", 800-1900 -> "low", anything else -> no region).
        Two names without a region are considered to be in the same group.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; must contain every column named in ``feature_list``.
    feature_list : iterable of str
        Names of the columns to transform. Any iterable is accepted (it is
        materialised once, so generators work as well).
    tolerance : int, default 50
        Maximum position difference for the near-duplicate check.
    zero_substitute : float, default 1e-8
        Value that replaces exact zeros before log / reciprocal / division.

    Returns
    -------
    pandas.DataFrame
        Frame with the index of ``data`` and one column per generated feature,
        in generation order (log/power per feature first, then ratios).
        Infinite results are replaced by NaN.
    """
    # Materialise once: the list is iterated twice (single features, then pairs).
    features = list(feature_list)

    def clean(col):
        # Drop one pair of enclosing '$' so the name can be nested in a new formula.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def wrap_log(col):
        return fr"$\log {clean(col)}$"

    def wrap_pow(col, power):
        return fr"$({clean(col)})^{{{power}}}$"

    def wrap_div(col1, col2):
        return fr"$\frac{{{clean(col1)}}}{{{clean(col2)}}}$"

    def extract_center(name):
        # First integer in the name; for range-style names this is the first bound.
        nums = re.findall(r'\d+', name)
        return int(nums[0]) if nums else None

    def get_interval_tag(center):
        if center is None:
            return None
        if 2200 <= center <= 3800:
            return "high"
        elif 800 <= center <= 1900:
            return "low"
        return None

    def is_averaged(name):
        return r'\overline{I}' in name or '±' in name

    # Name-derived metadata is computed once per feature instead of once per pair.
    centers = {name: extract_center(name) for name in features}
    tags = {name: get_interval_tag(centers[name]) for name in features}
    averaged = {name: is_averaged(name) for name in features}

    def is_near_duplicate(f1, f2):
        c1, c2 = centers[f1], centers[f2]
        if c1 is None or c2 is None:
            return False
        return abs(c1 - c2) <= tolerance and (averaged[f1] or averaged[f2])

    # Columns are collected first and the frame is built once at the end, which
    # avoids the fragmentation caused by inserting hundreds of columns one by one.
    new_columns = {}

    # Floating-point warnings are suppressed only inside this block; the
    # process-wide NumPy error settings are left untouched.
    with np.errstate(divide='ignore', invalid='ignore'):
        for col in features:
            base_safe = data[col].replace(0, zero_substitute)

            new_columns[wrap_log(col)] = np.log(base_safe)

            for power in [-1]:
                transformed = np.power(base_safe, power)
                new_columns[wrap_pow(col, power)] = transformed.replace(
                    [np.inf, -np.inf], np.nan
                )

        for f1, f2 in combinations(features, 2):
            if is_near_duplicate(f1, f2):
                continue
            if tags[f1] != tags[f2]:
                continue

            numerator = data[f1].replace(0, zero_substitute)
            denominator = data[f2].replace(0, zero_substitute)
            ratio = numerator / denominator

            new_columns[wrap_div(f1, f2)] = ratio.replace([np.inf, -np.inf], np.nan)

    return pd.DataFrame(new_columns, index=data.index)

## Calculation process

In [9]:
# Generate log / reciprocal / ratio features for every column of the base table.
engineered_features_df = generate_transformed_features(
    initial_features_df,
    initial_features_df.columns,
    tolerance=50,
)

print(engineered_features_df.shape)
engineered_features_df.head(n=3)

(75, 492)


Unnamed: 0,$\log \overline{I}_{3800\text{–}2200}$,$(\overline{I}_{3800\text{–}2200})^{-1}$,$\log I_{3733}$,$(I_{3733})^{-1}$,$\log \overline{I}_{3733 \pm 32}$,$(\overline{I}_{3733 \pm 32})^{-1}$,$\log I_{3383}$,$(I_{3383})^{-1}$,$\log \overline{I}_{3383 \pm 32}$,$(\overline{I}_{3383 \pm 32})^{-1}$,$\log I_{3298}$,$(I_{3298})^{-1}$,$\log \overline{I}_{3298 \pm 32}$,$(\overline{I}_{3298 \pm 32})^{-1}$,$\log I_{2930}$,$(I_{2930})^{-1}$,$\log \overline{I}_{2930 \pm 32}$,$(\overline{I}_{2930 \pm 32})^{-1}$,$\log I_{2348}$,$(I_{2348})^{-1}$,$\log \overline{I}_{2348 \pm 32}$,$(\overline{I}_{2348 \pm 32})^{-1}$,$\log \overline{I}_{1800\text{–}900}$,$(\overline{I}_{1800\text{–}900})^{-1}$,$\log \overline{I}_{1800\text{–}1500}$,$(\overline{I}_{1800\text{–}1500})^{-1}$,$\log I_{1722}$,$(I_{1722})^{-1}$,$\log \overline{I}_{1722 \pm 32}$,$(\overline{I}_{1722 \pm 32})^{-1}$,$\log I_{1635}$,$(I_{1635})^{-1}$,$\log \overline{I}_{1635 \pm 32}$,$(\overline{I}_{1635 \pm 32})^{-1}$,$\log I_{1588}$,$(I_{1588})^{-1}$,$\log \overline{I}_{1588 \pm 32}$,$(\overline{I}_{1588 \pm 32})^{-1}$,$\log I_{1547}$,$(I_{1547})^{-1}$,$\log \overline{I}_{1547 \pm 32}$,$(\overline{I}_{1547 \pm 32})^{-1}$,$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,$\log I_{1397}$,$(I_{1397})^{-1}$,$\log \overline{I}_{1397 \pm 32}$,$(\overline{I}_{1397 \pm 32})^{-1}$,$\log \overline{I}_{1300\text{–}900}$,$(\overline{I}_{1300\text{–}900})^{-1}$,...,$\frac{\overline{I}_{1187 \pm 32}}{I_{1075}}$,$\frac{\overline{I}_{1187 \pm 32}}{\overline{I}_{1075 \pm 32}}$,$\frac{\overline{I}_{1187 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1187 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 
1300\text{–}900}}$",$\frac{I_{1132}}{I_{1075}}$,$\frac{I_{1132}}{\overline{I}_{1075 \pm 32}}$,$\frac{I_{1132}}{I_{876}}$,$\frac{I_{1132}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1132}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1132 \pm 32}}{I_{1075}}$,$\frac{\overline{I}_{1132 \pm 32}}{\overline{I}_{1075 \pm 32}}$,$\frac{\overline{I}_{1132 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1132 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1075}}{I_{876}}$,$\frac{I_{1075}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1075}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1075 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1075 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{876 \pm 
32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,-1.092972,2.983128,-2.987704,19.84008,-3.205016,24.655887,0.0,1.0,-0.029871,1.030321,-0.05446,1.05597,-0.060724,1.062605,-1.325672,3.764716,-1.322682,3.753476,-1.715425,5.559038,-1.861564,6.433793,-0.597792,1.8181,-0.597017,1.816691,-0.853956,2.348921,-0.863636,2.371769,0.0,1.0,-0.099974,1.105142,-0.210264,1.234004,-0.238706,1.269605,-0.803369,2.233052,-0.741007,2.098047,-0.456674,1.578815,-0.2038,1.226053,-0.247183,1.280413,-0.677029,1.968022,...,0.650922,0.663168,6.044168,6.039273,0.081404,0.177895,0.585386,0.201822,0.864306,0.880566,8.025558,8.019058,0.10809,0.236213,0.777287,0.267983,0.850205,0.8662,7.894626,7.888233,0.106326,0.232359,0.764606,0.263611,9.285551,9.278031,0.12506,0.273298,0.899319,0.310056,9.114089,9.106708,0.12275,0.268251,0.882712,0.304331,0.013468,0.029433,0.096851,0.033391,0.013479,0.029456,0.09693,0.033418,2.18534,7.191126,2.479267,3.290621,1.134499,0.344768
division_1_size_5,-1.008882,2.742533,-5.048926,155.855059,-5.168913,175.723662,0.0,1.0,-0.004419,1.004429,-0.09437,1.098966,-0.096659,1.101485,-1.146384,3.146794,-1.166735,3.211491,-2.080596,8.009243,-2.170521,8.762848,-0.704084,2.021994,-0.630701,1.878927,-0.872606,2.393139,-0.882077,2.415913,0.0,1.0,-0.09487,1.099516,-0.242294,1.274169,-0.268909,1.308536,-0.9051,2.472178,-0.834893,2.304568,-0.564867,1.759214,-0.30198,1.352534,-0.343105,1.409316,-0.845149,2.328326,...,0.761161,0.854898,6.265065,6.096591,0.086885,0.185088,0.652677,0.218605,0.957371,1.075271,7.880055,7.668152,0.109282,0.232799,0.820922,0.274957,0.941544,1.057495,7.749785,7.541386,0.107476,0.228951,0.807351,0.270411,8.230929,8.009591,0.114148,0.243165,0.857476,0.2872,7.328435,7.131367,0.101632,0.216503,0.763456,0.255709,0.013868,0.029543,0.104177,0.034893,0.014251,0.030359,0.107056,0.035857,2.130258,7.511952,2.516027,3.526311,1.18109,0.334936
division_1_size_3,-1.053056,2.866398,-3.178008,23.998906,-3.485783,32.647978,0.0,1.0,-0.018202,1.018369,-0.101435,1.106758,-0.103718,1.109288,-0.842515,2.3222,-0.900907,2.461835,-1.368235,3.92841,-1.521442,4.578823,-0.588179,1.800706,-0.574817,1.776806,-0.961762,2.616304,-0.946514,2.576711,0.0,1.0,-0.084781,1.088478,-0.152553,1.164804,-0.178447,1.19536,-0.621058,1.860896,-0.586332,1.797384,-0.548558,1.730755,-0.295182,1.343371,-0.332381,1.394284,-0.618885,1.856856,...,0.649252,0.733905,4.107714,4.036941,0.090789,0.204642,0.763729,0.207531,0.96306,1.088629,6.093131,5.988151,0.134671,0.303553,1.132869,0.307839,0.936645,1.05877,5.926006,5.823905,0.130977,0.295227,1.101796,0.299395,6.326843,6.217837,0.139837,0.315196,1.176322,0.319647,5.597068,5.500635,0.123707,0.27884,1.040638,0.282777,0.022102,0.049819,0.185926,0.050522,0.02249,0.050692,0.189185,0.051408,2.254028,8.412099,2.285854,3.73203,1.01412,0.271734


*NaN-values analysis*

In [10]:
# Per-column NaN counts; only columns that contain at least one NaN are reported.
nan_df = engineered_features_df.isna().sum()
nan_summary = nan_df.loc[nan_df.gt(0)]
print(nan_summary)

Series([], dtype: int64)


## Merging process

In [11]:
# Combine base and engineered features side by side, matching rows on the index.
features_df = initial_features_df.merge(
    engineered_features_df,
    how='inner',
    left_index=True,
    right_index=True,
)

print(features_df.shape)
features_df.head(n=3)

(75, 532)


Unnamed: 0,$\overline{I}_{3800\text{–}2200}$,$I_{3733}$,$\overline{I}_{3733 \pm 32}$,$I_{3383}$,$\overline{I}_{3383 \pm 32}$,$I_{3298}$,$\overline{I}_{3298 \pm 32}$,$I_{2930}$,$\overline{I}_{2930 \pm 32}$,$I_{2348}$,$\overline{I}_{2348 \pm 32}$,$\overline{I}_{1800\text{–}900}$,$\overline{I}_{1800\text{–}1500}$,$I_{1722}$,$\overline{I}_{1722 \pm 32}$,$I_{1635}$,$\overline{I}_{1635 \pm 32}$,$I_{1588}$,$\overline{I}_{1588 \pm 32}$,$I_{1547}$,$\overline{I}_{1547 \pm 32}$,$\overline{I}_{1500\text{–}1300}$,$I_{1397}$,$\overline{I}_{1397 \pm 32}$,$\overline{I}_{1300\text{–}900}$,$I_{1243}$,$\overline{I}_{1243 \pm 32}$,$I_{1187}$,$\overline{I}_{1187 \pm 32}$,$I_{1132}$,$\overline{I}_{1132 \pm 32}$,$I_{1075}$,$\overline{I}_{1075 \pm 32}$,$I_{876}$,$\overline{I}_{876 \pm 32}$,"$\Sigma I_{p,\ 3800\text{–}2200}$","$\Sigma I_{p,\ 1800\text{–}900}$","$\Sigma I_{p,\ 1800\text{–}1500}$","$\Sigma I_{p,\ 1500\text{–}1300}$","$\Sigma I_{p,\ 1300\text{–}900}$",$\log \overline{I}_{3800\text{–}2200}$,$(\overline{I}_{3800\text{–}2200})^{-1}$,$\log I_{3733}$,$(I_{3733})^{-1}$,$\log \overline{I}_{3733 \pm 32}$,$(\overline{I}_{3733 \pm 32})^{-1}$,$\log I_{3383}$,$(I_{3383})^{-1}$,$\log \overline{I}_{3383 \pm 32}$,$(\overline{I}_{3383 \pm 32})^{-1}$,...,$\frac{\overline{I}_{1187 \pm 32}}{I_{1075}}$,$\frac{\overline{I}_{1187 \pm 32}}{\overline{I}_{1075 \pm 32}}$,$\frac{\overline{I}_{1187 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1187 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1187 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1132}}{I_{1075}}$,$\frac{I_{1132}}{\overline{I}_{1075 \pm 32}}$,$\frac{I_{1132}}{I_{876}}$,$\frac{I_{1132}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1132}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 
1800\text{–}1500}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1132}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1132 \pm 32}}{I_{1075}}$,$\frac{\overline{I}_{1132 \pm 32}}{\overline{I}_{1075 \pm 32}}$,$\frac{\overline{I}_{1132 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1132 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1132 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{I_{1075}}{I_{876}}$,$\frac{I_{1075}}{\overline{I}_{876 \pm 32}}$,"$\frac{I_{1075}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{1075}}{\Sigma I_{p,\ 1300\text{–}900}}$",$\frac{\overline{I}_{1075 \pm 32}}{I_{876}}$,$\frac{\overline{I}_{1075 \pm 32}}{\overline{I}_{876 \pm 32}}$,"$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{1075 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{I_{876}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}900}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\overline{I}_{876 \pm 32}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1800\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}900}}{\Sigma I_{p,\ 
1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1800\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}900}}$","$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}900}}$"
division_1_size_bulk,0.335219,0.050403,0.040558,1.0,0.970571,0.946997,0.941083,0.265624,0.26642,0.179887,0.155429,0.550025,0.550451,0.425727,0.421626,1.0,0.904861,0.81037,0.787646,0.447818,0.476634,0.633387,0.815626,0.780998,0.508124,0.53737,0.528082,0.460874,0.477456,0.633975,0.623632,0.733507,0.719963,0.078995,0.079059,2.442911,5.865267,2.683915,0.815626,2.365726,-1.092972,2.983128,-2.987704,19.84008,-3.205016,24.655887,0.0,1.0,-0.029871,1.030321,...,0.650922,0.663168,6.044168,6.039273,0.081404,0.177895,0.585386,0.201822,0.864306,0.880566,8.025558,8.019058,0.10809,0.236213,0.777287,0.267983,0.850205,0.8662,7.894626,7.888233,0.106326,0.232359,0.764606,0.263611,9.285551,9.278031,0.12506,0.273298,0.899319,0.310056,9.114089,9.106708,0.12275,0.268251,0.882712,0.304331,0.013468,0.029433,0.096851,0.033391,0.013479,0.029456,0.09693,0.033418,2.18534,7.191126,2.479267,3.290621,1.134499,0.344768
division_1_size_5,0.364626,0.006416,0.005691,1.0,0.995591,0.909946,0.907866,0.317784,0.311382,0.124856,0.114118,0.494561,0.532219,0.417861,0.413922,1.0,0.909491,0.784825,0.764213,0.404502,0.433921,0.568436,0.739353,0.709564,0.429493,0.497679,0.493277,0.468835,0.482559,0.606951,0.596917,0.633977,0.564463,0.077024,0.079152,2.359002,5.553983,2.607188,0.739353,2.207442,-1.008882,2.742533,-5.048926,155.855059,-5.168913,175.723662,0.0,1.0,-0.004419,1.004429,...,0.761161,0.854898,6.265065,6.096591,0.086885,0.185088,0.652677,0.218605,0.957371,1.075271,7.880055,7.668152,0.109282,0.232799,0.820922,0.274957,0.941544,1.057495,7.749785,7.541386,0.107476,0.228951,0.807351,0.270411,8.230929,8.009591,0.114148,0.243165,0.857476,0.2872,7.328435,7.131367,0.101632,0.216503,0.763456,0.255709,0.013868,0.029543,0.104177,0.034893,0.014251,0.030359,0.107056,0.035857,2.130258,7.511952,2.516027,3.526311,1.18109,0.334936
division_1_size_3,0.34887,0.041669,0.03063,1.0,0.981962,0.90354,0.901479,0.430626,0.406201,0.254556,0.218397,0.555338,0.562808,0.382219,0.388092,1.0,0.918714,0.858514,0.836568,0.537376,0.556364,0.577782,0.744396,0.717214,0.538545,0.469872,0.469877,0.550605,0.568517,0.843303,0.820172,0.875649,0.774647,0.138402,0.140829,2.630391,6.261932,2.778108,0.744396,2.739428,-1.053056,2.866398,-3.178008,23.998906,-3.485783,32.647978,0.0,1.0,-0.018202,1.018369,...,0.649252,0.733905,4.107714,4.036941,0.090789,0.204642,0.763729,0.207531,0.96306,1.088629,6.093131,5.988151,0.134671,0.303553,1.132869,0.307839,0.936645,1.05877,5.926006,5.823905,0.130977,0.295227,1.101796,0.299395,6.326843,6.217837,0.139837,0.315196,1.176322,0.319647,5.597068,5.500635,0.123707,0.27884,1.040638,0.282777,0.022102,0.049819,0.185926,0.050522,0.02249,0.050692,0.189185,0.051408,2.254028,8.412099,2.285854,3.73203,1.01412,0.271734


# Creating features description

## Function for processing

In [12]:
def classify_feature_type(base_columns, columns):
    """Group feature names by origin, source type and spectral region.

    Every name is assigned to a group called ``{kind}_features_{source}_{region}``:

    * ``kind``   - ``"base"`` for names in ``base_columns``, ``"engineered"``
      for names that appear only in ``columns``.
    * ``source`` - ``"intervals"`` when the name contains ``\\overline{I}_{`` or
      ``\\Sigma I_{p,``; otherwise ``"peaks"``.
    * ``region`` - ``"3800_2200"`` when any 3-4 digit number in the name lies in
      2200..3800, else ``"1900_800"`` when any lies in 800..1900, else
      ``"unknown"``.

    Names containing ``"Unnamed"`` are ignored.

    Parameters
    ----------
    base_columns : iterable of str
        Names of the original (non-engineered) features.
    columns : iterable of str
        Names of all features (base and engineered).

    Returns
    -------
    dict[str, list[str]]
        Mapping from group name to the feature names in it, in input order,
        plus two summary entries: ``"base_features"`` and
        ``"engineered_features"``.
    """
    base_columns = [col for col in base_columns if "Unnamed" not in col]
    columns = [col for col in columns if "Unnamed" not in col]

    # Set lookup instead of scanning the base list for every column.
    base_lookup = set(base_columns)
    engineered_columns = [col for col in columns if col not in base_lookup]

    def clean(col):
        # Drop one pair of enclosing '$' and normalise the sigma symbol.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def is_interval(cleaned):
        return r"\overline{I}_{" in cleaned or r"\Sigma I_{p," in cleaned

    def extract_numbers(cleaned):
        return [int(token) for token in re.findall(r"\d{3,4}", cleaned)]

    def classify(col, base=True):
        cleaned = clean(col)
        numbers = extract_numbers(cleaned)
        source = "intervals" if is_interval(cleaned) else "peaks"

        if any(2200 <= num <= 3800 for num in numbers):
            region = "3800_2200"
        elif any(800 <= num <= 1900 for num in numbers):
            region = "1900_800"
        else:
            region = "unknown"

        kind = "base" if base else "engineered"
        return f"{kind}_features_{source}_{region}"

    groups = {}

    for col in base_columns:
        groups.setdefault(classify(col, base=True), []).append(col)

    for col in engineered_columns:
        groups.setdefault(classify(col, base=False), []).append(col)

    groups["base_features"] = base_columns
    groups["engineered_features"] = engineered_columns

    return groups

## Feature groups selection

In [13]:
# Column names of the base table and of the merged (base + engineered) table.
base_columns, columns = (
    frame.columns.tolist() for frame in (initial_features_df, features_df)
)

In [14]:
# Map group name -> list of feature names (base vs. engineered, split by
# source type and spectral region, plus two summary groups).
features_description_dict = classify_feature_type(base_columns=base_columns, columns=columns)

In [15]:
# Report how many features ended up in each group.
for feature_type, feature_lst in features_description_dict.items():
    print(f'(*) {feature_type}: {len(feature_lst)}')

(*) base_features_intervals_3800_2200: 7
(*) base_features_peaks_3800_2200: 5
(*) base_features_intervals_1900_800: 18
(*) base_features_peaks_1900_800: 10
(*) engineered_features_intervals_3800_2200: 64
(*) engineered_features_peaks_3800_2200: 20
(*) engineered_features_intervals_1900_800: 343
(*) engineered_features_peaks_1900_800: 65
(*) base_features: 40
(*) engineered_features: 492


# Saving data

In [16]:
# Write the merged table (base + engineered features) as a ';'-separated CSV.
# NOTE(review): the destination is hard-coded here and does not use
# path_to_save - confirm this is intended.
features_df.to_csv('./../data/processed_data/ftir_features_long_list.csv', sep=';')

In [17]:
# Write the group -> feature-names mapping as JSON. ensure_ascii=False keeps
# non-ASCII characters in the feature names as-is instead of \u escapes.
with open('./../data/processed_data/features_description.json', 'w', encoding='utf-8') as f:
    json.dump(features_description_dict, f, ensure_ascii=False, indent=2)