# Import libraries

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import json

import re

from tqdm import tqdm

from itertools import combinations

In [2]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Raise pandas' display limits so wide feature tables are rendered without truncation.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_rows', 200),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Relative output location for this notebook's results. It is not referenced in the
# cells visible here — presumably consumed by later save steps; TODO confirm the
# directory exists (or is created) before anything is written to it.
path_to_save = './../results/03a_features'

# Loading data

In [4]:
# Both extracted-feature tables live in the same directory.
processed_data_dir = './../data/processed_data'

path_to_ftir_data_3800_2500 = processed_data_dir + '/dataset_3800_2500_extracted.csv'
path_to_ftir_data_1900_800 = processed_data_dir + '/dataset_1900_800_extracted.csv'

## FTIR-features (3800-2500 cm$^{-1}$)

In [5]:
# Semicolon-separated table; the first CSV column is used as the row index.
dataset_3800_2500 = pd.read_csv(
    path_to_ftir_data_3800_2500,
    index_col=0,
    sep=';',
)

print(dataset_3800_2500.shape)
dataset_3800_2500.head(n=3)

(75, 12)


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,0.49172,1.0,0.090605,0.526201,1.582324,0.494443,0.983311,0.077488,Train,$> 0$,$d > 5$,2
division_1_size_5,0.406937,1.0,0.030092,0.477909,1.437029,0.396989,0.980473,0.037942,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,0.618445,1.0,0.08075,0.535789,1.699195,0.581733,0.980981,0.069433,Train,$< 3$,$2 < d \leq 5$,1


## FTIR-features (1900-800 cm$^{-1}$)

In [6]:
# Semicolon-separated table; the first CSV column is used as the row index.
dataset_1900_800 = pd.read_csv(
    path_to_ftir_data_1900_800,
    index_col=0,
    sep=';',
)

print(dataset_1900_800.shape)
dataset_1900_800.head(n=3)

(75, 24)


Unnamed: 0,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,0.048208,0.681294,0.455066,0.760582,1.0,0.423273,0.424162,3.368423,0.426619,1.423273,0.571735,0.760582,0.363338,1.184569,0.047543,0.624379,0.447733,0.739415,0.9395,0.41747,Train,$> 0$,$d > 5$,2
division_1_size_5,0.091875,0.606138,0.427988,0.687458,1.0,0.384475,0.37873,3.197934,0.378004,1.384475,0.502368,0.687458,0.329999,1.126001,0.093349,0.594059,0.424918,0.668322,0.930634,0.378463,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,0.110321,0.836176,0.40095,0.696553,1.0,0.368753,0.436266,3.412752,0.429362,1.368753,0.523513,0.696553,0.406988,1.347446,0.113396,0.817537,0.401435,0.678474,0.953537,0.369746,Train,$< 3$,$2 < d \leq 5$,1


# Feature engineering

## Data wrangling

In [7]:
# Mask that is True for the four non-feature columns; everything else is kept as a feature.
is_meta_column_3800_2500 = dataset_3800_2500.columns.isin(
    ['Split', 'Fraction_hue', 'Fraction_grouped_hue', 'Class']
)
features_3800_2500 = dataset_3800_2500.loc[:, ~is_meta_column_3800_2500]

print(features_3800_2500.shape)
features_3800_2500.head(n=3)

(75, 8)


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$
division_1_size_bulk,0.49172,1.0,0.090605,0.526201,1.582324,0.494443,0.983311,0.077488
division_1_size_5,0.406937,1.0,0.030092,0.477909,1.437029,0.396989,0.980473,0.037942
division_1_size_3,0.618445,1.0,0.08075,0.535789,1.699195,0.581733,0.980981,0.069433


In [8]:
# Mask that is True for the four non-feature columns; everything else is kept as a feature.
is_meta_column_1900_800 = dataset_1900_800.columns.isin(
    ['Split', 'Fraction_hue', 'Fraction_grouped_hue', 'Class']
)
features_1900_800 = dataset_1900_800.loc[:, ~is_meta_column_1900_800]

print(features_1900_800.shape)
features_1900_800.head(n=3)

(75, 20)


Unnamed: 0,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$
division_1_size_bulk,0.048208,0.681294,0.455066,0.760582,1.0,0.423273,0.424162,3.368423,0.426619,1.423273,0.571735,0.760582,0.363338,1.184569,0.047543,0.624379,0.447733,0.739415,0.9395,0.41747
division_1_size_5,0.091875,0.606138,0.427988,0.687458,1.0,0.384475,0.37873,3.197934,0.378004,1.384475,0.502368,0.687458,0.329999,1.126001,0.093349,0.594059,0.424918,0.668322,0.930634,0.378463
division_1_size_3,0.110321,0.836176,0.40095,0.696553,1.0,0.368753,0.436266,3.412752,0.429362,1.368753,0.523513,0.696553,0.406988,1.347446,0.113396,0.817537,0.401435,0.678474,0.953537,0.369746


In [9]:
# Place the two feature tables side by side, aligned on the shared row index.
initial_features_df = pd.concat(
    objs=[features_3800_2500, features_1900_800],
    axis='columns',
)

print(initial_features_df.shape)
initial_features_df.head(n=3)

(75, 28)


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$
division_1_size_bulk,0.49172,1.0,0.090605,0.526201,1.582324,0.494443,0.983311,0.077488,0.048208,0.681294,0.455066,0.760582,1.0,0.423273,0.424162,3.368423,0.426619,1.423273,0.571735,0.760582,0.363338,1.184569,0.047543,0.624379,0.447733,0.739415,0.9395,0.41747
division_1_size_5,0.406937,1.0,0.030092,0.477909,1.437029,0.396989,0.980473,0.037942,0.091875,0.606138,0.427988,0.687458,1.0,0.384475,0.37873,3.197934,0.378004,1.384475,0.502368,0.687458,0.329999,1.126001,0.093349,0.594059,0.424918,0.668322,0.930634,0.378463
division_1_size_3,0.618445,1.0,0.08075,0.535789,1.699195,0.581733,0.980981,0.069433,0.110321,0.836176,0.40095,0.696553,1.0,0.368753,0.436266,3.412752,0.429362,1.368753,0.523513,0.696553,0.406988,1.347446,0.113396,0.817537,0.401435,0.678474,0.953537,0.369746


## Functions for processing

### Spectral region determination

In [10]:
def extract_center(name: str):
    """Return the first run of digits found in ``name`` as an int.

    Returns ``None`` when ``name`` contains no digits at all.
    """
    first_number = re.search(r"\d+", name)
    if first_number is None:
        return None
    return int(first_number.group())

def get_interval_tag(center):
    """Map a numeric center to a spectral-region tag.

    Returns ``"high"`` for values in [2500, 3800], ``"low"`` for values in
    [800, 1900] (both bounds inclusive) and ``None`` for ``None`` or any
    value outside both ranges.
    """
    if center is None:
        return None

    region_bounds = (
        (2500, 3800, "high"),
        (800, 1900, "low"),
    )
    for lower, upper, tag in region_bounds:
        if lower <= center <= upper:
            return tag
    return None

### Features construction

In [11]:
def generate_transformed_features(data, feature_list, tolerance=50, zero_substitute=1e-8):
    """Derive log, reciprocal and pairwise-ratio features from base columns.

    For every column in ``feature_list`` two columns are produced: the natural
    logarithm and the power ``-1``. For every unordered pair ``(f1, f2)`` taken
    in ``feature_list`` order, the ratio ``f1 / f2`` is produced unless the pair
    is filtered out by one of these rules:

    * near-duplicates: the first integers found in both names differ by at most
      ``tolerance`` and at least one of the two is an averaged feature
      (name contains ``\\overline{I}``, ``±`` or ``\\pm``);
    * one feature is an average and the other a sum (``\\Sigma I_{p,``);
    * the two features fall into different region tags (``"high"`` for a first
      integer in [2500, 3800], ``"low"`` for [800, 1900], otherwise ``None``;
      two untagged features are *not* filtered).

    Exact zeros are replaced by ``zero_substitute`` before every transform and
    infinite results are turned into NaN. Floating-point warnings raised by the
    computations (e.g. the log of a negative value) are silenced locally; the
    global NumPy error state is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding at least the columns named in ``feature_list``.
    feature_list : sequence of str
        Column names to transform; their order determines the output order.
    tolerance : int, default 50
        Maximum distance between the first integers of two names for them to be
        considered near-duplicates.
    zero_substitute : float, default 1e-8
        Value substituted for exact zeros.

    Returns
    -------
    pandas.DataFrame
        New columns only (the base columns are not included), indexed like
        ``data``. Column labels are LaTeX strings: ``$\\log X$``, ``$(X)^{-1}$``
        and ``$\\frac{X}{Y}$``, where ``X``/``Y`` are the source names without
        their surrounding ``$`` and with ``Σ`` spelled as ``\\Sigma``.
    """

    def clean(col):
        # Strip the enclosing math delimiters and normalise the sum symbol.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def wrap_log(col):
        return fr"$\log {clean(col)}$"

    def wrap_pow(col, power):
        return fr"$({clean(col)})^{{{power}}}$"

    def wrap_div(col1, col2):
        return fr"$\frac{{{clean(col1)}}}{{{clean(col2)}}}$"

    def extract_center(name):
        # First run of digits in the name, or None when there is none.
        nums = re.findall(r'\d+', name)
        return int(nums[0]) if nums else None

    def get_interval_tag(center):
        if center is None:
            return None
        if 2500 <= center <= 3800:
            return "high"
        elif 800 <= center <= 1900:
            return "low"
        return None

    def is_averaged(name):
        # The column labels spell the plus-minus sign as LaTeX `\pm`; the
        # literal character is still accepted for names that use it.
        return r'\overline{I}' in name or '±' in name or r'\pm' in name

    def is_near_duplicate(f1, f2):
        c1 = extract_center(f1)
        c2 = extract_center(f2)
        if c1 is None or c2 is None:
            return False
        return abs(c1 - c2) <= tolerance and (is_averaged(f1) or is_averaged(f2))

    def get_interval_kind(name):
        if is_averaged(name):
            return "average"
        # Checked on the cleaned name so the `Σ` spelling is recognised as well.
        if r"\Sigma I_{p," in clean(name):
            return "sum"
        return None

    # Collected in insertion order and turned into a frame once at the end,
    # instead of inserting columns into a DataFrame one by one.
    new_columns = {}

    # Scoped replacement for the original (unreachable) `np.seterr` call.
    with np.errstate(divide='ignore', invalid='ignore'):
        for col in feature_list:
            base_safe = data[col].replace(0, zero_substitute)

            new_columns[wrap_log(col)] = np.log(base_safe)

            for power in [-1]:
                transformed = np.power(base_safe, power)
                new_columns[wrap_pow(col, power)] = transformed.replace(
                    [np.inf, -np.inf], np.nan
                )

        for f1, f2 in combinations(feature_list, 2):
            if is_near_duplicate(f1, f2):
                continue

            if {get_interval_kind(f1), get_interval_kind(f2)} == {"average", "sum"}:
                continue

            if get_interval_tag(extract_center(f1)) != get_interval_tag(extract_center(f2)):
                continue

            numerator = data[f1].replace(0, zero_substitute)
            denominator = data[f2].replace(0, zero_substitute)
            ratio = numerator / denominator

            new_columns[wrap_div(f1, f2)] = ratio.replace([np.inf, -np.inf], np.nan)

    return pd.DataFrame(new_columns, index=data.index)


### Features classification

In [12]:
def classify_feature_type(base_columns, columns):
    """Group feature names by origin, source type and spectral region.

    Names containing ``"Unnamed"`` are dropped from both inputs first. Every
    remaining name is then assigned to a group keyed
    ``"{kind}_features_{source}_{region}"`` where

    * ``kind`` is ``"base"`` for names in ``base_columns`` and ``"engineered"``
      for names that appear only in ``columns``;
    * ``source`` is ``"intervals"`` when the name (without enclosing ``$``)
      contains ``\\overline{I}_{`` or ``\\Sigma I_{p,``, otherwise ``"peaks"``;
    * ``region`` is ``"3800_2500"`` if any 3–4 digit number in the name lies in
      [2500, 3800], else ``"1900_800"`` if one lies in [800, 1900], else
      ``"unknown"``.

    Parameters
    ----------
    base_columns : iterable of str
        Names of the original (non-engineered) features.
    columns : iterable of str
        All feature names, base and engineered.

    Returns
    -------
    dict[str, list[str]]
        The per-group lists in first-seen order (base groups first), followed
        by the two summary entries ``"base_features"`` and
        ``"engineered_features"``.
    """
    base_columns = [col for col in base_columns if "Unnamed" not in col]
    columns = [col for col in columns if "Unnamed" not in col]

    # One set-based pass instead of scanning the base list for every column.
    base_lookup = set(base_columns)
    engineered_columns = [col for col in columns if col not in base_lookup]

    def clean(col):
        # Strip the enclosing math delimiters and normalise the sum symbol.
        if col.startswith('$') and col.endswith('$'):
            col = col[1:-1]
        return col.replace("Σ", r"\Sigma")

    def is_interval(col):
        col = clean(col)
        return r"\overline{I}_{" in col or r"\Sigma I_{p," in col

    def extract_numbers(col):
        # Only 3–4 digit runs are considered, so short numbers such as a
        # two-digit window half-width are not picked up.
        return list(map(int, re.findall(r"\d{3,4}", col)))

    def is_in_range(col, low, high):
        return any(low <= num <= high for num in extract_numbers(clean(col)))

    def classify(col, base=True):
        col_clean = clean(col)
        source = "intervals" if is_interval(col_clean) else "peaks"

        # The high-wavenumber range is tested first and wins when a name
        # contains numbers from both ranges.
        if is_in_range(col_clean, 2500, 3800):
            region = "3800_2500"
        elif is_in_range(col_clean, 800, 1900):
            region = "1900_800"
        else:
            region = "unknown"

        kind = "base" if base else "engineered"
        return f"{kind}_features_{source}_{region}"

    groups = {}

    for col in base_columns:
        groups.setdefault(classify(col, base=True), []).append(col)

    for col in engineered_columns:
        groups.setdefault(classify(col, base=False), []).append(col)

    groups["base_features"] = base_columns
    groups["engineered_features"] = engineered_columns

    return groups

## Calculation process

In [13]:
all_features = initial_features_df.columns.tolist()

# Single pass: tag each feature by the first number in its name and bucket it.
features_by_region = {"low": [], "high": []}
for feature_name in all_features:
    region_tag = get_interval_tag(extract_center(feature_name))
    if region_tag in features_by_region:
        features_by_region[region_tag].append(feature_name)

features_1900_800_list = features_by_region["low"]
features_3800_2500_list = features_by_region["high"]

summary_lines = [
    "Число фичей:",
    f"(*) 1900–800: {len(features_1900_800_list)}",
    f"(*) 3800–2500: {len(features_3800_2500_list)}",
]
print("\n".join(summary_lines))

Число фичей:
(*) 1900–800: 20
(*) 3800–2500: 8


### 3800–2500 cm$^{-1}$

In [14]:
# tolerance=0: two features count as near-duplicates only when the first
# number in their names is identical.
engineered_3800_2500 = generate_transformed_features(
    features_3800_2500,
    features_3800_2500_list,
    tolerance=0,
)

In [15]:
# Per-column count of missing values, keeping only columns that have any.
nan_df = engineered_3800_2500.isna().sum(axis=0)
has_missing = nan_df.gt(0)
nan_summary = nan_df.loc[has_missing].sort_values(ascending=False)

print(f"Столбцов с NaN-values: {len(nan_summary)}")
display(nan_summary.head(n=10))

Столбцов с NaN-values: 0


Series([], dtype: int64)

In [16]:
# Base and engineered features joined on the row index (rows present in both).
all_features_3800_2500 = features_3800_2500.merge(
    engineered_3800_2500,
    how='inner',
    left_index=True,
    right_index=True,
)

print(all_features_3800_2500.shape)
all_features_3800_2500.head(n=3)

(75, 45)


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$,$\log I_{2928}$,$(I_{2928})^{-1}$,$\log I_{3352}$,$(I_{3352})^{-1}$,$\log I_{3723}$,$(I_{3723})^{-1}$,$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,"$\log \Sigma I_{p,\ 3800\text{–}2500}$","$(\Sigma I_{p,\ 3800\text{–}2500})^{-1}$",$\log \overline{I}_{2928 \pm 50}$,$(\overline{I}_{2928 \pm 50})^{-1}$,$\log \overline{I}_{3352 \pm 50}$,$(\overline{I}_{3352 \pm 50})^{-1}$,$\log \overline{I}_{3723 \pm 50}$,$(\overline{I}_{3723 \pm 50})^{-1}$,$\frac{I_{2928}}{I_{3352}}$,$\frac{I_{2928}}{I_{3723}}$,$\frac{I_{2928}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{2928}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{2928}}{\overline{I}_{3352 \pm 50}}$,$\frac{I_{2928}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3352}}{I_{3723}}$,$\frac{I_{3352}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3352}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3352}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3352}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3723}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3723}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{2928 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{3352 \pm 50}}{\overline{I}_{3723 \pm 50}}$
division_1_size_bulk,0.49172,1.0,0.090605,0.526201,1.582324,0.494443,0.983311,0.077488,-0.709847,2.033679,0.0,1.0,-2.40125,11.036962,-0.642072,1.900414,0.458895,0.631982,-0.704324,2.022478,-0.01683,1.016972,-2.557635,12.905257,0.49172,5.42709,0.934471,0.310758,0.500065,6.345768,11.036962,1.900414,0.631982,2.022478,12.905257,0.172186,0.05726,0.183246,0.092142,1.06423,0.535132,6.790761,0.502835,6.380913,12.689885
division_1_size_5,0.406937,1.0,0.030092,0.477909,1.437029,0.396989,0.980473,0.037942,-0.899098,2.457385,0.0,1.0,-3.503491,33.231252,-0.738335,2.092448,0.362578,0.69588,-0.923846,2.51896,-0.01972,1.019916,-3.27169,26.355833,0.406937,13.523014,0.851494,0.283179,0.415041,10.725154,33.231252,2.092448,0.69588,2.51896,26.355833,0.062966,0.020941,0.075801,0.030691,1.203834,0.487427,12.595695,0.404896,10.462984,25.84118
division_1_size_3,0.618445,1.0,0.08075,0.535789,1.699195,0.581733,0.980981,0.069433,-0.480547,1.616959,0.0,1.0,-2.516397,12.383902,-0.624015,1.866407,0.530155,0.588514,-0.541743,1.719,-0.019202,1.019388,-2.667394,14.402384,0.618445,7.658761,1.15427,0.363964,0.630435,8.907082,12.383902,1.866407,0.588514,1.719,14.402384,0.150712,0.047523,0.138809,0.082316,0.921021,0.546177,7.716638,0.593012,8.378348,14.128462


In [17]:
base_columns = features_3800_2500.columns.tolist()
columns = all_features_3800_2500.columns.tolist()
features_description_dict_3800_2500 = classify_feature_type(base_columns, columns)

# One line per group: group name and number of features in it.
for feature_type in features_description_dict_3800_2500:
    feature_lst = features_description_dict_3800_2500[feature_type]
    print(f'(*) {feature_type}: {len(feature_lst)}')

(*) base_features_peaks_3800_2500: 3
(*) base_features_intervals_3800_2500: 5
(*) engineered_features_peaks_3800_2500: 9
(*) engineered_features_intervals_3800_2500: 28
(*) base_features: 8
(*) engineered_features: 37


### 1900–800 cm$^{-1}$

In [18]:
# tolerance=0: two features count as near-duplicates only when the first
# number in their names is identical.
engineered_1900_800 = generate_transformed_features(
    features_1900_800,
    features_1900_800_list,
    tolerance=0,
)

In [19]:
# Per-column count of missing values, keeping only columns that have any.
nan_df = engineered_1900_800.isna().sum(axis=0)
has_missing = nan_df.gt(0)
nan_summary = nan_df.loc[has_missing].sort_values(ascending=False)

print(f"Столбцов с NaN-values: {len(nan_summary)}")
display(nan_summary.head(n=10))

Столбцов с NaN-values: 0


Series([], dtype: int64)

In [20]:
# Base and engineered features joined on the row index (rows present in both).
all_features_1900_800 = features_1900_800.merge(
    engineered_1900_800,
    how='inner',
    left_index=True,
    right_index=True,
)

print(all_features_1900_800.shape)
all_features_1900_800.head(n=3)

(75, 203)


Unnamed: 0,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$,$\log I_{873}$,$(I_{873})^{-1}$,$\log I_{1107}$,$(I_{1107})^{-1}$,$\log I_{1241}$,$(I_{1241})^{-1}$,$\log I_{1393}$,$(I_{1393})^{-1}$,$\log I_{1613}$,$(I_{1613})^{-1}$,$\log I_{1725}$,$(I_{1725})^{-1}$,$\log \overline{I}_{1900\text{–}800}$,$(\overline{I}_{1900\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}800}$","$(\Sigma I_{p,\ 1900\text{–}800})^{-1}$",$\log \overline{I}_{1900\text{–}1500}$,$(\overline{I}_{1900\text{–}1500})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}1500}$","$(\Sigma I_{p,\ 1900\text{–}1500})^{-1}$",$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,"$\log \Sigma I_{p,\ 1500\text{–}1300}$","$(\Sigma I_{p,\ 1500\text{–}1300})^{-1}$",$\log \overline{I}_{1300\text{–}800}$,$(\overline{I}_{1300\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1300\text{–}800}$","$(\Sigma I_{p,\ 1300\text{–}800})^{-1}$",$\log \overline{I}_{873 \pm 25}$,$(\overline{I}_{873 \pm 25})^{-1}$,$\log \overline{I}_{1107 \pm 25}$,$(\overline{I}_{1107 \pm 25})^{-1}$,$\log \overline{I}_{1241 \pm 25}$,$(\overline{I}_{1241 \pm 25})^{-1}$,$\log \overline{I}_{1393 \pm 25}$,$(\overline{I}_{1393 \pm 25})^{-1}$,$\log \overline{I}_{1613 \pm 25}$,$(\overline{I}_{1613 \pm 25})^{-1}$,$\log \overline{I}_{1725 \pm 25}$,$(\overline{I}_{1725 \pm 
25})^{-1}$,$\frac{I_{873}}{I_{1107}}$,$\frac{I_{873}}{I_{1241}}$,$\frac{I_{873}}{I_{1393}}$,$\frac{I_{873}}{I_{1613}}$,$\frac{I_{873}}{I_{1725}}$,$\frac{I_{873}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{873}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{873}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1107}}{I_{1241}}$,$\frac{I_{1107}}{I_{1393}}$,$\frac{I_{1107}}{I_{1613}}$,$\frac{I_{1107}}{I_{1725}}$,$\frac{I_{1107}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1107}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1107}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1241}}{I_{1393}}$,$\frac{I_{1241}}{I_{1613}}$,$\frac{I_{1241}}{I_{1725}}$,$\frac{I_{1241}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1241}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1241}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1241}}{\Sigma 
I_{p,\ 1300\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1393}}{I_{1613}}$,$\frac{I_{1393}}{I_{1725}}$,$\frac{I_{1393}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1393}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1393}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1613}}{I_{1725}}$,$\frac{I_{1613}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1613}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1613}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1725}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 
1500\text{–}1300}}$",$\frac{I_{1725}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1900\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{873 \pm 
25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1613 \pm 25}}{\overline{I}_{1725 \pm 25}}$
division_1_size_bulk,0.048208,0.681294,0.455066,0.760582,1.0,0.423273,0.424162,3.368423,0.426619,1.423273,0.571735,0.760582,0.363338,1.184569,0.047543,0.624379,0.447733,0.739415,0.9395,0.41747,-3.032222,20.743271,-0.383761,1.467794,-0.787312,2.197482,-0.273672,1.314783,0.0,1.0,-0.859739,2.362543,-0.857641,2.357591,1.214445,0.296875,-0.851863,2.34401,0.352959,0.702606,-0.55908,1.749062,-0.273672,1.314783,-1.012422,2.752259,0.169379,0.844189,-3.046113,21.033424,-0.470997,1.60159,-0.803558,2.233473,-0.301897,1.352421,-0.062408,1.064396,-0.873542,2.395381,0.07076,0.105937,0.063384,0.048208,0.113894,0.113656,0.014312,0.113001,0.033872,0.08432,0.063384,0.132682,0.040697,0.07721,0.107672,0.065198,0.051313,0.115477,1.497132,0.895754,0.681294,1.609587,1.606214,0.202259,1.596961,0.478681,1.191626,0.895754,1.875098,0.575141,14.329951,1.521652,0.921397,0.725167,1.631959,0.598314,0.455066,1.075114,1.072861,0.135098,1.06668,0.319732,0.795939,0.598314,1.25246,0.384162,9.571604,0.72883,0.615441,0.484371,1.090057,0.760582,1.796907,1.793141,0.225798,1.782811,0.534389,1.330305,1.0,2.093317,0.642074,15.997634,1.21814,1.698738,0.80956,1.821882,2.362543,2.357591,0.296875,2.34401,0.702606,1.749062,1.314783,2.752259,0.844189,21.033424,1.60159,2.233473,1.352421,2.395381,0.997904,0.125659,0.992156,0.297394,0.74033,0.556512,1.164956,0.357322,8.902873,0.677909,0.945368,0.572443,0.45053,0.741885,1.167403,8.921573,0.679333,0.947354,0.573645,0.451476,1.016029,2.366675,4.428747,2.843585,0.746184,1.174167,8.973264,0.683269,0.952843,0.576969,0.454092,1.021916,1.871295,1.201511,1.573562,12.025543,0.915685,1.276955,0.773227,0.608552,1.369523,0.642074,7.64224,0.581918,0.811505,0.491386,0.386735,0.870333,0.076145,0.106187,0.064299,0.050605,0.113884,1.394535,0.844424,0.664587,1.495626,0.605524,0.476565,1.072491,0.78703,1.771179,2.25046
division_1_size_5,0.091875,0.606138,0.427988,0.687458,1.0,0.384475,0.37873,3.197934,0.378004,1.384475,0.502368,0.687458,0.329999,1.126001,0.093349,0.594059,0.424918,0.668322,0.930634,0.378463,-2.387325,10.884335,-0.500648,1.64979,-0.84866,2.336514,-0.374754,1.454634,0.0,1.0,-0.955877,2.600952,-0.970931,2.6404,1.162505,0.312702,-0.972851,2.645477,0.325321,0.722296,-0.688422,1.990572,-0.374754,1.454634,-1.108665,3.030309,0.118672,0.888099,-2.371406,10.712443,-0.520776,1.683333,-0.855859,2.353396,-0.402986,1.496286,-0.071889,1.074536,-0.971638,2.642268,0.151575,0.214668,0.133645,0.091875,0.238963,0.242587,0.02873,0.243054,0.066361,0.182884,0.133645,0.27841,0.081594,0.154656,0.216219,0.137471,0.098723,0.242759,1.41625,0.881708,0.606138,1.576535,1.600446,0.18954,1.603523,0.437811,1.206561,0.881708,1.836785,0.53831,6.493216,1.426482,0.906955,0.651317,1.601578,0.622566,0.427988,1.113176,1.13006,0.133833,1.132232,0.309134,0.851941,0.622566,1.296936,0.380096,4.584797,0.720446,0.640392,0.459889,1.130859,0.687458,1.788046,1.815165,0.21497,1.818655,0.496548,1.368435,1.0,2.083211,0.610531,7.364358,1.157221,1.617862,0.738699,1.816449,2.600952,2.6404,0.312702,2.645477,0.722296,1.990572,1.454634,3.030309,0.888099,10.712443,1.683333,2.353396,1.496286,2.642268,1.015167,0.120226,1.017119,0.277704,0.765324,0.55927,1.165077,0.341451,4.118663,0.647199,0.904821,0.575284,0.413132,0.75389,1.14767,4.057128,0.637529,0.891303,0.566689,0.406959,1.000707,2.309854,4.651822,2.840081,0.752443,1.145468,4.049343,0.636306,0.889592,0.565602,0.406179,0.998787,2.013903,1.22955,1.522331,5.381591,0.845653,1.182272,0.751686,0.539813,1.327392,0.610531,3.535099,0.555499,0.776619,0.493773,0.354596,0.871947,0.157138,0.219688,0.139677,0.100307,0.246654,1.398057,0.888883,0.638338,1.569664,0.635799,0.45659,1.122747,0.718136,1.765885,2.458985
division_1_size_3,0.110321,0.836176,0.40095,0.696553,1.0,0.368753,0.436266,3.412752,0.429362,1.368753,0.523513,0.696553,0.406988,1.347446,0.113396,0.817537,0.401435,0.678474,0.953537,0.369746,-2.204364,9.064483,-0.178917,1.195921,-0.913918,2.494076,-0.361611,1.43564,0.0,1.0,-0.997629,2.711845,-0.829504,2.292181,1.227519,0.293019,-0.845455,2.329038,0.3139,0.730592,-0.647194,1.910173,-0.361611,1.43564,-0.89897,2.457072,0.298211,0.742145,-2.176872,8.818675,-0.201459,1.223186,-0.91271,2.491063,-0.387909,1.473896,-0.047578,1.048728,-0.994938,2.704557,0.131935,0.275148,0.158381,0.110321,0.299173,0.252875,0.032326,0.256941,0.080599,0.210732,0.158381,0.271066,0.081874,0.134943,0.274816,0.162601,0.115696,0.298369,2.085485,1.200447,0.836176,2.267579,1.916666,0.245015,1.947485,0.610903,1.59724,1.200447,2.054544,0.620563,7.373961,2.082966,1.232436,0.87692,2.261485,0.57562,0.40095,1.087315,0.91905,0.117486,0.933828,0.292931,0.765884,0.57562,0.985163,0.297563,3.535849,0.490437,0.590959,0.420487,1.084393,0.696553,1.888945,1.596626,0.204103,1.6223,0.508897,1.330538,1.0,1.711482,0.516943,6.142679,0.852014,1.735159,0.730495,1.883869,2.711845,2.292181,0.293019,2.329038,0.730592,1.910173,1.43564,2.457072,0.742145,8.818675,1.223186,2.491063,1.473896,2.704557,0.845248,0.108051,0.858839,0.269408,0.704381,0.529396,0.906052,0.273668,3.25191,0.451053,0.918586,0.543503,0.386721,0.833343,1.071937,3.847286,0.533634,1.086766,0.64301,0.457524,1.179906,2.49333,4.899484,2.532756,0.820155,1.054973,3.786402,0.525189,1.069567,0.632835,0.450284,1.161234,1.965036,1.015812,1.286309,4.616689,0.640353,1.304103,0.771603,0.549022,1.41587,0.516943,3.589099,0.497823,1.013834,0.599859,0.42682,1.100724,0.138704,0.282476,0.167133,0.118921,0.306685,2.036537,1.204964,0.857374,2.211076,0.591673,0.420996,1.085704,0.711534,1.834972,2.578894


In [21]:
base_columns = features_1900_800.columns.tolist()
columns = all_features_1900_800.columns.tolist()
features_description_dict_1900_800 = classify_feature_type(base_columns, columns)

# One line per group: group name and number of features in it.
for feature_type in features_description_dict_1900_800:
    feature_lst = features_description_dict_1900_800[feature_type]
    print(f'(*) {feature_type}: {len(feature_lst)}')

(*) base_features_peaks_1900_800: 6
(*) base_features_intervals_1900_800: 14
(*) engineered_features_peaks_1900_800: 27
(*) engineered_features_intervals_1900_800: 156
(*) base_features: 20
(*) engineered_features: 183


## Construction of datasets

### 3800–2500 cm$^{-1}$

In [22]:
# Set aside the four label columns of the 3800-2500 dataset so they can be
# re-attached to the engineered feature table in the next cell.
dataset_3800_2500_part = dataset_3800_2500.loc[
    :, ['Split', 'Fraction_hue', 'Fraction_grouped_hue', 'Class']
]

# Quick look: dimensions first, then the first three rows as the cell output.
print(dataset_3800_2500_part.shape)
dataset_3800_2500_part.head(3)

(75, 4)


Unnamed: 0,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,Train,$> 0$,$d > 5$,2
division_1_size_5,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,Train,$< 3$,$2 < d \leq 5$,1


In [23]:
# Attach the label columns to the engineered 3800-2500 features by joining on
# the shared sample index. `how='right'` keeps every row (and the row order) of
# the label frame. `validate='one_to_one'` makes pandas raise a MergeError if
# either index contains duplicate sample names, instead of silently
# multiplying rows in the saved dataset.
dataset_3800_2500 = pd.merge(
    all_features_3800_2500,
    dataset_3800_2500_part,
    left_index=True,
    right_index=True,
    how='right',
    validate='one_to_one',
)

# With a right join, a labelled sample that has no row in the feature table
# would come out as a row of NaN features. Fail loudly here rather than rely
# on eyeballing the NaN-count table displayed below.
samples_without_features = dataset_3800_2500_part.index.difference(all_features_3800_2500.index)
assert samples_without_features.empty, (
    f'Samples without engineered features: {samples_without_features.tolist()}'
)

print(dataset_3800_2500.shape)
display(dataset_3800_2500.head(3))
# Per-column count of missing values, transposed into a single wide row.
display(pd.DataFrame(dataset_3800_2500.isna().sum()).T)

(75, 49)


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$,$\log I_{2928}$,$(I_{2928})^{-1}$,$\log I_{3352}$,$(I_{3352})^{-1}$,$\log I_{3723}$,$(I_{3723})^{-1}$,$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,"$\log \Sigma I_{p,\ 3800\text{–}2500}$","$(\Sigma I_{p,\ 3800\text{–}2500})^{-1}$",$\log \overline{I}_{2928 \pm 50}$,$(\overline{I}_{2928 \pm 50})^{-1}$,$\log \overline{I}_{3352 \pm 50}$,$(\overline{I}_{3352 \pm 50})^{-1}$,$\log \overline{I}_{3723 \pm 50}$,$(\overline{I}_{3723 \pm 50})^{-1}$,$\frac{I_{2928}}{I_{3352}}$,$\frac{I_{2928}}{I_{3723}}$,$\frac{I_{2928}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{2928}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{2928}}{\overline{I}_{3352 \pm 50}}$,$\frac{I_{2928}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3352}}{I_{3723}}$,$\frac{I_{3352}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3352}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3352}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3352}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3723}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3723}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{2928 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{3352 \pm 50}}{\overline{I}_{3723 \pm 50}}$,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,0.49172,1.0,0.090605,0.526201,1.582324,0.494443,0.983311,0.077488,-0.709847,2.033679,0.0,1.0,-2.40125,11.036962,-0.642072,1.900414,0.458895,0.631982,-0.704324,2.022478,-0.01683,1.016972,-2.557635,12.905257,0.49172,5.42709,0.934471,0.310758,0.500065,6.345768,11.036962,1.900414,0.631982,2.022478,12.905257,0.172186,0.05726,0.183246,0.092142,1.06423,0.535132,6.790761,0.502835,6.380913,12.689885,Train,$> 0$,$d > 5$,2
division_1_size_5,0.406937,1.0,0.030092,0.477909,1.437029,0.396989,0.980473,0.037942,-0.899098,2.457385,0.0,1.0,-3.503491,33.231252,-0.738335,2.092448,0.362578,0.69588,-0.923846,2.51896,-0.01972,1.019916,-3.27169,26.355833,0.406937,13.523014,0.851494,0.283179,0.415041,10.725154,33.231252,2.092448,0.69588,2.51896,26.355833,0.062966,0.020941,0.075801,0.030691,1.203834,0.487427,12.595695,0.404896,10.462984,25.84118,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,0.618445,1.0,0.08075,0.535789,1.699195,0.581733,0.980981,0.069433,-0.480547,1.616959,0.0,1.0,-2.516397,12.383902,-0.624015,1.866407,0.530155,0.588514,-0.541743,1.719,-0.019202,1.019388,-2.667394,14.402384,0.618445,7.658761,1.15427,0.363964,0.630435,8.907082,12.383902,1.866407,0.588514,1.719,14.402384,0.150712,0.047523,0.138809,0.082316,0.921021,0.546177,7.716638,0.593012,8.378348,14.128462,Train,$< 3$,$2 < d \leq 5$,1


Unnamed: 0,$I_{2928}$,$I_{3352}$,$I_{3723}$,$\overline{I}_{3800\text{–}2500}$,"$\Sigma I_{p,\ 3800\text{–}2500}$",$\overline{I}_{2928 \pm 50}$,$\overline{I}_{3352 \pm 50}$,$\overline{I}_{3723 \pm 50}$,$\log I_{2928}$,$(I_{2928})^{-1}$,$\log I_{3352}$,$(I_{3352})^{-1}$,$\log I_{3723}$,$(I_{3723})^{-1}$,$\log \overline{I}_{3800\text{–}2500}$,$(\overline{I}_{3800\text{–}2500})^{-1}$,"$\log \Sigma I_{p,\ 3800\text{–}2500}$","$(\Sigma I_{p,\ 3800\text{–}2500})^{-1}$",$\log \overline{I}_{2928 \pm 50}$,$(\overline{I}_{2928 \pm 50})^{-1}$,$\log \overline{I}_{3352 \pm 50}$,$(\overline{I}_{3352 \pm 50})^{-1}$,$\log \overline{I}_{3723 \pm 50}$,$(\overline{I}_{3723 \pm 50})^{-1}$,$\frac{I_{2928}}{I_{3352}}$,$\frac{I_{2928}}{I_{3723}}$,$\frac{I_{2928}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{2928}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{2928}}{\overline{I}_{3352 \pm 50}}$,$\frac{I_{2928}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3352}}{I_{3723}}$,$\frac{I_{3352}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3352}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3352}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3352}}{\overline{I}_{3723 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3800\text{–}2500}}$,"$\frac{I_{3723}}{\Sigma I_{p,\ 3800\text{–}2500}}$",$\frac{I_{3723}}{\overline{I}_{2928 \pm 50}}$,$\frac{I_{3723}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{2928 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{3800\text{–}2500}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3352 \pm 50}}$,$\frac{\overline{I}_{2928 \pm 50}}{\overline{I}_{3723 \pm 50}}$,$\frac{\overline{I}_{3352 \pm 50}}{\overline{I}_{3723 \pm 50}}$,Split,Fraction_hue,Fraction_grouped_hue,Class
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 1900–800 cm$^{-1}$

In [24]:
# Set aside the four label columns of the 1900-800 dataset so they can be
# re-attached to the engineered feature table in the next cell.
dataset_1900_800_part = dataset_1900_800.loc[
    :, ['Split', 'Fraction_hue', 'Fraction_grouped_hue', 'Class']
]

# Quick look: dimensions first, then the first three rows as the cell output.
print(dataset_1900_800_part.shape)
dataset_1900_800_part.head(3)

(75, 4)


Unnamed: 0,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,Train,$> 0$,$d > 5$,2
division_1_size_5,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,Train,$< 3$,$2 < d \leq 5$,1


In [25]:
# Attach the label columns to the engineered 1900-800 features by joining on
# the shared sample index. `how='right'` keeps every row (and the row order) of
# the label frame. `validate='one_to_one'` makes pandas raise a MergeError if
# either index contains duplicate sample names, instead of silently
# multiplying rows in the saved dataset.
dataset_1900_800 = pd.merge(
    all_features_1900_800,
    dataset_1900_800_part,
    left_index=True,
    right_index=True,
    how='right',
    validate='one_to_one',
)

# With a right join, a labelled sample that has no row in the feature table
# would come out as a row of NaN features. Fail loudly here rather than rely
# on eyeballing the NaN-count table displayed below.
samples_without_features = dataset_1900_800_part.index.difference(all_features_1900_800.index)
assert samples_without_features.empty, (
    f'Samples without engineered features: {samples_without_features.tolist()}'
)

print(dataset_1900_800.shape)
display(dataset_1900_800.head(3))
# Per-column count of missing values, transposed into a single wide row.
display(pd.DataFrame(dataset_1900_800.isna().sum()).T)

(75, 207)


Unnamed: 0,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$,$\log I_{873}$,$(I_{873})^{-1}$,$\log I_{1107}$,$(I_{1107})^{-1}$,$\log I_{1241}$,$(I_{1241})^{-1}$,$\log I_{1393}$,$(I_{1393})^{-1}$,$\log I_{1613}$,$(I_{1613})^{-1}$,$\log I_{1725}$,$(I_{1725})^{-1}$,$\log \overline{I}_{1900\text{–}800}$,$(\overline{I}_{1900\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}800}$","$(\Sigma I_{p,\ 1900\text{–}800})^{-1}$",$\log \overline{I}_{1900\text{–}1500}$,$(\overline{I}_{1900\text{–}1500})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}1500}$","$(\Sigma I_{p,\ 1900\text{–}1500})^{-1}$",$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,"$\log \Sigma I_{p,\ 1500\text{–}1300}$","$(\Sigma I_{p,\ 1500\text{–}1300})^{-1}$",$\log \overline{I}_{1300\text{–}800}$,$(\overline{I}_{1300\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1300\text{–}800}$","$(\Sigma I_{p,\ 1300\text{–}800})^{-1}$",$\log \overline{I}_{873 \pm 25}$,$(\overline{I}_{873 \pm 25})^{-1}$,$\log \overline{I}_{1107 \pm 25}$,$(\overline{I}_{1107 \pm 25})^{-1}$,$\log \overline{I}_{1241 \pm 25}$,$(\overline{I}_{1241 \pm 25})^{-1}$,$\log \overline{I}_{1393 \pm 25}$,$(\overline{I}_{1393 \pm 25})^{-1}$,$\log \overline{I}_{1613 \pm 25}$,$(\overline{I}_{1613 \pm 25})^{-1}$,$\log \overline{I}_{1725 \pm 25}$,$(\overline{I}_{1725 \pm 
25})^{-1}$,$\frac{I_{873}}{I_{1107}}$,$\frac{I_{873}}{I_{1241}}$,$\frac{I_{873}}{I_{1393}}$,$\frac{I_{873}}{I_{1613}}$,$\frac{I_{873}}{I_{1725}}$,$\frac{I_{873}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{873}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{873}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1107}}{I_{1241}}$,$\frac{I_{1107}}{I_{1393}}$,$\frac{I_{1107}}{I_{1613}}$,$\frac{I_{1107}}{I_{1725}}$,$\frac{I_{1107}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1107}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1107}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1241}}{I_{1393}}$,$\frac{I_{1241}}{I_{1613}}$,$\frac{I_{1241}}{I_{1725}}$,$\frac{I_{1241}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1241}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1241}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1241}}{\Sigma 
I_{p,\ 1300\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1393}}{I_{1613}}$,$\frac{I_{1393}}{I_{1725}}$,$\frac{I_{1393}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1393}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1393}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1613}}{I_{1725}}$,$\frac{I_{1613}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1613}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1613}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1725}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 
1500\text{–}1300}}$",$\frac{I_{1725}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1900\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{873 \pm 
25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1613 \pm 25}}{\overline{I}_{1725 \pm 25}}$,Split,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,0.048208,0.681294,0.455066,0.760582,1.0,0.423273,0.424162,3.368423,0.426619,1.423273,0.571735,0.760582,0.363338,1.184569,0.047543,0.624379,0.447733,0.739415,0.9395,0.41747,-3.032222,20.743271,-0.383761,1.467794,-0.787312,2.197482,-0.273672,1.314783,0.0,1.0,-0.859739,2.362543,-0.857641,2.357591,1.214445,0.296875,-0.851863,2.34401,0.352959,0.702606,-0.55908,1.749062,-0.273672,1.314783,-1.012422,2.752259,0.169379,0.844189,-3.046113,21.033424,-0.470997,1.60159,-0.803558,2.233473,-0.301897,1.352421,-0.062408,1.064396,-0.873542,2.395381,0.07076,0.105937,0.063384,0.048208,0.113894,0.113656,0.014312,0.113001,0.033872,0.08432,0.063384,0.132682,0.040697,0.07721,0.107672,0.065198,0.051313,0.115477,1.497132,0.895754,0.681294,1.609587,1.606214,0.202259,1.596961,0.478681,1.191626,0.895754,1.875098,0.575141,14.329951,1.521652,0.921397,0.725167,1.631959,0.598314,0.455066,1.075114,1.072861,0.135098,1.06668,0.319732,0.795939,0.598314,1.25246,0.384162,9.571604,0.72883,0.615441,0.484371,1.090057,0.760582,1.796907,1.793141,0.225798,1.782811,0.534389,1.330305,1.0,2.093317,0.642074,15.997634,1.21814,1.698738,0.80956,1.821882,2.362543,2.357591,0.296875,2.34401,0.702606,1.749062,1.314783,2.752259,0.844189,21.033424,1.60159,2.233473,1.352421,2.395381,0.997904,0.125659,0.992156,0.297394,0.74033,0.556512,1.164956,0.357322,8.902873,0.677909,0.945368,0.572443,0.45053,0.741885,1.167403,8.921573,0.679333,0.947354,0.573645,0.451476,1.016029,2.366675,4.428747,2.843585,0.746184,1.174167,8.973264,0.683269,0.952843,0.576969,0.454092,1.021916,1.871295,1.201511,1.573562,12.025543,0.915685,1.276955,0.773227,0.608552,1.369523,0.642074,7.64224,0.581918,0.811505,0.491386,0.386735,0.870333,0.076145,0.106187,0.064299,0.050605,0.113884,1.394535,0.844424,0.664587,1.495626,0.605524,0.476565,1.072491,0.78703,1.771179,2.25046,Train,$> 0$,$d > 5$,2
division_1_size_5,0.091875,0.606138,0.427988,0.687458,1.0,0.384475,0.37873,3.197934,0.378004,1.384475,0.502368,0.687458,0.329999,1.126001,0.093349,0.594059,0.424918,0.668322,0.930634,0.378463,-2.387325,10.884335,-0.500648,1.64979,-0.84866,2.336514,-0.374754,1.454634,0.0,1.0,-0.955877,2.600952,-0.970931,2.6404,1.162505,0.312702,-0.972851,2.645477,0.325321,0.722296,-0.688422,1.990572,-0.374754,1.454634,-1.108665,3.030309,0.118672,0.888099,-2.371406,10.712443,-0.520776,1.683333,-0.855859,2.353396,-0.402986,1.496286,-0.071889,1.074536,-0.971638,2.642268,0.151575,0.214668,0.133645,0.091875,0.238963,0.242587,0.02873,0.243054,0.066361,0.182884,0.133645,0.27841,0.081594,0.154656,0.216219,0.137471,0.098723,0.242759,1.41625,0.881708,0.606138,1.576535,1.600446,0.18954,1.603523,0.437811,1.206561,0.881708,1.836785,0.53831,6.493216,1.426482,0.906955,0.651317,1.601578,0.622566,0.427988,1.113176,1.13006,0.133833,1.132232,0.309134,0.851941,0.622566,1.296936,0.380096,4.584797,0.720446,0.640392,0.459889,1.130859,0.687458,1.788046,1.815165,0.21497,1.818655,0.496548,1.368435,1.0,2.083211,0.610531,7.364358,1.157221,1.617862,0.738699,1.816449,2.600952,2.6404,0.312702,2.645477,0.722296,1.990572,1.454634,3.030309,0.888099,10.712443,1.683333,2.353396,1.496286,2.642268,1.015167,0.120226,1.017119,0.277704,0.765324,0.55927,1.165077,0.341451,4.118663,0.647199,0.904821,0.575284,0.413132,0.75389,1.14767,4.057128,0.637529,0.891303,0.566689,0.406959,1.000707,2.309854,4.651822,2.840081,0.752443,1.145468,4.049343,0.636306,0.889592,0.565602,0.406179,0.998787,2.013903,1.22955,1.522331,5.381591,0.845653,1.182272,0.751686,0.539813,1.327392,0.610531,3.535099,0.555499,0.776619,0.493773,0.354596,0.871947,0.157138,0.219688,0.139677,0.100307,0.246654,1.398057,0.888883,0.638338,1.569664,0.635799,0.45659,1.122747,0.718136,1.765885,2.458985,Train,$< 5$,$2 < d \leq 5$,1
division_1_size_3,0.110321,0.836176,0.40095,0.696553,1.0,0.368753,0.436266,3.412752,0.429362,1.368753,0.523513,0.696553,0.406988,1.347446,0.113396,0.817537,0.401435,0.678474,0.953537,0.369746,-2.204364,9.064483,-0.178917,1.195921,-0.913918,2.494076,-0.361611,1.43564,0.0,1.0,-0.997629,2.711845,-0.829504,2.292181,1.227519,0.293019,-0.845455,2.329038,0.3139,0.730592,-0.647194,1.910173,-0.361611,1.43564,-0.89897,2.457072,0.298211,0.742145,-2.176872,8.818675,-0.201459,1.223186,-0.91271,2.491063,-0.387909,1.473896,-0.047578,1.048728,-0.994938,2.704557,0.131935,0.275148,0.158381,0.110321,0.299173,0.252875,0.032326,0.256941,0.080599,0.210732,0.158381,0.271066,0.081874,0.134943,0.274816,0.162601,0.115696,0.298369,2.085485,1.200447,0.836176,2.267579,1.916666,0.245015,1.947485,0.610903,1.59724,1.200447,2.054544,0.620563,7.373961,2.082966,1.232436,0.87692,2.261485,0.57562,0.40095,1.087315,0.91905,0.117486,0.933828,0.292931,0.765884,0.57562,0.985163,0.297563,3.535849,0.490437,0.590959,0.420487,1.084393,0.696553,1.888945,1.596626,0.204103,1.6223,0.508897,1.330538,1.0,1.711482,0.516943,6.142679,0.852014,1.735159,0.730495,1.883869,2.711845,2.292181,0.293019,2.329038,0.730592,1.910173,1.43564,2.457072,0.742145,8.818675,1.223186,2.491063,1.473896,2.704557,0.845248,0.108051,0.858839,0.269408,0.704381,0.529396,0.906052,0.273668,3.25191,0.451053,0.918586,0.543503,0.386721,0.833343,1.071937,3.847286,0.533634,1.086766,0.64301,0.457524,1.179906,2.49333,4.899484,2.532756,0.820155,1.054973,3.786402,0.525189,1.069567,0.632835,0.450284,1.161234,1.965036,1.015812,1.286309,4.616689,0.640353,1.304103,0.771603,0.549022,1.41587,0.516943,3.589099,0.497823,1.013834,0.599859,0.42682,1.100724,0.138704,0.282476,0.167133,0.118921,0.306685,2.036537,1.204964,0.857374,2.211076,0.591673,0.420996,1.085704,0.711534,1.834972,2.578894,Train,$< 3$,$2 < d \leq 5$,1


Unnamed: 0,$I_{873}$,$I_{1107}$,$I_{1241}$,$I_{1393}$,$I_{1613}$,$I_{1725}$,$\overline{I}_{1900\text{–}800}$,"$\Sigma I_{p,\ 1900\text{–}800}$",$\overline{I}_{1900\text{–}1500}$,"$\Sigma I_{p,\ 1900\text{–}1500}$",$\overline{I}_{1500\text{–}1300}$,"$\Sigma I_{p,\ 1500\text{–}1300}$",$\overline{I}_{1300\text{–}800}$,"$\Sigma I_{p,\ 1300\text{–}800}$",$\overline{I}_{873 \pm 25}$,$\overline{I}_{1107 \pm 25}$,$\overline{I}_{1241 \pm 25}$,$\overline{I}_{1393 \pm 25}$,$\overline{I}_{1613 \pm 25}$,$\overline{I}_{1725 \pm 25}$,$\log I_{873}$,$(I_{873})^{-1}$,$\log I_{1107}$,$(I_{1107})^{-1}$,$\log I_{1241}$,$(I_{1241})^{-1}$,$\log I_{1393}$,$(I_{1393})^{-1}$,$\log I_{1613}$,$(I_{1613})^{-1}$,$\log I_{1725}$,$(I_{1725})^{-1}$,$\log \overline{I}_{1900\text{–}800}$,$(\overline{I}_{1900\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}800}$","$(\Sigma I_{p,\ 1900\text{–}800})^{-1}$",$\log \overline{I}_{1900\text{–}1500}$,$(\overline{I}_{1900\text{–}1500})^{-1}$,"$\log \Sigma I_{p,\ 1900\text{–}1500}$","$(\Sigma I_{p,\ 1900\text{–}1500})^{-1}$",$\log \overline{I}_{1500\text{–}1300}$,$(\overline{I}_{1500\text{–}1300})^{-1}$,"$\log \Sigma I_{p,\ 1500\text{–}1300}$","$(\Sigma I_{p,\ 1500\text{–}1300})^{-1}$",$\log \overline{I}_{1300\text{–}800}$,$(\overline{I}_{1300\text{–}800})^{-1}$,"$\log \Sigma I_{p,\ 1300\text{–}800}$","$(\Sigma I_{p,\ 1300\text{–}800})^{-1}$",$\log \overline{I}_{873 \pm 25}$,$(\overline{I}_{873 \pm 25})^{-1}$,$\log \overline{I}_{1107 \pm 25}$,$(\overline{I}_{1107 \pm 25})^{-1}$,$\log \overline{I}_{1241 \pm 25}$,$(\overline{I}_{1241 \pm 25})^{-1}$,$\log \overline{I}_{1393 \pm 25}$,$(\overline{I}_{1393 \pm 25})^{-1}$,$\log \overline{I}_{1613 \pm 25}$,$(\overline{I}_{1613 \pm 25})^{-1}$,$\log \overline{I}_{1725 \pm 25}$,$(\overline{I}_{1725 \pm 
25})^{-1}$,$\frac{I_{873}}{I_{1107}}$,$\frac{I_{873}}{I_{1241}}$,$\frac{I_{873}}{I_{1393}}$,$\frac{I_{873}}{I_{1613}}$,$\frac{I_{873}}{I_{1725}}$,$\frac{I_{873}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{873}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{873}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{873}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{873}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{873}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1107}}{I_{1241}}$,$\frac{I_{1107}}{I_{1393}}$,$\frac{I_{1107}}{I_{1613}}$,$\frac{I_{1107}}{I_{1725}}$,$\frac{I_{1107}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1107}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1107}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1107}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1107}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1107}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1241}}{I_{1393}}$,$\frac{I_{1241}}{I_{1613}}$,$\frac{I_{1241}}{I_{1725}}$,$\frac{I_{1241}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1241}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1241}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1241}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1241}}{\Sigma 
I_{p,\ 1300\text{–}800}}$",$\frac{I_{1241}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1241}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1393}}{I_{1613}}$,$\frac{I_{1393}}{I_{1725}}$,$\frac{I_{1393}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1393}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1393}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1393}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1393}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1613 \pm 25}}$,$\frac{I_{1393}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1613}}{I_{1725}}$,$\frac{I_{1613}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1613}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1500\text{–}1300}}$",$\frac{I_{1613}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1613}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1613}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1613}}{\overline{I}_{1725 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1900\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{1900\text{–}1500}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1900\text{–}1500}}$",$\frac{I_{1725}}{\overline{I}_{1500\text{–}1300}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 
1500\text{–}1300}}$",$\frac{I_{1725}}{\overline{I}_{1300\text{–}800}}$,"$\frac{I_{1725}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{I_{1725}}{\overline{I}_{873 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1107 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1241 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1393 \pm 25}}$,$\frac{I_{1725}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}800}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1900\text{–}1500}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}800}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1500\text{–}1300}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1900\text{–}1500}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1500\text{–}1300}}$","$\frac{\Sigma I_{p,\ 1900\text{–}1500}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1300\text{–}800}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{873 \pm 
25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1500\text{–}1300}}{\overline{I}_{1725 \pm 25}}$,"$\frac{\Sigma I_{p,\ 1500\text{–}1300}}{\Sigma I_{p,\ 1300\text{–}800}}$",$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{873 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1300\text{–}800}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1107 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{873 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1241 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1107 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1393 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1241 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1613 \pm 25}}$,$\frac{\overline{I}_{1393 \pm 25}}{\overline{I}_{1725 \pm 25}}$,$\frac{\overline{I}_{1613 \pm 25}}{\overline{I}_{1725 \pm 25}}$,Split,Fraction_hue,Fraction_grouped_hue,Class
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Saving data

In [26]:
# Write both merged (features + labels) tables to disk as ';'-separated CSV
# files, one file per spectral range. The index is written as the first column
# (pandas default).
engineered_outputs = (
    (dataset_3800_2500, './../data/processed_data/dataset_3800_2500_engineered.csv'),
    (dataset_1900_800, './../data/processed_data/dataset_1900_800_engineered.csv'),
)
for engineered_frame, csv_path in engineered_outputs:
    engineered_frame.to_csv(csv_path, sep=';')

In [27]:
# Store the feature-group mapping for the 3800-2500 range as indented JSON.
# `ensure_ascii=False` writes non-ASCII characters as-is (the file is opened
# as UTF-8) instead of \u-escaping them.
description_path_3800_2500 = './../data/processed_data/features_description_3800_2500.json'
with open(description_path_3800_2500, mode='w', encoding='utf-8') as json_file:
    json.dump(
        features_description_dict_3800_2500,
        fp=json_file,
        indent=2,
        ensure_ascii=False,
    )

In [28]:
# Store the feature-group mapping for the 1900-800 range as indented JSON.
# `ensure_ascii=False` writes non-ASCII characters as-is (the file is opened
# as UTF-8) instead of \u-escaping them.
description_path_1900_800 = './../data/processed_data/features_description_1900_800.json'
with open(description_path_1900_800, mode='w', encoding='utf-8') as json_file:
    json.dump(
        features_description_dict_1900_800,
        fp=json_file,
        indent=2,
        ensure_ascii=False,
    )