# Import libraries

In [22]:
import os
import warnings

import numpy as np
import pandas as pd

In [23]:
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [24]:
path_to_save_data = './../data/preprocessed_data'

# Loading & parsing data

In [25]:
path_to_ftir = './../data/raw_data'

*Collecting paths to subfolders in `path_to_ftir`*

In [26]:
division_folders = []

subfolder_names_lst = os.listdir(path_to_ftir)

for subfolder in subfolder_names_lst:
    isdir_flg = os.path.isdir(os.path.join(path_to_ftir, subfolder))
    startswith_division_flg = subfolder.startswith("division_")
    
    if isdir_flg and startswith_division_flg:
        path_to_subfolder = os.path.join(path_to_ftir, subfolder)
        division_folders.append(path_to_subfolder)

division_folders = sorted(division_folders, key=lambda x: int(x.split('_')[-1]))

*Collecting paths to all file and sorting them*

In [27]:
files = []

for folder in division_folders:
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        
        if os.path.isfile(file_path) and filename.endswith(".dpt"):
            files.append(file_path)

In [28]:
files = sorted(
    files,
    key=lambda x: (
        os.path.basename(x).split("_")[0].replace(".", "").isnumeric(),
        float(os.path.basename(x).split("_")[0]) if os.path.basename(x).split("_")[0].replace(".", "").isnumeric() else os.path.basename(x)
    ),
    reverse=True
)

*Reading all files*

In [29]:
spectra_dict = {}
description_dict = {}

for i, file_path in enumerate(files):
    filename = os.path.basename(file_path)
    folder_name = os.path.basename(os.path.dirname(file_path))
    fraction_name = filename.split("_")[0]

    try:
        tmp_df = pd.read_csv(file_path, sep=",", header=None)
        wavenumbers = tmp_df[0].values
        intensities = tmp_df[1].values

        key = f"{folder_name}_size_{fraction_name}"
        
        spectra_dict[key] = intensities
        description_dict[key] = [i+1, folder_name.split('_')[-1], fraction_name]

    except Exception as e:
        print(f"Ошибка при чтении {file_path}: {e}")

## Spectral data

In [30]:
initial_data = pd.DataFrame.from_dict(
    spectra_dict,
    orient="index",
    columns=wavenumbers
)

initial_data = initial_data.sort_index(
    key=lambda x: x.map(lambda y: (float(y.split('_')[1]), -float(y.split('_')[-1].replace('bulk', '1000')))),
    ascending=True
)

print(initial_data.shape)
initial_data.head(3)

(75, 3733)


Unnamed: 0,3998.31069,3997.34631,3996.38194,3995.41756,3994.45318,3993.48880,3992.52442,3991.56005,3990.59567,3989.63129,3988.66691,3987.70253,3986.73816,3985.77378,3984.80940,3983.84502,3982.88065,3981.91627,3980.95189,3979.98751,3979.02313,3978.05876,3977.09438,3976.13000,3975.16562,...,422.39751,421.43313,420.46876,419.50438,418.54000,417.57562,416.61124,415.64687,414.68249,413.71811,412.75373,411.78935,410.82498,409.86060,408.89622,407.93184,406.96747,406.00309,405.03871,404.07433,403.10995,402.14558,401.18120,400.21682,399.25244
division_1_size_bulk,-0.00858,-0.00858,-0.00858,-0.00858,-0.00858,-0.00858,-0.00858,-0.008612,-0.008578,-0.008559,-0.008554,-0.008587,-0.008574,-0.008486,-0.008457,-0.008504,-0.008556,-0.008586,-0.008602,-0.008547,-0.00846,-0.008443,-0.008471,-0.008505,-0.008509,...,0.005667,0.005664,0.0057,0.005717,0.005726,0.005732,0.005741,0.005662,0.005531,0.005438,0.005388,0.005372,0.005361,0.005383,0.005366,0.00529,0.005246,0.00521,0.005174,0.005174,0.005174,0.005174,0.005174,0.005174,0.005174
division_1_size_5,-0.008005,-0.008005,-0.008005,-0.008005,-0.008005,-0.008005,-0.008005,-0.008121,-0.008191,-0.00825,-0.008269,-0.008278,-0.008247,-0.008157,-0.00812,-0.008136,-0.008145,-0.008111,-0.008079,-0.008044,-0.007976,-0.007943,-0.007927,-0.007923,-0.007929,...,0.010026,0.01,0.009974,0.009935,0.009883,0.009873,0.009853,0.009706,0.009551,0.009473,0.009416,0.009341,0.009259,0.009198,0.009123,0.009064,0.009072,0.009097,0.009078,0.009078,0.009078,0.009078,0.009078,0.009078,0.009078
division_1_size_3,-0.003743,-0.003743,-0.003743,-0.003743,-0.003743,-0.003743,-0.003743,-0.003775,-0.003766,-0.003757,-0.003773,-0.003883,-0.003973,-0.003965,-0.003945,-0.003949,-0.00397,-0.003917,-0.003853,-0.003797,-0.0037,-0.003678,-0.003729,-0.003794,-0.003794,...,0.005184,0.0052,0.005233,0.005221,0.005201,0.005209,0.0052,0.005069,0.004923,0.004889,0.004884,0.004844,0.004761,0.004684,0.004632,0.004606,0.004621,0.004636,0.004621,0.004621,0.004621,0.004621,0.004621,0.004621,0.004621


## Samples description

In [31]:
samples_description = pd.DataFrame.from_dict(
    description_dict,
    orient='index',
    columns=['Row_ID', 'Division', 'Size']
)

samples_description = samples_description.sort_index(
    key=lambda x: x.map(lambda y: (float(y.split('_')[1]), -float(y.split('_')[-1].replace('bulk', '1000')))),
    ascending=True
)

samples_description['Division'] = samples_description['Division'].astype(int)

print(samples_description.shape)
samples_description.head(3)

(75, 3)


Unnamed: 0,Row_ID,Division,Size
division_1_size_bulk,66,1,bulk
division_1_size_5,14,1,5
division_1_size_3,25,1,3


In [32]:
pd.DataFrame(
    samples_description
    ['Size']
    .value_counts()
).T

Size,bulk,5,3,2.5,2,0.4,0.2,25,75,70,50,4
count,10,10,10,8,8,8,7,6,3,3,1,1


### Creating column `Fraction_hue`

In [33]:
samples_description['Fraction_hue'] = (
    samples_description['Size']
    .apply(
        lambda x:
           f'$> 0$' if x.split('_')[-1] == 'bulk'
           else f'$< {x.split("_")[-1]}$'
    )
)

print(samples_description.shape)
samples_description.head(3)

(75, 4)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue
division_1_size_bulk,66,1,bulk,$> 0$
division_1_size_5,14,1,5,$< 5$
division_1_size_3,25,1,3,$< 3$


In [34]:
pd.DataFrame(
    samples_description
    ['Fraction_hue']
    .value_counts()
).T

Fraction_hue,$> 0$,$< 5$,$< 3$,$< 2.5$,$< 2$,$< 0.4$,$< 0.2$,$< 25$,$< 75$,$< 70$,$< 50$,$< 4$
count,10,10,10,8,8,8,7,6,3,3,1,1


In [35]:
fractions_count_stat_df = (
    samples_description
    .groupby('Division')
    .agg({
        'Fraction_hue': [
            ('unique_fractions', lambda x: sorted(set(x))),
            ('count', 'count'),
            ('n_unique', 'nunique')
        ]
    })
    .reset_index()
    .sort_values(by='Division', ascending=True)
)

fractions_count_stat_df.columns = ['Division', 'Fraction types', 'Number of fractions', 'Number of unique fractions']

fractions_count_stat_df

Unnamed: 0,Division,Fraction types,Number of fractions,Number of unique fractions
0,1,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 3$, $< 5$, $> 0$]",7,7
1,2,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 3$, $< 5$, $> 0$]",7,7
2,3,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 3$, $< 5$, $> 0$]",7,7
3,4,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 3$, $< 5$, $> 0$]",7,7
4,5,"[$< 25$, $< 3$, $< 5$, $< 75$, $> 0$]",5,5
5,6,"[$< 25$, $< 3$, $< 5$, $< 75$, $> 0$]",5,5
6,7,"[$< 0.4$, $< 2$, $< 2.5$, $< 25$, $< 3$, $< 5$, $< 75$, $> 0$]",8,8
7,8,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 25$, $< 3$, $< 5$, $< 70$, $> 0$]",9,9
8,9,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 25$, $< 3$, $< 5$, $< 70$, $> 0$]",9,9
9,10,"[$< 0.2$, $< 0.4$, $< 2$, $< 2.5$, $< 25$, $< 3$, $< 4$, $< 5$, $< 50$, $< 70$, $> 0$]",11,11


### Creating column `Fraction_grouped_hue`

In [36]:
samples_description['Fraction_grouped_hue'] = (
    samples_description['Size']
    .apply(
        lambda x:
            rf'$d \leq 2$' if float(x.replace('bulk', '1000')) <= 2.0
            else rf'$2 < d \leq 5$' if float(x.replace('bulk', '1000')) <= 5.0
            else rf'$d > 5$'
    )
)

print(samples_description.shape)
samples_description.head(3)

(75, 5)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue
division_1_size_bulk,66,1,bulk,$> 0$,$d > 5$
division_1_size_5,14,1,5,$< 5$,$2 < d \leq 5$
division_1_size_3,25,1,3,$< 3$,$2 < d \leq 5$


In [37]:
pd.DataFrame(
    samples_description
    ['Fraction_grouped_hue']
    .value_counts()
).T

Fraction_grouped_hue,$2 < d \leq 5$,$d > 5$,$d \leq 2$
count,29,23,23


### Creating column `target`

In [38]:
samples_description['Class'] = (
    samples_description['Fraction_grouped_hue']
    .apply(
        lambda x:
            0 if x == rf'$d \leq 2$'
            else 1 if x == rf'$2 < d \leq 5$'
            else 2
    )
)

print(samples_description.shape)
samples_description.head(3)

(75, 6)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,66,1,bulk,$> 0$,$d > 5$,2
division_1_size_5,14,1,5,$< 5$,$2 < d \leq 5$,1
division_1_size_3,25,1,3,$< 3$,$2 < d \leq 5$,1


In [39]:
pd.DataFrame(
    samples_description
    ['Class']
    .value_counts()
).T

Class,1,2,0
count,29,23,23


### Correcting column `Row_ID`

In [40]:
samples_description['Row_ID'] = range(1, samples_description.shape[0]+1)

print(samples_description.shape)
samples_description.head(3)

(75, 6)


Unnamed: 0,Row_ID,Division,Size,Fraction_hue,Fraction_grouped_hue,Class
division_1_size_bulk,1,1,bulk,$> 0$,$d > 5$,2
division_1_size_5,2,1,5,$< 5$,$2 < d \leq 5$,1
division_1_size_3,3,1,3,$< 3$,$2 < d \leq 5$,1


# Saving data

In [41]:
path_to_save = './../data/parsed_data'

In [42]:
initial_data.to_csv(f'{path_to_save}/ftir_raw_spectra.csv', sep=';')
samples_description.to_csv(f'{path_to_save}/samples_description.csv', sep=';')