In [None]:
import os
import re

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse.linalg import spsolve
import seaborn as sns
import rampy as rp
import math

import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from scipy.spatial import ConvexHull

from scipy.signal import savgol_filter
from scipy.optimize import curve_fit
from scipy.optimize import minimize
from scipy.fftpack import dct
from scipy.fftpack import idct
from scipy.ndimage import gaussian_filter1d
from sklearn.metrics import auc

import lmfit

from BaselineRemoval import BaselineRemoval
from pybaselines import Baseline, utils

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib import transforms
%matplotlib inline

# %matplotlib widget

%config InlineBackend.figure_format = 'retina'

In [None]:
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import matplotlib.patheffects as pe

def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', center_color='k', text=None, **kwargs):
    """
    Draw the covariance confidence ellipse of `x` and `y` and mark its centre.

    Parameters
    ----------
    x, y : array_like, shape (n, )
        Input data; anything `np.asarray` accepts (list, ndarray, Series).
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations that sets the ellipse radii.
    facecolor : color
        Fill colour of the ellipse patch.
    center_color : color
        Colour of the '+' marker drawn at the mean of the data.
    text : optional
        If not None, it is annotated next to the centre marker on a
        translucent white box.
    **kwargs
        Forwarded to `matplotlib.patches.Ellipse`.

    Returns
    -------
    matplotlib.patches.Ellipse
        The value returned by `ax.add_patch` for the ellipse.

    Raises
    ------
    ValueError
        If `x` and `y` do not hold the same number of elements.
    """
    # Accept plain sequences too: `.size` only exists on array-likes.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset: a unit ellipse that is later rotated,
    # scaled and translated into data coordinates.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse(
        (0, 0),
        width=ell_radius_x * 2,
        height=ell_radius_y * 2,
        facecolor=facecolor,
        **kwargs,
    )

    # Standard deviation of each axis (square root of the variance)
    # multiplied by the requested number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)

    # Centre marker: a thick white '+' underneath a thinner coloured one,
    # so the marker stays visible on top of dense scatter points.
    ax.plot(mean_x, mean_y, marker='+', color='w', ms=12, mew=4)
    ax.plot(mean_x, mean_y, marker='+', color=center_color, ms=12, mew=1.5)

    if text is not None:
        txt = ax.annotate(
            text,
            (mean_x, mean_y),
            xytext=(-5, 5),
            textcoords='offset points',
            fontsize=20,
            color='k',
            ha='left',
            va='bottom',
            bbox=dict(
                boxstyle='round,pad=0.1',
                facecolor='white',
                edgecolor='none',
                alpha=0.6,
            ),
        )
        txt.set_path_effects([
            pe.withStroke(linewidth=1, foreground='white')
        ])

    return ax.add_patch(ellipse)

In [None]:
def align_spectra_by_peak(x, df, reference_peak=812, window=10):
    """
    Correct the horizontal offset of spectra using a reference peak.

    For every column the maximum inside
    [reference_peak - window, reference_peak + window] is located and the
    whole spectrum is shifted by a whole number of samples so that this
    maximum lands on the grid point of `x` closest to `reference_peak`.
    The shift is measured in samples, so it is correct for any grid spacing.

    Parameters
    ----------
    x : array_like, shape (n, )
        Wavenumber axis shared by all spectra.
    df : pd.DataFrame
        One spectrum per column; rows correspond positionally to `x`.
    reference_peak : float
        Expected position of the reference peak (e.g. 812 cm^-1).
    window : float
        Allowed deviation when searching for the maximum (+- window).

    Returns
    -------
    pd.DataFrame
        Corrected spectra with the same columns as `df`, indexed by `x`.
        Samples that are shifted in from outside the data are zeros.

    Raises
    ------
    ValueError
        If no point of `x` falls inside the search window.
    """
    x = np.asarray(x)

    # Positions of the samples that belong to the search window.
    window_idx = np.flatnonzero(
        (x >= reference_peak - window) & (x <= reference_peak + window)
    )
    if window_idx.size == 0:
        raise ValueError(
            f"no x values within {window} of reference_peak={reference_peak}"
        )

    # Grid position the peak has to be moved to.
    ref_idx = int(np.argmin(np.abs(x - reference_peak)))

    corrected = {}
    for col in df.columns:
        y = df[col].to_numpy()

        # Position of the local maximum inside the window.
        peak_idx = int(window_idx[np.argmax(y[window_idx])])
        shift = peak_idx - ref_idx

        if shift == 0:
            corrected[col] = y
            continue

        y_shifted = np.zeros_like(y)
        if shift > 0:
            # Peak sits at a higher index: pull the samples back.
            y_shifted[:-shift] = y[shift:]
        else:
            # Peak sits at a lower index: push the samples forward.
            y_shifted[-shift:] = y[:shift]
        corrected[col] = y_shifted

    # Build the frame once instead of inserting column by column.
    return pd.DataFrame(corrected, index=x)

In [None]:
def shift_func(x, df_y, reference_peak=812, window=10):
    """
    Shift every spectrum so that its reference peak lands on `reference_peak`.

    For each column of `df_y` the maximum inside
    [reference_peak - window, reference_peak + window] is located and the
    column is shifted by a whole number of samples so that the maximum ends
    up on the grid point of `x` closest to `reference_peak`. Samples shifted
    in from outside the data are zeros.

    Parameters
    ----------
    x : array_like, shape (n, )
        Axis values shared by all spectra.
    df_y : pd.DataFrame
        One spectrum per column; rows correspond positionally to `x`.
    reference_peak : float
        Expected position of the reference peak.
    window : float
        Half-width of the interval searched for the maximum.

    Returns
    -------
    pd.DataFrame
        Shifted spectra with the index and column labels of `df_y`.

    Raises
    ------
    ValueError
        If no point of `x` falls inside the search window.
    """
    x = np.asarray(x)
    window_idx = np.flatnonzero(
        (x >= reference_peak - window) & (x <= reference_peak + window)
    )
    if window_idx.size == 0:
        raise ValueError(
            f"no x values within {window} of reference_peak={reference_peak}"
        )

    # Grid position the peak has to be moved to.
    ref_idx = int(np.argmin(np.abs(x - reference_peak)))

    shifted_columns = {}
    for i in range(df_y.shape[1]):
        y = np.asarray(df_y.iloc[:, i])

        peak_idx = int(window_idx[np.argmax(y[window_idx])])
        shift = peak_idx - ref_idx

        y_new = np.zeros_like(y)
        if shift > 0:
            y_new[:-shift] = y[shift:]
        elif shift < 0:
            y_new[-shift:] = y[:shift]
        else:
            y_new[:] = y
        shifted_columns[i] = y_new

    # Keyed by position first so duplicate column labels cannot collide.
    y_shifted = pd.DataFrame(shifted_columns, index=df_y.index)
    y_shifted.columns = df_y.columns
    return y_shifted

In [None]:
def save_data_by_one(df_x, df_y, names, out_dir=r'C:\Users\gusen\Downloads\аспер\6 сем\data\1'):
    """
    Write every spectrum to its own two-column, tab-separated ``.dat`` file.

    Parameters
    ----------
    df_x : array_like
        Shared x axis; written as the first column of every file.
    df_y : pd.DataFrame
        One spectrum per column; column ``i`` goes to ``<names[i]>.dat``.
    names : sequence of str
        File stems, at least one per column of `df_y`.
    out_dir : str
        Target directory (created if missing). Defaults to the location
        that used to be hard-coded in this function.
    """
    os.makedirs(out_dir, exist_ok=True)
    x_column = pd.Series(df_x)
    for i in range(df_y.shape[1]):
        target = os.path.join(out_dir, f'{names[i]}.dat')
        # No header and no index: the files hold bare x/y pairs.
        pd.concat((x_column, df_y.iloc[:, i]), axis=1).to_csv(
            target, sep='\t', index=False, header=False
        )


Все измерения рамана были сделаны на 25 мВт.

+ в толще на этой же мощности всё плавилось.

In [None]:
# Categorical palette; entries are picked by position (one colour per class).
colors = [
    '#ff0000', '#ff8c00', '#ffd700', '#adff2f', '#1e90ff',
    '#0000cd', '#7104ff', '#00ff7f', '#f08080', '#ff00ff',
    '#dda0dd', '#87ceeb', '#7fffd4', '#ffe4b5', '#696969',
    '#2e8b57', '#8b0000', '#808000', '#663399',
]

# Ram

In [None]:
# Dataset root. Its last path component is reused as the prefix of every
# class and sample label built by the loader below.
root_folder = r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma'
fold_name_s = re.search(r'[^\\/]+$', root_folder)
fold_name = fold_name_s[0]

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Natural-sort key: split `value` on digit runs and turn those runs into ints.

    The capturing group makes `split` keep the digit runs at the odd
    positions of the result, so 'a10' sorts after 'a2'.
    """
    return [
        int(token) if position % 2 else token
        for position, token in enumerate(numbers.split(value))
    ]

# classes     - distinct class labels in first-seen order
# classes_n   - class label of every loaded spectrum
# samle_names - per-spectrum label, also used as the column name
classes, classes_n, samle_names = [], [], []

all_dfs = []

# Layout walked here: <root_folder>/<subfolder>/<file>; anything that is not
# a directory (first level) or a regular file (second level) is ignored.
for subfolder in sorted(os.listdir(root_folder), key=numericalSort):
    subfolder_path = os.path.join(root_folder, subfolder)
    if os.path.isdir(subfolder_path):
        for file in sorted(os.listdir(subfolder_path), key=numericalSort):
            file_path = os.path.join(subfolder_path, file)
            if os.path.isfile(file_path):
                class_label = f"{fold_name}_{subfolder}"
                column_name = f"{class_label}_{os.path.splitext(file)[0]}"

                if class_label not in classes:
                    classes.append(class_label)
                classes_n.append(class_label)
                samle_names.append(column_name)

                # Second column of the tab-separated file holds the intensities.
                df = (
                    pd.read_csv(file_path, delimiter='\t', header=None)
                    .iloc[:, 1]
                    .to_frame(name=column_name)
                )
                all_dfs.append(df)

# One column per spectrum; empty frame when nothing was found.
result = pd.concat(all_dfs, axis=1) if all_dfs else pd.DataFrame()

In [None]:
# Integer code for every sample label (codes follow np.unique's sorted order).
unique_labels = np.unique(samle_names)
label_to_num = dict(zip(unique_labels, range(len(unique_labels))))
y = np.array([label_to_num[name] for name in samle_names])

# Class -> colour, assigned by the class's position in `classes`.
colors_n = {cls: colors[pos] for pos, cls in enumerate(classes)}

# Colour of every individual spectrum, in loading order.
colors_for_points = [colors_n[cls] for cls in classes_n]

In [None]:
# The x axis is taken from the first column of `file_path`, i.e. whichever
# file the loader loop visited last.
# NOTE(review): presumably all files share the same axis - confirm.
df_x = pd.read_csv(file_path, sep='\t', header=None).iloc[:, 0]
df_y = result.copy()

# Untouched copies that the range-selection cell crops from.
df_x_copy, df_y_copy = df_x.copy(), df_y.copy()

## ram range

In [None]:
# Requested wavenumber limits ...
x_begin = 300
x_end = 2000

# ... replaced by the index label of the first sample at or above each limit.
# These labels are then used as positions in `.iloc`.
# NOTE(review): that assumes df_x_copy has a default 0..n-1 index - confirm.
x_begin = df_x_copy.index[(df_x_copy >= x_begin).to_numpy()][0]
x_end = df_x_copy.index[(df_x_copy >= x_end).to_numpy()][0]

# Crop both ends inclusively and restart the index at 0.
df_x = df_x_copy.iloc[x_begin:x_end + 1].reset_index(drop=True)
df_y = df_y_copy.iloc[x_begin:x_end + 1, :].reset_index(drop=True)

In [None]:
# df_norm = df_y.copy()
# for i in range(df_norm.shape[1]):
#     df_norm.iloc[:, i] = (df_norm.iloc[:, i] - df_norm.iloc[:, i].mean()) / df_norm.iloc[:, i].std()

In [None]:
# datas = pd.DataFrame(columns=['class', 'name'] + list(df_x))
# datas['class'] = classes_mw
# datas['name'] = samle_names
# datas.iloc[:, 2:] = df_y.T.reset_index(drop=True)
# datas.to_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma_init.csv', index=None, sep=';')

In [None]:
baseline_fitter = Baseline(df_x, check_finite=False)

# Alternatives that were tried instead of rubberband (all pybaselines):
#   swima(min_half_window=60, smooth_half_window=10)
#   asls(lam=1e1, p=0.0001)
#   pspline_arpls(lam=1e3, num_knots=2, diff_order=2, max_iter=50)
#   arpls(lam=1e10, diff_order=2, max_iter=500)
#   drpls(lam=1e12, eta=0.1, diff_order=2, max_iter=50)
#   airpls(lam=1e6, max_iter=50, tol=1e-3)
df_bg = pd.DataFrame()
df_f = pd.DataFrame()
for i in range(df_y.shape[1]):
    name = samle_names[i]
    spectrum = df_y.iloc[:, i]

    # Estimated background, then the background-free spectrum.
    df_bg[name] = baseline_fitter.rubberband(spectrum, smooth_half_window=25)[0]
    df_f[name] = spectrum - df_bg.iloc[:, i]

# Negative residuals left by the baseline fit are set to zero.
df_f = df_f.mask(df_f < 0, 0)

# Per spectrum: Gaussian smoothing (sigma = 10 samples), then scaling by the
# maximum of the smoothed curve.
for i in range(df_f.shape[1]):
    smoothed = gaussian_filter1d(df_f.iloc[:, i], 10)
    df_f.iloc[:, i] = smoothed / max(smoothed)

plt.plot(df_x, df_f)
plt.grid()
plt.show()

In [None]:
# cc = r'C:\Users\gusen\Downloads\аспер\6 сем\data\4'
# fold_name_s = re.search(r'[^\\/]+$', root_folder)
# fold_name = fold_name_s.group(0)

# numbers = re.compile(r'(\d+)')
# def numericalSort(value):
#     parts = numbers.split(value)
#     parts[1::2] = map(int, parts[1::2])
#     return parts

# all_dfs = []

# for file in sorted(os.listdir(cc), key=numericalSort):
#     file_path = os.path.join(cc, file)
#     if not os.path.isfile(file_path):
#         continue
#     df = pd.DataFrame()
#     df[file[:-4]] = pd.read_csv(file_path, delimiter='\t', header=None).iloc[:, 1]

#     all_dfs.append(df)

# if all_dfs:
#     result = pd.concat(all_dfs, axis=1)

# df_f = result.copy()
# plt.plot(df_x, df_f.iloc[:, :])
# plt.grid()
# plt.show()

In [None]:
ref_peak = 812

# Alignment only runs when the reference position is an exact grid point.
if (df_x.values == ref_peak).any():
    df_f = align_spectra_by_peak(df_x, df_f, reference_peak=ref_peak, window=10)

    # Zoom on the reference peak to inspect the result.
    plt.plot(df_x, df_f)
    plt.xlim((810, 814))
    plt.ylim((0.975, 1.025))
    plt.grid()
    plt.show()

In [None]:
# Light Gaussian smoothing (sigma = 2 samples) of the baseline-corrected spectra.
df_smooth = pd.DataFrame()

# Iterate over the frame that is actually indexed (df_f).
for i in range(df_f.shape[1]):
    df_smooth[samle_names[i]] = gaussian_filter1d(df_f.iloc[:, i], 2)

plt.plot(df_x, df_smooth.iloc[:, :])
plt.show()  # was `plt.show` without the call, which only echoed the function object

### normalization

In [None]:
df_norm = pd.DataFrame()

# 'range': divide by the maximum inside `peak_window`
#          (by the global maximum when the window holds no samples);
# 'snv'  : subtract the mean and divide by the standard deviation.
norm_var = 'range'

peak_window = (1100, 1143)   # alternative tried: (750, 900)
xy = np.asarray(df_x)
x_min, x_max = peak_window
mask = (xy >= x_min) & (xy <= x_max)

for i in range(df_smooth.shape[1]):
    spectrum = df_smooth.iloc[:, i]
    if norm_var == 'range':
        reference = df_smooth.iloc[mask, i] if mask.any() else spectrum
        df_norm[samle_names[i]] = spectrum / max(reference)
    elif norm_var == 'snv':
        df_norm[samle_names[i]] = (spectrum - spectrum.mean()) / spectrum.std()

plt.plot(df_x, df_norm)

plt.xlabel('$wavenumber, cm^{-1}$')
plt.ylabel('$a.u.$')
plt.show()

In [None]:
# This mini script changes the number of points per spectrum: every column of
# df_norm is resampled onto a uniform 501-point grid by cubic interpolation.

from scipy.interpolate import interp1d

# np.asarray makes the cell safe to re-run: after the first run df_x is
# already an ndarray and no longer has `.iloc`.
x = np.asarray(df_x)
new_x = np.linspace(x[0], x[-1], 501)

df_y_mod = pd.DataFrame()

for i in range(df_norm.shape[1]):
    y = df_norm.iloc[:, i]
    f_interp = interp1d(x, y, kind='cubic')
    new_y = f_interp(new_x)
    df_y_mod[df_norm.columns[i]] = new_y

# From here on the axis and the normalised spectra live on the new grid.
df_x = new_x.copy()
df_norm = df_y_mod.copy()

In [None]:
# df_norm_2 = df_norm.copy()
# corrs = pd.DataFrame(columns=df_x, index=(0,1))
# for j in range(df_norm_2.shape[0]):
#     for i in range(df_norm_2.shape[0]):
#         df_norm_2.iloc[i, :] = df_norm_2.iloc[i, :] / df_norm_2.iloc[i, :].max()
#     df_norm_2.loc[len(df_norm_2), :] = classes_mw
#     j_corr = df_norm_2.T.corr()
#     corrs[df_x[j]] = j_corr.iloc[-1, :].mean()
#     print(j)

In [None]:
# corrs[corrs>0.66].iloc[0, :]

In [None]:
# One row per spectrum plus its class label, ready for groupby.
labled_df = df_norm.T.assign(**{'class': classes_n})

In [None]:
# Area under the last resampled spectrum. The column used to be picked via
# the loop variable `i` left over from the resampling cell; that is the last
# column on a top-to-bottom run, now selected explicitly.
auc(df_x, df_y_mod.iloc[:, -1])

In [None]:
# Single-row table: area under each resampled spectrum, keyed by sample name.
auc_all = pd.DataFrame()

for i, sample in zip(range(df_y_mod.shape[1]), df_norm.columns):
    auc_all.loc[0, sample] = auc(df_x, df_y_mod.iloc[:, i])

In [None]:
# df_norm.T.to_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma_3.csv', index=None, header=None, sep=';')

In [None]:
# datas = pd.DataFrame(columns=['class', 'name'] + list(df_x))
# datas['class'] = classes_n
# datas['name'] = samle_names
# datas.iloc[:, 2:] = df_norm.T.reset_index(drop=True)
# datas.to_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma_2.csv', index=None, sep=';')

## avg graphs

In [None]:
# Class-averaged spectra, one row per class.
mean_spectra = pd.DataFrame()
groupped_df = labled_df.groupby('class').mean()
for i in classes:
    # Re-collect in the order of `classes` rather than in groupby's order.
    mean_spectra[i] = groupped_df.T[i]

mean_spectra = mean_spectra.T.copy()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

# Stacked view: each curve is offset by 0.5 from its neighbour.
# NOTE: hard-wired to the first 8 classes.
for i in range(8):
    plt.plot(df_x, mean_spectra.iloc[i, :]+(5-i)*0.5, c=colors[i], label=mean_spectra.index[i], linewidth=2)

# plt.xlim((700, 1800))
ax.set_yticklabels([])
# Raw strings: '\ ' is a mathtext space, not a Python escape sequence.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=25)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=25)
plt.xticks(fontsize=20)
# plt.legend(bbox_to_anchor=(1.23, 1))
plt.show()

In [None]:
# Class-averaged spectra, one row per class.
mean_spectra = pd.DataFrame()
groupped_df = labled_df.groupby('class').mean()
for i in classes:
    # Re-collect in the order of `classes` rather than in groupby's order.
    mean_spectra[i] = groupped_df.T[i]

mean_spectra = mean_spectra.T.copy()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

# Text printed at the right end of each curve.
textt =['4', '6', '8', '10', '12', '14', '16', '0']
# textt =['4 мВт', '6 мВт', '8 мВт', '10 мВт', '12 мВт', '14 мВт', '16 мВт', 'пмма']

# NOTE: hard-wired to 8 classes; column 494 is the sample whose height
# anchors the text placed at x = 1940.
for i in range(8):
    if i == 7:
        # The last class is drawn at the very bottom of the stack.
        plt.plot(df_x, mean_spectra.iloc[i, :]-3, c=colors[i], label=mean_spectra.index[i], linewidth=2)
        plt.text(1940, mean_spectra.iloc[i, 494]-2.9, textt[i], c='k', fontsize=20 )
    else:
        plt.plot(df_x, mean_spectra.iloc[i, :]-(5-i)*0.5, c=colors[i], label=mean_spectra.index[i], linewidth=2)
        plt.text(1940, mean_spectra.iloc[i, 494]-(4.9-i)*0.5, textt[i], c='k', fontsize=20 )

# plt.xlim((700, 1800))
ax.set_yticklabels([])
# Raw strings: '\ ' is a mathtext space, not a Python escape sequence.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=25)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=25)
plt.xticks(fontsize=20)
# plt.legend(bbox_to_anchor=(1.23, 1))
plt.show()

In [None]:
# Class-averaged spectra, one row per class.
mean_spectra = pd.DataFrame()
groupped_df = labled_df.groupby('class').mean()
for i in classes:
    # Re-collect in the order of `classes` rather than in groupby's order.
    mean_spectra[i] = groupped_df.T[i]

mean_spectra = mean_spectra.T.copy()
fig, ax = plt.subplots()
fig.set_size_inches(10, 6)

# Hand-tuned vertical offsets; every class not listed here uses i * 0.2.
# NOTE: hard-wired to 8 classes.
special_offsets = {0: -0.05, 2: 2*0.23, 6: 6*0.22, 7: -0.2}
for i in range(8):
    offset = special_offsets.get(i, i*0.2)
    plt.plot(df_x, mean_spectra.iloc[i, :]+offset, c=colors[i], label=mean_spectra.index[i])

# Manually placed curve labels: (x, y, text).
for x_pos, y_pos, label in (
    (1830, 0.3, 4),
    (1810, 0.54, 6),
    (1830, 0.72, 8),
    (1830, 1.03, 10),
    (1830, 1.27, 12),
    (1810, 1.55, 14),
    (1830, 1.7, 16),
    (1790, 0.1, 'пмма'),
):
    plt.text(x_pos, y_pos, label, c='k', fontsize=20)

plt.xlim((1650, 1950))
plt.ylim((-.2, 2.9))
ax.set_yticklabels([])
# Raw strings: '\ ' is a mathtext space, not a Python escape sequence.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=20)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=20)
plt.xticks(fontsize=20)
# plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# for i in range(mean_spectra.shape[0]):
    # df_sp = pd.concat((pd.Series(df_x), mean_spectra.iloc[i, :].T), axis=1)
    # # plt.plot(df_sp.iloc[:, 0], df_sp.iloc[:, 1])
    # df_sp.to_csv(rf'C:\Users\gusen\Downloads\аспер\5сем\data\ram\спектры для диапазона 1650-1950\{mean_spectra.index[i]}.dat', sep='\t', header=None, index=None)

In [None]:
# Interactive overlay of the class-averaged spectra (no vertical offset).
fig = go.Figure()

for i in range(len(mean_spectra)):
    trace = go.Scatter(
        x=df_x,
        y=mean_spectra.iloc[i, :],
        mode="lines",
        name=classes[i],
        line={'color': colors[i]},
    )
    fig.add_trace(trace)

layout_options = {
    'width': 800,
    'height': 800,
    'title': "Mean spectra by class",
    'xaxis_title': "Wavenumber, cm⁻¹",
    'yaxis_title': "a.u.",
    'legend_title': "Классы",
}
fig.update_layout(**layout_options)

# Optional X-axis limits:
# fig.update_xaxes(range=[700, 1800], dtick=200)

fig.show()

In [None]:
import plotly.graph_objects as go

# Interactive stacked view: curve i is offset by (5 - i) * 0.2.
fig = go.Figure()

for i in range(len(mean_spectra)):
    trace = go.Scatter(
        x=df_x,
        y=mean_spectra.iloc[i, :] + (5 - i) * 0.2,
        mode="lines",
        name=classes[i],
        line={'color': colors[i]},
    )
    fig.add_trace(trace)

layout_options = {
    'width': 800,
    'height': 800,
    'title': "Mean spectra by class",
    'xaxis_title': "Wavenumber, cm⁻¹",
    'yaxis_title': "a.u.",
    'legend_title': "Классы",
}
fig.update_layout(**layout_options)

# Optional X-axis limits:
# fig.update_xaxes(range=[700, 1800], dtick=200)

fig.show()

In [None]:
# (pd.concat((pd.Series(df_x), mean_spectra.iloc[-2, :] - mean_spectra.iloc[-1, :]), axis=1)).to_csv(r'C:\Users\gusen\Downloads\аспер\6 сем\data\1\diff_sp.csv', sep='\t', index=None, header=None )

## pca

In [None]:
fig, ax = plt.subplots(figsize=(8,6))

# PCA on the resampled spectra (rows = samples after the transpose).
# random_state pins the result whenever scikit-learn selects a randomized solver.
# Input variants tried earlier: df_norm.T, StandardScaler / MinMaxScaler on df_y_mod.T.
pca = PCA(n_components=20, random_state=0)
pca_1 = pca.fit_transform((df_y_mod.iloc[:, :].T))

# 1-based numbers of the components shown on the x and y axes.
pc_1 = 1
pc_2 = 2

scatter = plt.scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')

# Samples per class, reordered to follow `classes`; the cumulative sums give
# the row ranges of each class in pca_1.
values, counts = np.unique(classes_n, return_counts=True)

order = np.argsort([classes.index(v) for v in values])
values = values[order]
counts = counts[order]

n_uniq = len(values)
cum_sum = np.cumsum(counts)
# Optional per-class confidence ellipses:
# for i in range(n_uniq):
#     start = 0 if i == 0 else cum_sum[i-1]
#     confidence_ellipse(pca_1[start:cum_sum[i], pc_1-1], pca_1[start:cum_sum[i], pc_2-1], ax,
#                        n_std=1 if i == 6 else 2, edgecolor=colors[i], facecolor=colors[i],
#                        center_color=colors[i], alpha=0.1, text=i+1)

# Raw f-strings: '\ ' is a mathtext space, not a Python escape sequence.
plt.xlabel(rf'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
plt.ylabel(rf'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)
# plt.title('$PCA,\ раман$')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Legend handles (one marker per class); the legend itself is switched off.
legend_elements = [Line2D([0], [0], marker='o', color='w', label=cls,
                          markerfacecolor=color, markersize=10, markeredgecolor='k')
                   for cls, color in colors_n.items()]

# plt.legend(handles=legend_elements, title="Классы")

plt.show()

In [None]:
# fig, ax = plt.subplots(figsize=(8,6))

# pca = PCA(n_components=20)
# # pca_1 = pca.fit_transform(df_norm.iloc[:, :].T) #1195
# # pca_1 = pca.fit_transform((df_y_mod.iloc[:, :].T))
# # pca_1 = pca.fit_transform(StandardScaler().fit_transform(df_y_mod.iloc[:, :].T))
# pca_1 = pca.fit_transform(MinMaxScaler().fit_transform(df_y_mod.iloc[:, :].T))

# # stsc = StandardScaler().fit_transform(df_y_mod.iloc[:, :].T)
# # pca_1 = pca.fit_transform(stsc)
# # pca_1 = pca.fit_transform(pd.concat((df_y_mod.iloc[:3400, :], df_y_mod.iloc[5000:, :]), axis=0).T)

# pc_1 = 1
# pc_2 = 2

# scatter = plt.scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')
# # plt.text(np.mean(pca_1[0+10*i:10+10*i, 0]), np.mean(pca_1[0+10*i:10+10*i, 1]), IR_files_names[i][3:7], bbox=dict(facecolor='none', edgecolor=colors[i], boxstyle='round'))

# values, counts = np.unique(classes_n, return_counts=True)

# order = np.argsort([classes.index(v) for v in values])
# values = values[order]
# counts = counts[order]

# n_uniq = len(values)
# cum_sum = np.cumsum(counts)
# for i in range(n_uniq):
#        if i == 0:
#               confidence_ellipse(pca_1[0:cum_sum[i], pc_1-1], pca_1[0:cum_sum[i], pc_2-1], ax, n_std=2, edgecolor=colors[i], facecolor=colors[i], center_color=colors[i], alpha=0.1, text=textt[i])
#        elif i == 6:
#               confidence_ellipse(pca_1[cum_sum[i-1]:cum_sum[i], pc_1-1], pca_1[cum_sum[i-1]:cum_sum[i], pc_2-1], ax, n_std=1, edgecolor=colors[i], facecolor=colors[i], center_color=colors[i], alpha=0.1, text=textt[i])
#        else:
#               confidence_ellipse(pca_1[cum_sum[i-1]:cum_sum[i], pc_1-1], pca_1[cum_sum[i-1]:cum_sum[i], pc_2-1], ax, n_std=2, edgecolor=colors[i], facecolor=colors[i], center_color=colors[i], alpha=0.1, text=textt[i])

# # confidence_ellipse(pca_1[0:15, pc_1-1], pca_1[0:15, pc_2-1], ax, n_std=2, edgecolor=colors[0], facecolor=colors[0], center_color=colors[0], alpha=0.2)
# # confidence_ellipse(pca_1[15:26, pc_1-1], pca_1[15:26, pc_2-1], ax, n_std=2, edgecolor=colors[1], facecolor=colors[1], center_color=colors[1], alpha=0.2)
# # confidence_ellipse(pca_1[26:38, pc_1-1], pca_1[26:38, pc_2-1], ax, n_std=2, edgecolor=colors[2], facecolor=colors[2], center_color=colors[2], alpha=0.2)
# # confidence_ellipse(pca_1[38:46, pc_1-1], pca_1[38:46, pc_2-1], ax, n_std=2, edgecolor=colors[3], facecolor=colors[3], center_color=colors[3], alpha=0.2)
# # confidence_ellipse(pca_1[46:58, pc_1-1], pca_1[46:58, pc_2-1], ax, n_std=2, edgecolor=colors[4], facecolor=colors[4], center_color=colors[4], alpha=0.2)
# # confidence_ellipse(pca_1[58:69, pc_1-1], pca_1[58:69, pc_2-1], ax, n_std=2, edgecolor=colors[5], facecolor=colors[5], center_color=colors[5], alpha=0.2)
# # confidence_ellipse(pca_1[69:82, pc_1-1], pca_1[69:82, pc_2-1], ax, n_std=2, edgecolor=colors[6], facecolor=colors[6], center_color=colors[6], alpha=0.2)
# # confidence_ellipse(pca_1[82:93, pc_1-1], pca_1[82:93, pc_2-1], ax, n_std=2, edgecolor=colors[7], facecolor=colors[7], center_color=colors[7], alpha=0.2)

# # plt.xlabel(f'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
# # plt.ylabel(f'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)
# plt.xlabel(f'${pc_1}я\ главная\ компонента:$ {55.91}%', fontsize=17)
# plt.ylabel(f'${pc_2}я\ главная\ компонента:$ {16.38}%', fontsize=17)
# # plt.title('$PCA,\ раман$')
# plt.xticks(fontsize=15)
# plt.yticks(fontsize=15)

# legend_elements = [Line2D([0], [0], marker='o', color='w', label=cls,
#                           markerfacecolor=color, markersize=10, markeredgecolor='k')
#                    for cls, color in colors_n.items()]

# # plt.legend(handles=legend_elements, title="Классы")

# plt.show()

In [None]:
# pls_binary = PLSRegression(n_components=10)
# # Fit and transform the data
# # X_pls = pls_binary.fit_transform(df_y_mod.T, classes_mw)[0]
# X_pls = pls_binary.fit_transform(StandardScaler().fit_transform(df_y_mod).T, classes_mw)[0]

# pc_1 = 1
# pc_2 = 2

# plt.scatter(X_pls[:, pc_1-1], X_pls[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')
# # for i in range(10):
# #     plt.scatter(X_pls[0+10*i:10+10*i, 1], X_pls[0+10*i:10+10*i, 2], c=colors[i], s=100, edgecolors='k')
# #     plt.text(np.mean(X_pls[0+10*i:10+10*i, 1]), np.mean(X_pls[0+10*i:10+10*i, 2]), paper_Y[i], bbox=dict(facecolor='none', edgecolor=colors[i], boxstyle='round'))

# plt.xlabel('Latent Variable 1')
# plt.ylabel('Latent Variable 2')
# # plt.legend(labplot,loc='lower left')
# plt.title('PLS, Raman')
# plt.show()

In [None]:
# import plotly.express as px

# # PCA
# pca = PCA(n_components=10)
# pca_1 = pca.fit_transform(df_y_mod.T)  # Транспонируем, как у тебя

# pc_1 = 1
# pc_2 = 2

# # Делаем DataFrame для удобства
# df_plot = pd.DataFrame({
#     f"PC{pc_1}": pca_1[:, pc_1-1],
#     f"PC{pc_2}": pca_1[:, pc_2-1],
#     "class": [cls for cls in colors_for_points],  # твоя раскраска
#     'name': samle_names
# })

# # Рисуем
# fig = px.scatter(
#     df_plot,
#     x=f"PC{pc_1}",
#     y=f"PC{pc_2}",
#     color="class",
#     title="PCA",
#     hover_name='name',
#     labels={
#         f"PC{pc_1}": f"PC{pc_1} ({pca.explained_variance_ratio_[pc_1-1]*100:.2f}%)",
#         f"PC{pc_2}": f"PC{pc_2} ({pca.explained_variance_ratio_[pc_2-1]*100:.2f}%)"
#     }
# )

# fig.update_traces(marker=dict(size=12, line=dict(width=1, color='black')))
# fig.update_layout(legend_title="Классы")

# fig.show()


In [None]:
# n_components = 10

# # total_var = pca.explained_variance_ratio_.sum() * 100

# labels = {str(i): f"PC {i+1}" for i in range(n_components)}

# fig = px.scatter_matrix(
#     pca_1,
#     # color = colors_for_points,
#     # color=c_selected,
#     color=classes_n,
#     color_discrete_map=colors_n,
#     dimensions=range(n_components),
#     labels=labels,
#     # title=f'Total Explained Variance: {total_var:.2f}%', 
#     width=2500, height=2000
# )

# # fig.update_layout(plot_bgcolor='white' )
# # fig.update_xaxes(zeroline=True, zerolinewidth=1, zerolinecolor='LightPink')
# # fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='LightPink')
# # fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightPink')
# # fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightPink')


# fig.update_traces(diagonal_visible=False)
# fig.show()

In [None]:
# import umap
# # embedding = umap.UMAP(n_neighbors=7, random_state=152).fit_transform(df_norm.iloc[:, :].T)
# embedding = umap.UMAP(n_neighbors=7, random_state=2).fit_transform(df_y_mod.iloc[:, :].T)


# plt.figure(figsize=(12,10))
# scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=colors_for_points , s=100, edgecolors='k')
# # scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=c_selected , s=100, edgecolors='k')
# legend_elements = [Line2D([0], [0], marker='o', color='w', label=cls,
#                           markerfacecolor=color, markersize=10, markeredgecolor='k')
#                    for cls, color in colors_n.items()]

# plt.legend(handles=legend_elements, title="Классы")
# plt.show()

In [None]:
# pcs_1 = pd.concat((pd.Series(df_x), pd.Series(pca.components_[pc_1-1])), axis=1)
# pcs_1
# pcs_1.to_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\ram\785 loading\pc_1.dat', sep='\t', index=None, header=None)

# pcs_2 = pd.concat((pd.Series(df_x), pd.Series(pca.components_[pc_2-1])), axis=1)
# pcs_2
# pcs_2.to_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\ram\785 loading\pc_2.dat', sep='\t', index=None, header=None)

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# 1-based component numbers; pc_3 is only used by the disabled third curve.
pc_1 = 1
pc_2 = 2
pc_3 = 4

# Loadings of the fitted `pca`, lightly smoothed (sigma = 2 samples) for display.
plt.plot(df_x, gaussian_filter1d(pca.components_[pc_1-1], 2), label=f'pc {pc_1}')
# plt.plot(df_x, savgol_filter(pca.components_[pc_1-1], 95, 3), label=f'pc {pc_1}')
plt.plot(df_x, gaussian_filter1d(pca.components_[pc_2-1], 2), label=f'pc {pc_2}')
plt.axhline(0, color="black", linewidth=1, linestyle="--")
# plt.plot(df_x, gaussian_filter1d(pca.components_[pc_3-1], 2), label=f'pc {pc_3}')

# Manually placed curve labels.
plt.text(1600, 0.065, 1, c= 'k', fontsize=23)
plt.text(1600, -0.04, 2, c='k', fontsize=23)

# plt.xlim([800, 4000])
# plt.legend()

# Raw strings: '\ ' is a mathtext space, not a Python escape sequence.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=20)
plt.ylabel(r'$Нагрузки\ главных\ компонент,\ усл.\ ед.$', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Scree plot: per-component explained-variance ratio (bars) with the cumulative
# sum (line) for the first `n_comps` components of the fitted `pca`.
n_comps = 10
component_numbers = range(1, n_comps+1)
expl_var = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8,4))

ax.bar(component_numbers, pca.explained_variance_ratio_[:n_comps], align='center',
       label='доля дисперсии описанной отдельной компоненты', color = 'lightsalmon')

# Print the cumulative percentage slightly below-left of each line marker.
for x, y in zip(component_numbers, expl_var[:n_comps]):
    ax.annotate(f"{round(y * 100):.0f}%", (x - 0.1, y - 0.07), fontsize=13)

ax.plot(component_numbers, expl_var[:n_comps], label='Куммулятивная объясненная дисперсия', marker="o")
ax.grid(alpha=0.3)

ax.set_xlabel('Главные компоненты', fontsize=15)
ax.set_ylabel('Объясненная дисперсия', fontsize=15)
# plt.title('Куммулятивная объясненная дисперсия')

plt.xticks(component_numbers, fontsize=15)
plt.yticks(fontsize=15)
# plt.legend(loc='center right', fontsize=13)

# Hand-placed curve markers: 1 = cumulative line, 2 = bars.
ax.text(2.3, 0.8, 1, c= 'k', fontsize=20)
ax.text(2.3, 0.18, 2, c='k', fontsize=20)

plt.show()

In [None]:
# Grid of the first `nums` PCA loading vectors; each panel also shows the last row
# of `mean_spectra` on a secondary y-axis for visual reference.
nums = 6
rows = 3
cols = math.ceil(nums / rows)
fig, axes = plt.subplots(rows, cols, figsize=(20, 10))
axes = axes.flatten()
for i in range(nums):
    panel = axes[i]
    component_label = f"Компонента {i + 1}"
    # panel.plot(df_x, gaussian_filter1d(pca.components_[i], 10), label=component_label)
    panel.plot(df_x, pca.components_[i], label=component_label)
    panel.axhline(0, color="black", linewidth=1, linestyle="--")
    panel.set_title(f"{component_label}: {pca.explained_variance_ratio_[i]*100:.2f}%")
    panel.grid(True)

    # Reference spectrum on its own scale.
    ax2 = panel.twinx()
    ax2.plot(df_x, mean_spectra.iloc[-1, :], color="red", linestyle="-", label="pmma", alpha=0.4)

    panel.minorticks_on()
    panel.grid(True, which="minor", linestyle=":", linewidth=0.5, alpha=0.5)

    # One legend that combines the handles of both y-axes.
    lines1, labels1 = panel.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    panel.legend(lines1 + lines2, labels1 + labels2)
fig.suptitle('pca компоненты рамана снятого на 785 нм')
plt.tight_layout()
plt.show()

In [None]:
# classes_to_select = ['pmma_4_mw', 'pmma_6_mw', 'pmma_8_mw', 'pmma_10_mw', 'pmma_12_mw', 'pmma_14_mw', 'pmma_16_mw']

# selected_columns = [col for col in df_norm.columns if any(col.startswith(cls) for cls in classes_to_select)]
# # subset_names = [col for col in df_norm.columns if any(col.startswith(cls) for cls in classes_to_select)]

# c_selected = [i for en,i in enumerate(colors_for_points) if en<82]

# df_subset = df_norm.iloc[:, :82]
# # df_subset

In [None]:
# fig, ax = plt.subplots(figsize=(8,6))

# pca = PCA(n_components=10)
# pca_1 = pca.fit_transform(df_subset.iloc[:, :].T) #1195

# pc_1 = 1
# pc_2 = 2

# scatter = plt.scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=c_selected , s=100, edgecolors='k')
# # plt.text(np.mean(pca_1[0+10*i:10+10*i, 0]), np.mean(pca_1[0+10*i:10+10*i, 1]), IR_files_names[i][3:7], bbox=dict(facecolor='none', edgecolor=colors[i], boxstyle='round'))
# # confidence_ellipse(pca_1[0+10*i:10+10*i, pc_1-1], pca_1[0+10*i:10+10*i, pc_2-1], ax, n_std=2, edgecolor=colors[i], facecolor=colors[i], alpha=0.2)

# plt.xlabel(f'${pc_1}\ pc:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%')
# plt.ylabel(f'${pc_2}\ pc:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%')
# plt.title('$PCA$')

# legend_elements = [Line2D([0], [0], marker='o', color='w', label=cls,
#                           markerfacecolor=color, markersize=10, markeredgecolor='k')
#                    for cls, color in colors_n.items()]

# plt.legend(handles=legend_elements, title="Классы")

# plt.show()

In [None]:
# fig, ax = plt.subplots(figsize=(8,6))

# pca = PCA(n_components=10)
# pca_1 = pca.fit_transform(df_subset.iloc[:, :].T) #1195

# pc_1 = 1
# pc_2 = 2

# plt.scatter(pca_1[:10, pc_1-1], pca_1[:10, pc_2-1], c=colors[0], s=100, edgecolors='k', label='pmma_16_mw')
# plt.scatter(pca_1[10:, pc_1-1], pca_1[10:, pc_2-1], c=colors[1], s=100, edgecolors='k', label='pmma_16_mw_rev')
# # plt.text(np.mean(pca_1[0+10*i:10+10*i, 0]), np.mean(pca_1[0+10*i:10+10*i, 1]), IR_files_names[i][3:7], bbox=dict(facecolor='none', edgecolor=colors[i], boxstyle='round'))
# # confidence_ellipse(pca_1[0+10*i:10+10*i, pc_1-1], pca_1[0+10*i:10+10*i, pc_2-1], ax, n_std=2, edgecolor=colors[i], facecolor=colors[i], alpha=0.2)

# plt.xlabel(f'${pc_1}\ pc:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%')
# plt.ylabel(f'${pc_2}\ pc:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%')
# plt.title('$PCA$')

# plt.legend()

# plt.show()

## Difference spectra

In [None]:
# Difference spectra: every normalised spectrum minus the mean spectrum of the
# 'pmma_clean_pmma' class. The frame is built in one step instead of inserting
# columns one at a time, which fragments the DataFrame and makes pandas emit a
# PerformanceWarning once many columns have been added.
reference_spectrum = mean_spectra.T['pmma_clean_pmma']
diff_spectra = pd.DataFrame({
    samle_names[i]: df_norm[samle_names[i]] - reference_spectrum
    for i in range(df_norm.shape[1])
})

In [None]:
# Overlay of all individual difference spectra on one set of axes.
ax = plt.gca()
ax.plot(df_x, diff_spectra)

ax.set_xlabel('$wavenumber, cm^{-1}$')
ax.set_ylabel('$a.u.$')
plt.show()

In [None]:
df_x

In [None]:
# Class-averaged difference spectra, drawn with a vertical offset per class.
labled_diff_df = diff_spectra.T.copy()
labled_diff_df['class'] = classes_n

# Mean over the samples of each class, re-ordered to follow `classes`.
mean_diff_spectra = pd.DataFrame()
groupped_diff_df = labled_diff_df.groupby('class').mean()
for i in classes:
    mean_diff_spectra[i] = groupped_diff_df.T[i]

mean_diff_spectra = mean_diff_spectra.T.copy()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

# a = df_x[1500].index[0]

# Only the first seven rows are drawn; each curve is shifted by (5.1 - i) * 0.2.
for i in range(7):
    plt.plot(df_x, mean_diff_spectra.iloc[i, :]-(5.1-i)*0.2, c=colors[i], label=mean_diff_spectra.index[i], linewidth=2)
    # plt.axhline((5-i)*0.2, color=colors[i], linewidth=1, linestyle="--", alpha=0.5)
    # plt.text(1650, 0-(5-i)*0.5, classes[i], c='k' )

# Hand-placed curve labels (data coordinates).
plt.text(2000, 0.25, '16', c='k', fontsize=20)
plt.text(2000, 0, '14', c='k', fontsize=20)
plt.text(2000, -0.2, '12', c='k', fontsize=20)
plt.text(2000, -0.37, '10', c='k', fontsize=20)
plt.text(2000, -0.6, '8', c='k', fontsize=20)
plt.text(2000, -0.8, '6', c='k', fontsize=20)
plt.text(2000, -0.99, '4', c='k', fontsize=20)

# plt.xlim((700, 1800))
# ax.set_xticks(np.arange(800, 4000, 200))
# plt.grid(alpha=0.3, which='both')
# plt.minorticks_on()
plt.yticks([])
# Raw strings keep the mathtext "\ " spaces without an invalid-escape warning.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=25)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=25)
# plt.legend(bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
# for i in range(mean_diff_spectra.shape[0]-1):
#     df_sp = pd.concat((pd.Series(df_x), mean_diff_spectra.iloc[i, :].T), axis=1)
#     # plt.plot(df_sp.iloc[:, 0], df_sp.iloc[:, 1])
#     df_sp.to_csv(rf'C:\Users\gusen\Downloads\аспер\5сем\data\ram\разностные спектры\{mean_diff_spectra.index[i]}.dat', sep='\t', header=None, index=None)

In [None]:
# Class-averaged difference spectra without offset, with numbered leader lines.
labled_diff_df = diff_spectra.T.copy()
labled_diff_df['class'] = classes_n

# Mean over the samples of each class, re-ordered to follow `classes`.
mean_diff_spectra = pd.DataFrame()
groupped_diff_df = labled_diff_df.groupby('class').mean()
for i in classes:
    mean_diff_spectra[i] = groupped_diff_df.T[i]

mean_diff_spectra = mean_diff_spectra.T.copy()
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

# a = df_x[1500].index[0]

for i in range(7):
    plt.plot(df_x, mean_diff_spectra.iloc[i, :], c=colors[i], label=mean_diff_spectra.index[i], linewidth=2)

    # plt.text(1650, df_means_smooth.iloc[660, i]+(5-i)*0.5, classes[i], c='k' )
# plt.axhline(0,  color='b', linewidth=1, linestyle="--", alpha=0.5)
# Zero reference line, drawn only between x = 300 and x = 2000.
ax.plot([300, 2000], [0, 0], color='k', linewidth=2, linestyle='--', alpha=0.5)

# plt.text(1650, mean_diff_spectra.iloc[0, 397], 1, c= 'k', fontsize=17)
# plt.text(1650, mean_diff_spectra.iloc[1, 397], 2, c='k', fontsize=17)
# plt.text(1650, mean_diff_spectra.iloc[2, 397]-0.03, 3, c='k', fontsize=17)
# plt.text(1520, mean_diff_spectra.iloc[3, 368]+0.35, 4, c='k', fontsize=17)
# plt.text(1520, mean_diff_spectra.iloc[4, 368]+0.4, 5, c='k', fontsize=17)
# plt.text(1520, mean_diff_spectra.iloc[5, 368]+0.37, 6, c='k', fontsize=17)
# plt.text(1520, mean_diff_spectra.iloc[6, 368]+0.45, 7, c='k', fontsize=17)

arrow_kw = dict(arrowstyle='-', color='k', lw=1.5, shrinkA=0, shrinkB=0)
annotate_kw = dict(textcoords='data', fontsize=17, color='k', ha='left', va='center',
                   arrowprops=arrow_kw)

# x-coordinate shared by the leader-line tips of curves 4-7.
x2 = df_x.iloc[335] if hasattr(df_x, "iloc") else df_x[335]

# (label, leader-line tip, label position), all in data coordinates.
# NOTE(review): the tips of curves 4 and 7 read the curve at columns 334 and 342
# while x2 is taken at position 335 - confirm these offsets are intentional.
curve_annotations = [
    ('1', (1555, 0.1), (1470, -0.07)),
    ('2', (1590, 0.04), (1470, -0.14)),
    ('3', (1660, -0.03), (1470, -0.2)),
    ('4', (x2, mean_diff_spectra.iloc[3, 334]), (1520, mean_diff_spectra.iloc[3, 368]+0.35)),
    ('5', (x2, mean_diff_spectra.iloc[4, 335]), (1520, mean_diff_spectra.iloc[4, 368]+0.40)),
    ('6', (x2, mean_diff_spectra.iloc[5, 335]), (1520, mean_diff_spectra.iloc[5, 368]+0.37)),
    ('7', (x2, mean_diff_spectra.iloc[6, 342]), (1520, mean_diff_spectra.iloc[6, 368]+0.45)),
]
for curve_label, tip_xy, label_xy in curve_annotations:
    ax.annotate(curve_label, xy=tip_xy, xytext=label_xy, **annotate_kw)

# plt.xlim((700, 1800))
# ax.set_xticks(np.arange(800, 4000, 200))
# plt.grid(alpha=0.3, which='both')
# plt.minorticks_on()
# ax.set_yticklabels([])
# Raw strings keep the mathtext "\ " spaces without an invalid-escape warning.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=20)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=20)
plt.xticks(fontsize=15)
# plt.legend(bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
# mean_diff_spectra_mod = mean_diff_spectra.T.copy()
# mean_diff_spectra_mod.set_index(df_x, inplace=True)  
# mean_diff_spectra_mod.to_csv(r'C:\Users\gusen\Downloads\аспер\6 сем\data\5\mean_diff_spectra.csv')

In [None]:
# fig = go.Figure()

# for i in range(len(mean_diff_spectra)):
#     fig.add_trace(go.Scatter(
#         x=df_x,
#         y=mean_diff_spectra.iloc[i, :] + (5 - i) * 0.2,  # смещение
#         mode="lines",
#         name=classes[i],
#         line=dict(color=colors[i])
#     ))

# fig.update_layout(
#     width=800,
#     height=800,
#     title="Mean spectra by class",
#     xaxis_title="Wavenumber, cm⁻¹",
#     yaxis_title="a.u.",
#     legend_title="Классы"
# )

# # fig.update_xaxes(range=[700, 1800], dtick=200)

# fig.show()

In [None]:
# Interactive overlay of the class-mean difference spectra (no vertical offset).
fig = go.Figure()

n_classes = len(mean_diff_spectra)
for i in range(n_classes):
    class_trace = go.Scatter(
        x=df_x,
        y=mean_diff_spectra.iloc[i, :],
        mode="lines",
        name=classes[i],
        line=dict(color=colors[i]),
    )
    fig.add_trace(class_trace)

layout_options = dict(
    width=800,
    height=800,
    title="Mean spectra by class",
    xaxis_title="Wavenumber, cm⁻¹",
    yaxis_title="a.u.",
    legend_title="Классы",
)
fig.update_layout(**layout_options)

# Optional x-axis limits:
# fig.update_xaxes(range=[700, 1800], dtick=200)

fig.show()

In [None]:
# Class-mean difference spectra expressed as a percentage of the last row of
# `mean_spectra`. The title and y-axis label now say so; they previously read
# "Mean spectra by class" / "a.u.", copied from the absolute plot.
# NOTE(review): points where the reference spectrum is ~0 blow up - consider masking.
fig = go.Figure()

for i in range(len(mean_diff_spectra)):
    fig.add_trace(go.Scatter(
        x=df_x,
        y=mean_diff_spectra.iloc[i, :] * 100 / mean_spectra.iloc[-1, :] ,
        mode="lines",
        name=classes[i],
        line=dict(color=colors[i])
    ))

fig.update_layout(
    width=1200,
    height=800,
    # yaxis=dict(range=[0, 100]),
    title="Relative mean difference spectra by class",
    xaxis_title="Wavenumber, cm⁻¹",
    yaxis_title="% of reference spectrum",
    legend_title="Классы"
)

# fig.update_xaxes(range=[700, 1800], dtick=200)

fig.show()

In [None]:
# Second-to-last class-mean difference spectrum as a percentage of the last row
# of `mean_spectra`; the y-axis is capped at 10.
relative_diff_percent = mean_diff_spectra.iloc[-2, :] * 100 / mean_spectra.iloc[-1, :]
ax = plt.gca()
ax.plot(df_x, relative_diff_percent)
# plt.plot(df_x, mean_spectra.iloc[-2, :])
# plt.plot(df_x, mean_spectra.iloc[-1, :])
ax.axhline(0)
ax.set_ylim(top=10)
plt.show()

In [None]:
# PCA score plot of the difference spectra (one point per sample).
# NOTE: this re-fits and overwrites the global `pca` / `pca_1` used by other cells.
fig, ax = plt.subplots(figsize=(8,6))

pca = PCA(n_components=10)
pca_1 = pca.fit_transform(diff_spectra.iloc[:, :].T) #1195

pc_1 = 1
pc_2 = 2

scatter = plt.scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')
# plt.text(np.mean(pca_1[0+10*i:10+10*i, 0]), np.mean(pca_1[0+10*i:10+10*i, 1]), IR_files_names[i][3:7], bbox=dict(facecolor='none', edgecolor=colors[i], boxstyle='round'))
# confidence_ellipse(pca_1[0+10*i:10+10*i, pc_1-1], pca_1[0+10*i:10+10*i, pc_2-1], ax, n_std=2, edgecolor=colors[i], facecolor=colors[i], alpha=0.2)

# Raw f-strings: "\ " is a mathtext space, not a Python escape sequence.
plt.xlabel(rf'${pc_1}\ pc:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%')
plt.ylabel(rf'${pc_2}\ pc:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%')
plt.title('$PCA$')

# Proxy handles so the legend lists one coloured marker per class.
legend_elements = [Line2D([0], [0], marker='o', color='w', label=cls,
                          markerfacecolor=color, markersize=10, markeredgecolor='k')
                   for cls, color in colors_n.items()]

plt.legend(handles=legend_elements, title="Классы")

plt.show()

In [None]:
# Grid of the first `nums` loading vectors of the currently fitted `pca`; each
# panel also shows the last row of `mean_spectra` on a secondary y-axis.
nums = 6
rows = 2
cols = math.ceil(nums / rows)
fig, axes = plt.subplots(rows, cols, figsize=(20, 10))
axes = axes.flatten()
for i in range(nums):
    # sigma = 0.1 leaves the loading essentially unsmoothed.
    axes[i].plot(df_x, gaussian_filter1d(pca.components_[i], 0.1), label=f"Компонента {i + 1}")
    axes[i].axhline(0, color="black", linewidth=1, linestyle="--")
    axes[i].set_title(f"Компонента {i + 1}: {pca.explained_variance_ratio_[i]*100:.2f}%")
    axes[i].grid(True)
    ax2 = axes[i].twinx()
    ax2.plot(df_x, mean_spectra.iloc[-1, :], color="red", linestyle="-", label="pmma", alpha=0.4)
    axes[i].minorticks_on()
    axes[i].grid(True, which="minor", linestyle=":", linewidth=0.5, alpha=0.5)
    # One legend that combines the handles of both y-axes.
    lines1, labels1 = axes[i].get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    axes[i].legend( lines1 + lines2, labels1 + labels2,)
# The figure title is set once, not on every loop iteration.
fig.suptitle('pca компоненты рамана снятого на 785 нм')
plt.tight_layout()
plt.show()

In [None]:
def extract_mw(name: str) -> "int | None":
    """Parse the power value encoded in a sample or class name.

    Parameters
    ----------
    name : str
        Name such as ``'pmma_16_mw_8'`` or ``'pmma_clean_pmma_12'``.

    Returns
    -------
    int or None
        0 if the name contains ``"clean"``; otherwise the digits of the first
        ``_<digits>_mw`` fragment as an int; None if no such fragment exists.
    """
    if "clean" in name:
        return 0
    match = re.search(r'_(\d+)_mw', name)
    return int(match.group(1)) if match else None

classes_mw = [extract_mw(s) for s in classes_n]

In [None]:
# v = np.asarray(classes_mw) 
# corr = {}

# for i, name in enumerate(df_smooth.columns):
#     y = df_smooth.iloc[:, i].to_numpy()
#     corr[name] = np.corrcoef(v, y)[0, 1]

# corr = pd.Series(corr)

In [None]:
# v_series = pd.Series(classes_mw, index=df_y_mod.T.index)

# corr = df_smooth.corrwith(df_y_mod.T)

In [None]:
# Pearson correlation of every spectral row of df_y_mod with the per-sample power
# values. Row label 501 is appended to hold `classes_mw`, so row/column 501 of
# `correl` holds the correlation of each spectral row with power.
# Assumes the rows of df_y_mod are labelled 0..500 - TODO confirm.
v_series = df_y_mod.copy()
v_series.loc[501, :] = classes_mw
correl = v_series.T.corr()
# `.loc` slices by label, so `:-1` selects nothing on this integer index. Use the
# same 0:500 label range as the following cells to drop the self-correlation.
plt.plot(correl.loc[501, 0:500])

In [None]:
plt.bar(new_x, correl.loc[501, 0:500])

In [None]:
correl.loc[501, 0:500].idxmax()

## Linear regression

In [None]:
plt.scatter(df_y_mod.loc[66,:], classes_mw)

In [None]:
threshold = 0.5

idx = correl.loc[501, 0:500][(correl.loc[501, 0:500]) > threshold].index
idx.value_counts().sum()

In [None]:
print(idx)
print(new_x[idx])

In [None]:
# One scatter panel (intensity vs. power) for every spectral row selected in `idx`.
nums = len(idx)
rows = 3
# At least one column, so an empty `idx` does not make plt.subplots fail.
cols = max(1, math.ceil(nums / rows))
fig, axes = plt.subplots(rows, cols, figsize=(20, 10))
axes = axes.flatten()
for en,i in enumerate(idx):
    axes[en].scatter(df_y_mod.loc[i,:], classes_mw)
    axes[en].set_xlabel(r'$a.u. величина\ пика, корр: $', )
    axes[en].set_ylabel(r'$mw$')
    axes[en].set_title(f'wavenumber - {new_x[i]}')
    # k, b = np.polyfit(df_y_mod.loc[i,:], classes_mw, 1)
    # y_fit = k * df_y_mod.loc[i,:] + b
# Hide the grid slots that received no data.
for unused_ax in axes[nums:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
plt.scatter(df_y_mod.loc[430, :'pmma_16_mw_8'], classes_mw[0:-11])

In [None]:
# NOTE(review): within the visible part of the notebook `ddt` is first assigned in
# the next cell; unless an earlier cell defines it, this display raises NameError
# on a fresh kernel (Restart & Run All).
ddt

In [None]:
# Spectral row 430 vs. power: per-power box plots of the intensities, plus a
# straight line fitted to the per-power mean intensities.
# The column slice up to 'pmma_16_mw_8' and classes_mw[0:-11] are expected to pick
# the same samples - TODO confirm.
ddt = pd.concat((df_y_mod.loc[430, :'pmma_16_mw_8'].reset_index(drop=True), pd.Series(classes_mw[0:-11])), axis=1)
ddt.columns = ["intensity", "mW"]
ddt_means = ddt.groupby("mW").mean()

# The regression predicts power (y) from mean intensity (X).
linreg = LinearRegression().fit(X=ddt_means.values, y=ddt_means.index)
linreg_1 = linreg.predict(ddt_means.values)

classes_sorted = np.sort(ddt["mW"].unique())
box_data = [ddt.loc[ddt["mW"] == c, "intensity"].values for c in classes_sorted]

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

plt.boxplot(box_data, vert=True, positions=classes_sorted, widths=0.6, manage_ticks=False, showfliers=False)
# plt.scatter(ddt_means.values, ddt_means.index)
# x = predicted power, y = mean intensity, matching the box-plot orientation.
plt.plot(linreg_1, ddt_means.values, color='r' )
# Raw strings keep the mathtext "\ " spaces without an invalid-escape warning.
plt.ylabel(r'$Интенсивность\ пика\ 1765\ см^{-1},\ усл.\ ед.$', fontsize=20)
plt.xlabel(r'$Мощность\ лазера,\ мВт$', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# plt.title(f'$R^{2}={np.round(r2_score(ddt_means.index, linreg_1), 2)},\ corr={np.round(np.corrcoef(ddt_means.index.T, linreg_1.T)[0, 1], 2)}$')
plt.show()
# label=f'$R^{2}={np.round(r2_score(i_380c, linreg_1), 2)},\ corr={np.round(np.corrcoef(i_380c.T, linreg_1.T)[0, 1], 2)}$'

In [None]:
# Same analysis as the row-430 cell, for spectral row 305: per-power box plots
# plus a straight line fitted to the per-power mean intensities.
ddt = pd.concat((df_y_mod.loc[305, :'pmma_16_mw_8'].reset_index(drop=True), pd.Series(classes_mw[0:-11])), axis=1)
ddt.columns = ["intensity", "mW"]
ddt_means = ddt.groupby("mW").mean()

# The regression predicts power (y) from mean intensity (X).
linreg = LinearRegression().fit(X=ddt_means.values, y=ddt_means.index)
linreg_1 = linreg.predict(ddt_means.values)

classes_sorted = np.sort(ddt["mW"].unique())
box_data = [ddt.loc[ddt["mW"] == c, "intensity"].values for c in classes_sorted]

plt.boxplot(box_data, vert=True, positions=classes_sorted, widths=0.6, manage_ticks=False, showfliers=False)
# plt.scatter(ddt_means.values, ddt_means.index)
plt.plot(linreg_1, ddt_means.values, color='r' )
plt.ylabel(f'интенсивность пика {df_x[305]}')
plt.xlabel('mW')
# Raw f-string: "{2}" still renders as 2, and "\ " stays a mathtext space instead
# of an invalid Python escape sequence.
plt.title(rf'$R^{2}={np.round(r2_score(ddt_means.index, linreg_1), 2)},\ corr={np.round(np.corrcoef(ddt_means.index.T, linreg_1.T)[0, 1], 2)}$')
plt.show()
# label=f'$R^{2}={np.round(r2_score(i_380c, linreg_1), 2)},\ corr={np.round(np.corrcoef(i_380c.T, linreg_1.T)[0, 1], 2)}$'

In [None]:
mask = df_y_mod.T[345] >= 1.9
df_y_mod.T[mask]
df_y_mod.T.iloc[np.where(mask)[0], :]

In [None]:
np.sum(np.array(classes_mw) == 8)

In [None]:
# Spectral row 420 vs. power: scatter of the per-power mean intensities with a
# fitted straight line.
ddt = pd.concat((df_y_mod.loc[420, :'pmma_16_mw_8'].reset_index(drop=True), pd.Series(classes_mw[0:-11])), axis=1)
# After the concat the unnamed power Series becomes column 0; group by it.
ddt = ddt.groupby(0).mean()

# The regression predicts power (y) from mean intensity (X).
linreg = LinearRegression().fit(X=ddt.values, y=ddt.index)
linreg_1 = linreg.predict(ddt.values)
plt.scatter(ddt.values, ddt.index)
plt.plot(ddt.values, linreg_1, color='r' )
plt.xlabel('интенсивность пика 1730')
plt.ylabel('mW')
# Raw f-string: "{2}" still renders as 2, and "\ " stays a mathtext space instead
# of an invalid Python escape sequence.
plt.title(rf'$R^{2}={np.round(r2_score(ddt.index, linreg_1), 2)},\ corr={np.round(np.corrcoef(ddt.index.T, linreg_1.T)[0, 1], 2)}$')
plt.show()

In [None]:
plt.scatter(df_y_mod.loc[421, :], df_y_mod.loc[306, :], c=colors_for_points , s=100, edgecolors='k')

In [None]:
# One-row heat map of the correlation between each spectral row and power.
corr_row = correl.loc[501, 0:500]

plt.figure(figsize=(8, 3))
# imshow needs 2-D data; a 1-D Series raises "Invalid shape" TypeError, so the
# values are reshaped to a single image row.
plt.imshow(corr_row.to_numpy(dtype=float)[np.newaxis, :], aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(label="Correlation")
plt.yticks([])
plt.xticks(range(len(corr_row)), corr_row.index, rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# FIXME: this cell held only the unfinished statement `a = `, which is a
# SyntaxError and stops Restart & Run All. Disabled until the intended
# right-hand side is supplied.
# a =

In [None]:
import plotly.express as px

# Interactive PCA score plot of the difference spectra.
# NOTE: this re-fits and overwrites the global `pca` / `pca_1`.
pca = PCA(n_components=10)
pca_1 = pca.fit_transform(diff_spectra.T)  # samples as rows

pc_1 = 1
pc_2 = 2

# One row per sample. Points are coloured by class name and mapped to the project
# colours through `colors_n`; passing the colour strings themselves as the
# category made the legend list colour codes and let plotly pick its own palette.
df_plot = pd.DataFrame({
    f"PC{pc_1}": pca_1[:, pc_1-1],
    f"PC{pc_2}": pca_1[:, pc_2-1],
    "class": list(classes_n),
    'name': samle_names
})

fig = px.scatter(
    df_plot,
    x=f"PC{pc_1}",
    y=f"PC{pc_2}",
    color="class",
    color_discrete_map=colors_n,
    title="PCA",
    hover_name='name',
    labels={
        f"PC{pc_1}": f"PC{pc_1} ({pca.explained_variance_ratio_[pc_1-1]*100:.2f}%)",
        f"PC{pc_2}": f"PC{pc_2} ({pca.explained_variance_ratio_[pc_2-1]*100:.2f}%)"
    }
)

fig.update_traces(marker=dict(size=12, line=dict(width=1, color='black')))
fig.update_layout(legend_title="Классы")

fig.show()


In [None]:
import plotly.graph_objects as go

def plot_spectra(df, x=None, title=""):
    """
    Draw every column of `df` as a line trace in one interactive Plotly figure
    and show it.

    df : pd.DataFrame
        Table in which each column is one spectrum.
    x : array or list
        X-axis values (e.g. wavelength). If None, the index of df is used.
    title : str
        Figure title.
    """
    def _grid_axis(minor_step):
        # Shared axis styling: light major grid, lighter minor grid, no zero line.
        return dict(
            # showline=True,
            # linecolor="black",
            # linewidth=2,
            showgrid=True,
            gridcolor="lightgray",
            zeroline=False,
            minor=dict(showgrid=True, gridcolor="gainsboro", dtick=minor_step),
        )

    x_values = df.index if x is None else x

    fig = go.Figure()
    fig.update_layout(
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis=_grid_axis(50),
        yaxis=_grid_axis(0.1),
        width=900,
        height=600,
        margin=dict(l=50, r=50, t=80, b=60),
    )

    # One trace per spectrum; hovering shows only the column name.
    for col in df.columns:
        spectrum_trace = go.Scatter(
            x=x_values,
            y=df[col],
            mode="lines",
            name=str(col),
            hoverinfo="name",
        )
        fig.add_trace(spectrum_trace)

    fig.update_layout(
        title=title,
        xaxis_title="$wavenumber, cm^{-1}$",
        yaxis_title="Интенсивность",
        hovermode="closest",
    )

    fig.show()

In [None]:
plot_spectra(diff_spectra, df_x)

In [None]:
# Interactive view of one difference spectrum (column 55) on a figure that has a
# secondary y-axis available for the optional comparison traces below.
# (The `go.Figure()` that used to precede this line was overwritten immediately.)
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=df_x, y=diff_spectra.iloc[:, 55], mode='lines'), secondary_y=False)
# fig.add_trace(go.Scatter(x=df_x, y=df_f.iloc[:, 4], mode='lines'), secondary_y=False)
# fig.add_trace(go.Scatter(x=df_x, y=df_f.iloc[:, 9], mode='lines'), secondary_y=True)
# fig.add_trace(go.Scatter(x=df_x, y=df_f.iloc[:, 10], mode='lines'), secondary_y=False)
# fig.add_trace(go.Scatter(x=df_x, y=df_f.iloc[:, 4] - df_f.iloc[:, 10], mode='lines'), secondary_y=False)
# for i in range(10, 14):
#     fig.add_trace(go.Scatter(x=df_x, y=df_f.iloc[:, i], mode='lines', line=dict(color="#4361ee")), secondary_y=False)

# update_layout returns the figure, so this last expression also displays it.
fig.update_layout(
    margin=dict(l=10, r=10, t=10, b=20), width=1400, height=600)

In [None]:
for en,i in enumerate(samle_names):
    if i == 'pmma_clean_pmma_12':
        print(en)

In [None]:
import numpy as np
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

def rubberband(x, y):
    """Rubber-band baseline of one spectrum (lower convex hull, linearly interpolated).

    NOTE: later cells of this notebook define `rubberband` again; the definition
    executed last is the one actually called.

    Parameters
    ----------
    x : array-like, shape (n,)
        Spectral axis, sorted in either direction.
    y : array-like, shape (n,)
        Intensities at `x`.

    Returns
    -------
    np.ndarray, shape (n,)
        Baseline evaluated at every `x`, in the same order as the input.
    """
    # Plain arrays: pandas objects would index by label in x[base] / y[base].
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # The hull walk below yields the lower hull only for ascending x.
    descending = x.size > 1 and x[0] > x[-1]
    if descending:
        x, y = x[::-1], y[::-1]

    # 2-D hull vertices come in counter-clockwise order.
    v = ConvexHull(np.column_stack([x, y])).vertices
    v = np.roll(v, -v.argmin())  # start at the lowest index (left-most point)
    base = v[:v.argmax()+1]      # walk to the right-most point: the lower hull
    baseline = interp1d(x[base], y[base], kind='linear', fill_value="extrapolate")(x)
    return baseline[::-1] if descending else baseline

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d

def rubberband(x, y):
    """
    Rubber-band baseline for a single spectrum.

    The baseline is the lower part of the convex hull of the points (x, y),
    linearly interpolated onto every x. Inputs are converted to float arrays so
    that pandas objects are indexed by position, and a descending x axis is
    handled by working on the reversed data.

    NOTE: this cell repeats the definition from the previous cell, and a later
    cell defines `rubberband` once more; the last executed definition wins.

    x : array-like, shape (n,) - spectral axis, sorted in either direction
    y : array-like, shape (n,) - intensities at x
    Returns: np.ndarray of shape (n,), the baseline in the input order.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    flipped = x.size > 1 and x[0] > x[-1]
    if flipped:
        x, y = x[::-1], y[::-1]

    # Convex hull; 2-D vertices are listed counter-clockwise.
    v = ConvexHull(np.column_stack([x, y])).vertices
    v = np.roll(v, -v.argmin())  # start from the lowest index
    base = v[:v.argmax()+1]      # lower part of the hull

    # Linear interpolation of the baseline through the hull points.
    baseline = interp1d(x[base], y[base], kind='linear', fill_value="extrapolate")(x)
    return baseline[::-1] if flipped else baseline

def rubberband_correction(df):
    """
    Subtract the `rubberband` baseline from every column of a spectra table.

    df: DataFrame (index = x axis, columns = spectra)
    Returns a new DataFrame with the same index and columns holding
    spectrum - baseline.
    """
    axis_values = df.index.to_numpy()

    def _baseline_removed(column):
        intensities = df[column].to_numpy()
        return intensities - rubberband(axis_values, intensities)

    return pd.DataFrame(
        {column: _baseline_removed(column) for column in df.columns},
        index=df.index,
    )

In [None]:
def rubberband_baseline(x, y, n_baseline_points=64):
    """
    Baseline through a fixed number of evenly spaced support points.

    The spectrum is sampled at `n_baseline_points` positions spread evenly over
    the index range, and the baseline is the linear interpolation through those
    samples. No convex hull is involved, despite the name.

    x : array-like - spectral axis
    y : array-like - spectrum
    n_baseline_points : int - number of support points requested; when it exceeds
        len(x), repeated positions are dropped so each point is used once

    Returns: np.ndarray, the baseline evaluated at every x.
    """
    # Positional indexing for lists and pandas objects alike.
    x = np.asarray(x)
    y = np.asarray(y)

    # Evenly spaced support indices; np.unique removes the duplicates that the
    # integer cast produces when more points are requested than exist.
    idx = np.unique(np.linspace(0, len(x)-1, n_baseline_points, dtype=int))
    x_baseline = x[idx]
    y_baseline = y[idx]

    # Linear interpolation of the baseline.
    baseline = interp1d(x_baseline, y_baseline, kind='linear', fill_value="extrapolate")(x)
    return baseline

In [None]:
from scipy.signal import argrelextrema
import numpy as np

def concave_rubberband(x, y):
    """Baseline drawn through the strict local minima of a spectrum.

    Parameters
    ----------
    x : array-like, shape (n,)
        Spectral axis; np.interp expects it to be increasing.
    y : array-like, shape (n,)
        Intensities at `x`.

    Returns
    -------
    np.ndarray, shape (n,)
        Piecewise-linear baseline through the first point, every strict local
        minimum of `y`, and the last point.
    """
    # Convert both inputs so lists and pandas objects are indexed by position.
    x = np.asarray(x)
    y = np.asarray(y)
    # Strict local minima; the end points are never reported by argrelextrema.
    minima_idx = argrelextrema(y, np.less)[0]
    # Add the two end points.
    minima_idx = np.concatenate([[0], minima_idx, [len(y)-1]])
    baseline = np.interp(x, x[minima_idx], y[minima_idx])
    return baseline

In [None]:
import numpy as np
import pandas as pd
from scipy.signal import argrelextrema
from scipy.interpolate import interp1d

def concave_rubberband_baseline(x, y, n_baseline_points=64):
    """
    Baseline of one spectrum through the local minima of an evenly subsampled copy.

    The spectrum is sampled at `n_baseline_points` evenly spaced positions; the
    strict local minima of that sample plus its first and last points are joined
    by straight lines and evaluated at every x.

    NOTE: a later cell of this notebook defines a function with the same name but
    a different signature and return value; the last executed definition wins.

    Returns: np.ndarray, the baseline at every x.
    """
    x = np.array(x)
    y = np.array(y)

    # Evenly spaced sample of the spectrum.
    sample_positions = np.linspace(0, len(x)-1, n_baseline_points, dtype=int)
    x_sample, y_sample = x[sample_positions], y[sample_positions]

    # Strict local minima within the sample.
    anchors = argrelextrema(y_sample, np.less)[0]

    # Make sure both end points take part in the baseline.
    last_position = len(y_sample)-1
    if 0 not in anchors:
        anchors = np.insert(anchors, 0, 0)
    if last_position not in anchors:
        anchors = np.append(anchors, last_position)

    # Straight-line baseline through the anchors.
    # (cubic alternative: interp1d(..., kind='cubic', fill_value="extrapolate"))
    through_anchors = interp1d(x_sample[anchors], y_sample[anchors],
                               kind='linear', fill_value="extrapolate")
    return through_anchors(x)

def concave_rubberband_correction(x, y, n_baseline_points=64, iterations=1):
    """
    Apply concave rubber-band correction to every spectrum of a table.

    The body used to read an undefined global `df` and ignore `y`; the spectra
    are now taken from `y`.

    x : array-like
        Spectral axis shared by all spectra.
    y : pd.DataFrame (index = x, columns = spectra)
        Spectra to correct; anything else is wrapped with pd.DataFrame(y).
    n_baseline_points : int
        Passed on to `concave_rubberband_baseline`.
    iterations : int
        Number of subtract-and-clip passes per spectrum.

    Returns a DataFrame with the same index and columns. After each pass the
    spectrum is clipped at zero. The input table is not modified.

    NOTE(review): this relies on `concave_rubberband_baseline` returning the
    baseline. A later cell redefines that name to return the corrected spectrum
    instead, which would silently change the result here - confirm which
    definition is active when calling.
    """
    x = np.array(x)
    spectra = y if isinstance(y, pd.DataFrame) else pd.DataFrame(y)
    corrected = {}

    for col in spectra.columns:
        y_corrected = spectra[col].to_numpy().copy()
        for _ in range(iterations):
            baseline = concave_rubberband_baseline(x, y_corrected, n_baseline_points)
            # y_corrected = y_corrected - baseline
            y_corrected = np.maximum(y_corrected - baseline, 0)
        corrected[col] = y_corrected

    return pd.DataFrame(corrected, index=spectra.index)


In [None]:
import numpy as np
from scipy import sparse
from scipy.spatial import ConvexHull
from scipy.interpolate import interp1d
from scipy.sparse.linalg import spsolve
from multiprocessing.pool import Pool, ThreadPool
import os
# import dill


def mp_bgcorrection(func, y, lim_single=8, lim_tp=40, progressCallback=None):
    """Apply `func` to one spectrum or to every row of a 2-D array of spectra.

    Parameters
    ----------
    func : callable
        Maps a 1-D spectrum to an array of the same length.
    y : np.ndarray
        A single spectrum (1-D) or spectra stacked as rows (2-D).
    lim_single : int
        Up to this many rows are processed serially.
    lim_tp : int
        Up to this many rows use a small thread pool; above it a process pool
        is used when the `pack_function_for_map` helper is available.
    progressCallback : callable(int a, int b), optional
        Called after each row with (rows done, total rows).

    Returns
    -------
    np.ndarray
        `func(y)` for 1-D input; otherwise an array shaped and typed like `y`
        holding the row-wise results. Empty input is returned as a copy.
    """
    if len(y) < 1:
        return y.copy()
    if y.ndim < 2:
        return func(y)
    if hasattr(os, 'sched_getaffinity'):
        cpus = len(os.sched_getaffinity(os.getpid()))
    else:
        cpus = os.cpu_count() or 1  # cpu_count() may return None
    cpus = min(cpus, len(y))

    pool = None
    if cpus == 1 or len(y) <= lim_single:
        it = map(func, y)
    elif len(y) <= lim_tp:
        pool = ThreadPool(min(cpus, 3))
        it = pool.imap(func, y, chunksize=5)
    else:
        # A process pool needs a helper to ship `func` to the workers. That
        # helper is not defined in the visible notebook, so fall back to
        # threads when it is missing instead of failing with NameError.
        packer = globals().get('pack_function_for_map')
        if packer is not None:
            pool = Pool(cpus)
            it = pool.imap(*packer(func, y), chunksize=10)
        else:
            pool = ThreadPool(cpus)
            it = pool.imap(func, y, chunksize=10)

    try:
        ret = np.empty_like(y)
        for i in range(len(y)):
            ret[i] = next(it)
            if progressCallback:
                progressCallback(i+1, len(y))
    finally:
        # Always release the workers, even if `func` raised.
        if pool is not None:
            pool.terminate()
            pool.join()
    return ret

def rubberband(x, y, progressCallback=None):
    """
    Rubberband baseline correction of one or more spectra.
    Parameters:
    x: wavenumbers, sorted in either direction
    y: spectrum at those wavenumbers, or multiple spectra as array of shape (spectrum, wavenumber)
    progressCallback(int a, int b): callback function called to indicated that the processing
        is complete to a fraction a/b.
    Returns: baseline of the spectrum, measured at the same points
    """
    x = np.array(x)
    # Convert y as well. This line used to read `x = np.array(y)`, which replaced
    # the wavenumber axis with the spectrum and left y unconverted.
    y = np.array(y)

    # Descending axis: solve on the reversed data and flip the result back.
    if x[0] > x[-1]:
        return rubberband(x[::-1], y[...,::-1],
                          progressCallback=progressCallback)[...,::-1]
    def rubberband_one(yy):
        # Find the convex hull (2-D vertices are listed counter-clockwise)
        v = ConvexHull(np.column_stack((x, yy))).vertices
        # Rotate convex hull vertices until they start from the lowest one
        v = np.roll(v, -v.argmin())
        # Leave only the ascending part
        v = v[:v.argmax()+1]
        # Create baseline using linear interpolation between vertices
        b = np.interp(x, x[v], yy[v])
        return b
    return mp_bgcorrection(rubberband_one, y, lim_single=100, lim_tp=10000,
                           progressCallback=progressCallback)

def concaverubberband(x, y, iters=1, progressCallback=None):
    """
    Concave rubberband baseline correction. This algorithm removes more than a
    straight line, alternating with normal rubberband to bring negative points
    up again. It does not converge nicely and will eat up all the data if run
    with many iterations.
    Parameters:
    x: wavenumbers, sorted from low to high (todo: implement high-to-low)
    y: spectrum at those wavenumbers
    iters: iterations to run; note that this algorithm doesn't converge nicely
    progressCallback(int a, int b): callback function called to indicated that the processing
        is complete to a fraction a/b.
    Returns: baseline of the spectrum, measured at the same points
    """
    # Plain float arrays: x[-1] / x[0] must be positional, which fails on a
    # pandas Series where integer keys are treated as labels.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Loop-invariant pieces of the parabola added before every rubberband pass.
    xmid = .5 * (x[-1] + x[0])
    d2 = .25 * (x[-1] - x[0]) ** 2
    squared_offset = (x - xmid)**2

    def concaverubberband_one(yy):
        origyy = yy
        # New array, so the in-place updates below never touch the caller's data.
        yy = yy - rubberband(x, yy)
        for _ in range(iters):
            F = .1 * (yy.max() - yy.min())
            yy += F * squared_offset / d2
            yy -= rubberband(x, yy)
        return origyy - yy
    return mp_bgcorrection(concaverubberband_one, y, lim_single=30, lim_tp=500,
                           progressCallback=progressCallback)



In [None]:
def concave_rubberband_baseline(x, y, n_points=64, iterations=1):
    """
    Rubberband baseline correction on a subsampled spectrum.

    The spectrum is subsampled at `n_points` evenly spaced positions, the
    lower convex hull of those samples is taken as the baseline support, and
    the baseline is linearly interpolated between the support points.

    Parameters
    ----------
    x : array_like, shape (n,)
        x axis of the spectrum (ascending or descending).
    y : array_like, shape (n,)
        Spectrum values at `x`.
    n_points : int
        Number of evenly spaced samples used to build the hull.
    iterations : int
        Kept for backward compatibility. With ``iterations <= 0`` the baseline
        is interpolated through the raw subsample; any positive value performs
        the hull step once, because the lower convex hull of a lower convex
        hull is the same point set.

    Returns
    -------
    numpy.ndarray
        The baseline-corrected spectrum ``y - baseline`` (not the baseline).

    Raises
    ------
    ValueError
        If `x` and `y` are not 1-D arrays of the same length.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be 1-D arrays of the same length")
    if x.size == 0:
        return y.copy()

    # Support points are stored as indices into the FULL x/y arrays so they
    # stay valid after the hull step (hull.vertices indexes the subsample).
    support = np.unique(np.linspace(0, len(x) - 1, n_points, dtype=int))

    # ConvexHull needs at least three points.
    if iterations > 0 and support.size >= 3:
        x_sel = x[support]
        y_sel = y[support]

        # Convex hull of the subsampled points, vertices ordered by x.
        hull = ConvexHull(np.column_stack([x_sel, y_sel]))
        verts = hull.vertices[np.argsort(x_sel[hull.vertices], kind="stable")]
        vx = x_sel[verts]
        vy = y_sel[verts]

        # Keep only the lower part: vertices on or below the chord joining the
        # leftmost and rightmost hull vertices. The two end points always stay.
        chord = vy[0] + (vy[-1] - vy[0]) * (vx - vx[0]) / (vx[-1] - vx[0])
        lower = vy <= chord
        lower[0] = lower[-1] = True
        support = support[verts[lower]]

    # np.interp requires increasing sample positions.
    order = np.argsort(x[support], kind="stable")
    baseline = np.interp(x, x[support][order], y[support][order])

    corrected = y - baseline
    return corrected

In [None]:
# Example: rubberband baseline correction of one spectrum (column 100 of df_y).
# rubberband() returns the BASELINE (concaverubberband subtracts its result
# from the spectrum), so the corrected spectrum is the difference.
spectrum = df_y.iloc[:, 100]
rb_baseline = rubberband(df_x, spectrum)
corrected = spectrum - rb_baseline

plt.plot(df_x, spectrum, label="Исходный спектр")
plt.plot(df_x, rb_baseline, label="Baseline (rubber band)")
plt.plot(df_x, corrected, label="Корректированный")
plt.legend()
plt.show()

In [None]:
# int() does not accept a list (it raises TypeError); truncate element by element.
[int(value) for value in [1.5, 2.5]]

# IR

In [None]:
# Load the clean PMMA IR file: tab-separated, no header row; column 0 is the
# x axis and every remaining column is a spectrum.
df2 = pd.read_csv(r'C:\Users\gusen\Downloads\аспер\5сем\data\ir\2025_09_24\clean_pmma.dpt', delimiter='\t', header=None)

# Flip the row order and renumber the rows from 0.
df2_x = df2.iloc[::-1, 0].reset_index(drop=True)
df2_y = df2.iloc[::-1, 1:].reset_index(drop=True)

# Untouched full-range copies that the cropping below slices from.
df2_x_copy = df2_x.copy()
df2_y_copy = df2_y.copy()

# Crop window: x_begin / x_end start as x-axis values and are then replaced by
# the row index of the first sample that is >= that value.
x_begin = 700
x_end = 1800
x_begin = df2_x_copy.index[(df2_x_copy >= x_begin).to_numpy()][0]
x_end = df2_x_copy.index[(df2_x_copy >= x_end).to_numpy()][0]

df2_x = df2_x_copy.iloc[x_begin:x_end].reset_index(drop=True)
df2_y = df2_y_copy.iloc[x_begin:x_end, :].reset_index(drop=True)

# Rubberband baseline for every spectrum; df_bg holds the baselines and df_f
# the baseline-subtracted spectra with negative values clipped to zero.
baseline_fitter = Baseline(df2_x, check_finite=False)

df_bg = pd.DataFrame()
df_f = pd.DataFrame()
for i, column in enumerate(df2_y.columns):
    spectrum = df2_y.iloc[:, i]
    df_bg[column] = baseline_fitter.rubberband(spectrum, smooth_half_window=15)[0]
    df_f[column] = spectrum - df_bg.iloc[:, i]
df_f[df_f<0] = 0

# Mean corrected spectrum over all columns.
df2_mean = df_f.mean(axis=1)

# One row, two columns: all corrected spectra (left) and their mean (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 4))
ax_all, ax_mean = axes

ax_all.plot(df2_x, df_f.iloc[:, :], color=colors[0])
ax_all.set_title("all")
ax_mean.plot(df2_x, df2_mean, color=colors[1])
ax_mean.set_title("mean")

plt.show()

In [None]:
# Load every spectrum file in the pmma folder into df3_y (one column per file,
# named after the file without its extension) and take the x axis from the
# last file read.
subfolder_path = r'C:\Users\gusen\Downloads\аспер\5сем\data\ir\pmma'
df3_y = pd.DataFrame()
df3_x = None
for file in sorted(os.listdir(subfolder_path), key=numericalSort):
    file_path = os.path.join(subfolder_path, file)
    if not os.path.isfile(file_path):
        continue

    # Parse each file once: column 0 is the x axis, column 1 the spectrum.
    spectrum = pd.read_csv(file_path, delimiter='\t', header=None)
    df3_y[os.path.splitext(file)[0]] = spectrum.iloc[:, 1]
    df3_x = spectrum.iloc[:, 0]

if df3_x is None:
    raise FileNotFoundError(f'no spectrum files found in {subfolder_path}')

# Flip the row order and renumber the rows from 0.
df3_x = df3_x.iloc[::-1].reset_index(drop=True)
df3_y = df3_y.iloc[::-1].reset_index(drop=True)

# Untouched full-range copies that the cropping below slices from.
df3_x_copy = df3_x.copy()
df3_y_copy = df3_y.copy()

# Crop window: x_begin / x_end start as x-axis values and are then replaced by
# the row index of the first sample that is >= that value.
x_begin = 700
x_end = 1800

x_begin = df3_x_copy[df3_x_copy>=x_begin].index[0]
x_end = df3_x_copy[df3_x_copy>=x_end].index[0]

df3_x = df3_x_copy.iloc[x_begin:x_end].reset_index(drop=True)
df3_y = df3_y_copy.iloc[x_begin:x_end, :].reset_index(drop=True)

# Combine the mean clean-PMMA spectrum (unnamed Series -> column 0) with the
# folder spectra. NOTE(review): rows are matched by position, which assumes
# df2_mean and df3_y share the same x grid — confirm.
df4 = pd.concat((df2_mean, df3_y), axis=1)
df4 = df4.rename(columns={0: 'clean_pmma', 'pmma_10_2mw': 'pmma_10mw'})
df4 = df4.drop(columns='pmma_10_1mw')

# Move the first column (clean_pmma) to the end.
df4_cols = list(df4.columns)
df4 = df4[df4_cols[1:] + df4_cols[:1]]

In [None]:
# Plot every column of df4 against df3_x and number the curves by hand.
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

for i in range(df4.shape[1]):
    plt.plot(df3_x, df4.iloc[:, i], c=colors[i], label=df4.columns[i], linewidth=2)

# (x, y, label) of the hand-placed curve numbers, in data coordinates.
curve_labels = [
    (1100, 0.3 * 1.05, 1),
    (1100, 0.3 * 0.98, 2),
    (1100, 0.3 * 0.65, 3),
    (1090, 0.3 * 0.4, 4),
    (1110, 0.3 * 0.1, 5),
    (1110, 0.3 * -0.03, 6),
    (1060, 0.3 * 0.06, 7),
    (970, 0.3 * -0.03, 8),
]
for x_pos, y_pos, number in curve_labels:
    plt.text(x_pos, y_pos, number, c='k', fontsize=20)

# Raw strings: "\ " (mathtext space) is not a valid Python escape sequence and
# triggers a SyntaxWarning in a normal string literal on Python 3.12+.
plt.xlabel(r'$Волновое\ число, см^{-1}$', fontsize=20)
plt.ylabel(r'$Интенсивность,\ усл.\ ед.$', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()

In [None]:
# Show the number of rows in df4.
len(df4)

In [None]:
# Interactive version of the df4 plot: one line per column of df4.
fig = go.Figure()

for i in range(df4.shape[1]):
    fig.add_trace(go.Scatter(
        x=df3_x,
        y=df4.iloc[:, i],
        mode="lines",
        # Name each trace after the column it shows (same labels as the
        # matplotlib version of this plot) rather than the unrelated
        # `classes` list, which may be shorter than df4 or differently ordered.
        name=str(df4.columns[i]),
        line=dict(color=colors[i])
    ))

fig.update_layout(
    width=1200,
    height=600,
    title="Mean spectra by class",
    xaxis_title="Wavenumber, cm⁻¹",
    yaxis_title="a.u.",
    legend_title="Классы"
)

fig.show()

In [None]:
# Load every spectrum file in the ir_2/pmma folder into df_y (one column per
# file, named after the file without its extension) and take the x axis from
# the last file read.
subfolder_path = r'C:\Users\gusen\Downloads\аспер\5сем\data\ir_2\pmma'
df_y = pd.DataFrame()
df_x = None
for file in sorted(os.listdir(subfolder_path), key=numericalSort):
    file_path = os.path.join(subfolder_path, file)
    if not os.path.isfile(file_path):
        continue

    # Parse each file once: column 0 is the x axis, column 1 the spectrum.
    spectrum = pd.read_csv(file_path, delimiter='\t', header=None)
    df_y[os.path.splitext(file)[0]] = spectrum.iloc[:, 1]
    df_x = spectrum.iloc[:, 0]

if df_x is None:
    raise FileNotFoundError(f'no spectrum files found in {subfolder_path}')

# Flip the row order and renumber the rows from 0.
df_x = df_x.iloc[::-1].reset_index(drop=True)
df_y = df_y.iloc[::-1].reset_index(drop=True)

# Untouched full-range copies used by the cropping cell.
df_x_copy = df_x.copy()
df_y_copy = df_y.copy()

In [None]:
# Crop df_x / df_y to the 700-2000 window of the full-range copies.
# x_begin / x_end start as x-axis values and are then replaced by the row
# index of the first sample that is >= that value.
x_begin = 700
x_end = 2000
x_begin = df_x_copy.index[(df_x_copy >= x_begin).to_numpy()][0]
x_end = df_x_copy.index[(df_x_copy >= x_end).to_numpy()][0]

# Slice by position (end exclusive) and renumber the rows from 0.
df_x = df_x_copy.iloc[x_begin:x_end].reset_index(drop=True)
df_y = df_y_copy.iloc[x_begin:x_end, :].reset_index(drop=True)

In [None]:
# AsLS baseline removal for every spectrum in df_y, followed by clipping of
# negative values and per-spectrum normalisation to a maximum of 1.
baseline_fitter = Baseline(df_x, check_finite=False)

df_bg = pd.DataFrame()
df_f = pd.DataFrame()
for i in range(df_y.shape[1]):
    # Key both frames by the spectrum's own column name. The unrelated
    # `samle_names` list may be shorter than df_y or contain duplicates, which
    # would break the positional df_bg.iloc[:, i] lookup below.
    df_bg[df_y.columns[i]] = baseline_fitter.asls(df_y.iloc[:, i], lam=1e5, p=0.01)[0]
    df_f[df_y.columns[i]] = df_y.iloc[:, i] - df_bg.iloc[:, i]
df_f[df_f<0] = 0

# Scale every corrected spectrum by its own maximum.
for i in range(df_f.shape[1]):
    df_f.iloc[:, i] = df_f.iloc[:, i] / df_f.iloc[:, i].max()

df_mean = df_f.mean(axis=1)

# One row, two columns: all corrected spectra (left) and their mean (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 4))

axes[0].plot(df_x, df_f.iloc[:, :], color=colors[0])
axes[0].set_title("all")

axes[1].plot(df_x, df_mean, color=colors[1])
axes[1].set_title("mean")

plt.show()

In [None]:
# Reference spectrum: the last column of df_f.
clean_pmma = df_f.iloc[:, -1]
# Spectra to compare against it: every column except the last three.
cd_pmma = df_f.iloc[:, :-3]
# Difference (reference minus spectrum), one column per compared spectrum.
cd_diff = pd.DataFrame(
    {column: clean_pmma - cd_pmma.loc[:, column] for column in cd_pmma.columns}
)

In [None]:
# Mean of the difference spectra, plotted on the current axes.
diff_ax = plt.gca()
diff_ax.plot(df_x, cd_diff.mean(axis=1))
diff_ax.set_xlabel('$wavenumber, cm^{-1}$')
diff_ax.set_ylabel('$a.u.$')
diff_ax.grid()
plt.show()

In [None]:
# PCA of the corrected spectra (one sample per column of df_f) and a scatter
# plot of the scores on the two selected components.
fig, ax = plt.subplots(figsize=(8,6))

pca = PCA(n_components=10)
pca_1 = pca.fit_transform(df_f.iloc[:, :].T)

# 1-based numbers of the components shown on the x and y axes.
pc_1 = 1
pc_2 = 2

scatter = plt.scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1] , s=100, edgecolors='k')

# Raw f-strings: "\ " (mathtext space) is not a valid Python escape sequence
# and triggers a SyntaxWarning in a normal string literal on Python 3.12+.
plt.xlabel(rf'${pc_1}\ pc:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%')
plt.ylabel(rf'${pc_2}\ pc:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%')
plt.title('$PCA$')

plt.show()

In [None]:
# Call the notebook's plot_spectra helper (defined outside this section) with
# the combined df4 spectra and the df3_x axis.
plot_spectra(df4, df3_x)

In [None]:
# Rubberband baseline removal for every spectrum in df3_y. Each corrected
# spectrum is divided by its own maximum, and negative values are clipped to
# zero once all columns are done.
baseline_fitter = Baseline(df3_x, check_finite=False)

df_bg = pd.DataFrame()
df_f = pd.DataFrame()
for i, column in enumerate(df3_y.columns):
    raw = df3_y.iloc[:, i]
    df_bg[column] = baseline_fitter.rubberband(raw, smooth_half_window=15)[0]
    corrected = raw - df_bg.iloc[:, i]
    df_f[column] = corrected / max(corrected)
df_f[df_f<0] = 0

In [None]:
# Column of df_f to compare with the mean clean-PMMA spectrum.
i=9

# Both curves go on the primary y axis of a figure that also has a secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])
overlay_traces = [
    go.Scatter(x=df2_x, y=df2_mean, mode='lines', name='clean_pmma'),
    go.Scatter(x=df3_x, y=df_f.iloc[:, i], mode='lines', name=f'{df_f.columns[i]}'),
]
for trace in overlay_traces:
    fig.add_trace(trace, secondary_y=False)

# Last expression of the cell: update_layout returns the figure, which the
# notebook then displays.
fig.update_layout(
    margin=dict(l=10, r=10, t=10, b=20), width=1300, height=600)

# some

In [None]:
# Chain of conversions applied to each value of dat_mw:
#   dat_1    = value * 1e-6
#   dat_f    = dat_1 / (2.2167 * 1e-8)
#   dat_peak = dat_f / (260 * 1e-15)
# NOTE(review): the physical meaning and units of the constants are not stated
# in this notebook section — document them.
dat_mw = [4, 6, 8, 10, 12, 14, 16]
dat_1 = []
dat_f = []
dat_peak = []
for value_mw in dat_mw:
    scaled = value_mw * 1e-6
    per_first = scaled / (2.2167 * 1e-8)
    per_second = per_first / (260 * 1e-15)
    dat_1.append(scaled)
    dat_f.append(per_first)
    dat_peak.append(per_second)
np.array(dat_peak)

In [None]:
# Scratch calculation; evaluates to 2e-08.
2 * 1e-8

# Fluorescence

In [None]:
# Load the fluorescence spectra of the "561" folder: one tab-separated file
# without a header per sample (column 0 = x axis, column 1 = signal).
# NOTE(review): this cell appends to the global lists `classes`, `classes_n`
# and `samle_names`, so re-running it adds the same entries again.
folder = r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma_fl\fl_data\561'
all_dfs = []
for file in sorted(os.listdir(folder), key=numericalSort):
    file_path = os.path.join(folder, file)
    if not os.path.isfile(file_path):
        continue

    # NOTE(review): `folder` is the full directory path here, so it ends up
    # verbatim inside the class / sample / column names — confirm intended.
    class_name = f"{fold_name}_{folder}"
    column_name = f"{class_name}_{os.path.splitext(file)[0]}"
    if class_name not in classes:
        classes.append(class_name)
    classes_n.append(class_name)
    samle_names.append(column_name)

    # Parse the file once and take both columns from the same frame.
    spectrum = pd.read_csv(file_path, delimiter='\t', header=None)
    df = spectrum.iloc[:, 1].to_frame(name=column_name)
    fl_x = spectrum.iloc[:, 0]
    all_dfs.append(df)

# One column per file; an empty frame when the folder holds no files.
if all_dfs:
    result = pd.concat(all_dfs, axis=1)
else:
    result = pd.DataFrame()

In [None]:
# One line per fluorescence spectrum; the legend entry is the last five
# characters of the column name.
for i, column_name in enumerate(result.columns):
    short_label = f'{column_name[-5:]}'
    plt.plot(fl_x, result.iloc[:, i], color=colors[i], label=short_label)
plt.legend()
plt.show()

In [None]:
# Display the fluorescence x axis (column 0 of the last file read).
fl_x

# raman r

In [None]:
# List the attribute names of baseline_fitter, which include the baseline
# methods used in the following cells.
baseline_fitter.__dir__()

In [None]:
# Screen every listed pybaselines method with its default parameters. For each
# method: subtract the baseline, clip negatives, smooth, z-score and resample
# the spectra, then show PCA scores (left) and PLS scores (right).
baseline_fitter = Baseline(df_x, check_finite=False)

# Names of the Baseline methods to try, in the order they are plotted.
baseline_names = [
    'imodpoly', 'penalized_poly', 'loess', 'quant_reg', 'goldindec',
    'asls', 'iasls', 'airpls', 'arpls', 'drpls', 'iarpls', 'aspls',
    'psalsa', 'derpsalsa', 'brpls', 'lsrpls', 'mpls',
    'mor', 'imor', 'mormol', 'amormol', 'rolling_ball', 'mwmv', 'tophat',
    'mpspline', 'jbcd', 'mixture_model', 'irsqr', 'corner_cutting',
    'pspline_asls', 'pspline_iasls', 'pspline_airpls', 'pspline_arpls',
    'pspline_drpls', 'pspline_iarpls', 'pspline_aspls', 'pspline_psalsa',
    'pspline_derpsalsa', 'pspline_mpls', 'pspline_brpls', 'pspline_lsrpls',
    'noise_median', 'snip', 'swima', 'ipsa', 'ria', 'peak_filling',
    'dietrich', 'golotvin', 'std_distribution', 'fastchrom', 'cwt_br',
    'fabc', 'rubberband',
]
baselines = [getattr(baseline_fitter, name) for name in baseline_names]

# The 501-point resampling grid depends only on df_x, so build it once.
new_x = np.linspace(df_x.iloc[0], df_x.iloc[-1], 501)

for baseline in baselines:
    df_bg = pd.DataFrame()
    df_f = pd.DataFrame()
    df_norm = pd.DataFrame()
    df_y_mod = pd.DataFrame()

    # Baseline-subtract every spectrum with the current method.
    for i in range(df_y.shape[1]):
        df_bg[samle_names[i]] = baseline(df_y.iloc[:, i])[0]
        df_f[samle_names[i]] = df_y.iloc[:, i] - df_bg.iloc[:, i]

    df_f[df_f<0] = 0

    # Gaussian smoothing (sigma = 10 samples), then z-score each spectrum.
    for i in range(df_f.shape[1]):
        df_f.iloc[:, i] = gaussian_filter1d(df_f.iloc[:, i], 10)
        df_norm[samle_names[i]] = (df_f.iloc[:, i] - df_f.iloc[:, i].mean()) / df_f.iloc[:, i].std()

    # Cubic interpolation of every spectrum onto the common grid.
    for i in range(df_norm.shape[1]):
        y = df_norm.iloc[:, i]
        x = df_x
        f_interp = interp1d(x, y, kind='cubic')
        new_y = f_interp(new_x)
        df_y_mod[df_norm.columns[i]] = new_y

    new_df_x = new_x.copy()
    new_df_norm = df_y_mod.copy()

    fig, ax = plt.subplots(figsize=(10,4), ncols=2, nrows=1)

    # --- PCA scores (samples are the columns of new_df_norm) ---
    pca = PCA(n_components=20)
    pca_1 = pca.fit_transform((new_df_norm.iloc[:, :].T))

    # 1-based numbers of the components shown on the x and y axes.
    pc_1 = 1
    pc_2 = 2

    ax[0].scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')

    ax[0].set_title(f'PCA {baseline.__name__}')
    # Raw f-strings: "\ " (mathtext space) is not a valid Python escape
    # sequence and triggers a SyntaxWarning in a normal literal on 3.12+.
    ax[0].set_xlabel(rf'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
    ax[0].set_ylabel(rf'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)

    # --- PLS scores against classes_mw ---
    pls_binary = PLSRegression(n_components=10)
    X_pls = pls_binary.fit_transform(df_y_mod.T, classes_mw)[0]

    pc_1 = 1
    pc_2 = 2

    ax[1].scatter(X_pls[:, pc_1-1], X_pls[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')

    ax[1].set_xlabel('Latent Variable 1')
    ax[1].set_ylabel('Latent Variable 2')
    ax[1].set_title(f'PLS {baseline.__name__}')

    plt.tight_layout()
    plt.show()

In [None]:
# Same screening of every listed pybaselines method (default parameters), but
# the last `n_excluded` samples are left out of the PCA and PLS fits.
baseline_fitter = Baseline(df_x, check_finite=False)

# Names of the Baseline methods to try, in the order they are plotted.
baseline_names = [
    'imodpoly', 'penalized_poly', 'loess', 'quant_reg', 'goldindec',
    'asls', 'iasls', 'airpls', 'arpls', 'drpls', 'iarpls', 'aspls',
    'psalsa', 'derpsalsa', 'brpls', 'lsrpls', 'mpls',
    'mor', 'imor', 'mormol', 'amormol', 'rolling_ball', 'mwmv', 'tophat',
    'mpspline', 'jbcd', 'mixture_model', 'irsqr', 'corner_cutting',
    'pspline_asls', 'pspline_iasls', 'pspline_airpls', 'pspline_arpls',
    'pspline_drpls', 'pspline_iarpls', 'pspline_aspls', 'pspline_psalsa',
    'pspline_derpsalsa', 'pspline_mpls', 'pspline_brpls', 'pspline_lsrpls',
    'noise_median', 'snip', 'swima', 'ipsa', 'ria', 'peak_filling',
    'dietrich', 'golotvin', 'std_distribution', 'fastchrom', 'cwt_br',
    'fabc', 'rubberband',
]
baselines = [getattr(baseline_fitter, name) for name in baseline_names]

# Number of trailing samples (columns) excluded from the PCA / PLS fits.
n_excluded = 11

# The 501-point resampling grid depends only on df_x, so build it once.
new_x = np.linspace(df_x.iloc[0], df_x.iloc[-1], 501)

for baseline in baselines:
    df_bg = pd.DataFrame()
    df_f = pd.DataFrame()
    df_norm = pd.DataFrame()
    df_y_mod = pd.DataFrame()

    # Baseline-subtract every spectrum with the current method.
    for i in range(df_y.shape[1]):
        df_bg[samle_names[i]] = baseline(df_y.iloc[:, i])[0]
        df_f[samle_names[i]] = df_y.iloc[:, i] - df_bg.iloc[:, i]

    df_f[df_f<0] = 0

    # Gaussian smoothing (sigma = 10 samples), then z-score each spectrum.
    for i in range(df_f.shape[1]):
        df_f.iloc[:, i] = gaussian_filter1d(df_f.iloc[:, i], 10)
        df_norm[samle_names[i]] = (df_f.iloc[:, i] - df_f.iloc[:, i].mean()) / df_f.iloc[:, i].std()

    # Cubic interpolation of every spectrum onto the common grid.
    for i in range(df_norm.shape[1]):
        y = df_norm.iloc[:, i]
        x = df_x
        f_interp = interp1d(x, y, kind='cubic')
        new_y = f_interp(new_x)
        df_y_mod[df_norm.columns[i]] = new_y

    new_df_x = new_x.copy()
    new_df_norm = df_y_mod.copy()

    fig, ax = plt.subplots(figsize=(10,4), ncols=2, nrows=1)

    # --- PCA scores without the excluded samples ---
    pca = PCA(n_components=20)
    pca_1 = pca.fit_transform((new_df_norm.iloc[:, :-n_excluded].T))

    # 1-based numbers of the components shown on the x and y axes.
    pc_1 = 1
    pc_2 = 2

    ax[0].scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points[:-n_excluded] , s=100, edgecolors='k')

    ax[0].set_title(f'PCA {baseline.__name__}')
    # Raw f-strings: "\ " (mathtext space) is not a valid Python escape
    # sequence and triggers a SyntaxWarning in a normal literal on 3.12+.
    ax[0].set_xlabel(rf'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
    ax[0].set_ylabel(rf'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)

    # --- PLS scores against classes_mw, same samples as the PCA ---
    pls_binary = PLSRegression(n_components=10)
    X_pls = pls_binary.fit_transform(df_y_mod.iloc[:, :-n_excluded].T, classes_mw[:-n_excluded])[0]

    pc_1 = 1
    pc_2 = 2

    ax[1].scatter(X_pls[:, pc_1-1], X_pls[:, pc_2-1], c=colors_for_points[:-n_excluded] , s=100, edgecolors='k')

    ax[1].set_xlabel('Latent Variable 1')
    ax[1].set_ylabel('Latent Variable 2')
    ax[1].set_title(f'PLS {baseline.__name__}')

    plt.tight_layout()
    plt.show()

In [None]:
# Display the resampled, normalised spectra left by the last loop iteration
# (one column per sample).
new_df_norm

In [None]:
# Morphological ("mor") baseline: PCA and PLS scores (top row) together with
# the corresponding loadings, each point labelled by its x position (bottom row).
baseline_fitter = Baseline(df_x, check_finite=False)

# NOTE(review): `la` only ends up in the PCA panel title; mor() is called with
# half_window=None, so the values in `hw` never reach the fit. Pass
# half_window=la if a half-window sweep was intended.
hw = [100]

# The 501-point resampling grid depends only on df_x, so build it once.
new_x = np.linspace(df_x.iloc[0], df_x.iloc[-1], 501)

for la in hw:
    df_bg = pd.DataFrame()
    df_f = pd.DataFrame()
    df_norm = pd.DataFrame()
    df_y_mod = pd.DataFrame()

    # Baseline-subtract every spectrum.
    for i in range(df_y.shape[1]):
        df_bg[samle_names[i]] = baseline_fitter.mor(df_y.iloc[:, i], half_window=None)[0]
        df_f[samle_names[i]] = df_y.iloc[:, i] - df_bg.iloc[:, i]

    df_f[df_f<0] = 0

    # Gaussian smoothing (sigma = 10 samples), then z-score each spectrum.
    for i in range(df_f.shape[1]):
        df_f.iloc[:, i] = gaussian_filter1d(df_f.iloc[:, i], 10)
        df_norm[samle_names[i]] = (df_f.iloc[:, i] - df_f.iloc[:, i].mean()) / df_f.iloc[:, i].std()

    # Cubic interpolation of every spectrum onto the common grid.
    for i in range(df_norm.shape[1]):
        y = df_norm.iloc[:, i]
        x = df_x
        f_interp = interp1d(x, y, kind='cubic')
        new_y = f_interp(new_x)
        df_y_mod[df_norm.columns[i]] = new_y

    new_df_x = new_x.copy()
    new_df_norm = df_y_mod.copy()

    fig, ax = plt.subplots(figsize=(18,10), nrows=2, ncols=2)

    # --- PCA on standardised features (samples are the columns) ---
    pca = PCA(n_components=20)
    pca_1 = pca.fit_transform(StandardScaler().fit_transform(new_df_norm.iloc[:, :].T))

    # 1-based numbers of the components shown on the x and y axes.
    pc_1 = 1
    pc_2 = 2

    ax[0, 0].scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')

    ax[0, 0].set_title(f'{la}')
    # Raw f-strings: "\ " (mathtext space) is not a valid Python escape
    # sequence and triggers a SyntaxWarning in a normal literal on 3.12+.
    ax[0, 0].set_xlabel(rf'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
    ax[0, 0].set_ylabel(rf'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)

    # PCA loadings, each point labelled with its x-grid value.
    ax[1, 0].scatter(pca.components_[pc_1-1, :], pca.components_[pc_2-1, :])
    for i, label in enumerate(new_df_x):
        ax[1, 0].text(pca.components_[pc_1-1, i], pca.components_[pc_2-1, i], f'{label:.0f}', fontsize=3)

    # --- PLS on standardised features against classes_mw ---
    pls_binary = PLSRegression(n_components=10)
    X_pls = pls_binary.fit_transform(StandardScaler().fit_transform(df_y_mod.T), classes_mw)[0]

    pc_1 = 1
    pc_2 = 2

    ax[0, 1].scatter(X_pls[:, pc_1-1], X_pls[:, pc_2-1], c=colors_for_points , s=100, edgecolors='k')

    ax[0, 1].set_xlabel('Latent Variable 1')
    ax[0, 1].set_ylabel('Latent Variable 2')
    # Name the method actually used here. The previous title read the global
    # `baseline` left over from another cell's loop, which is either undefined
    # on a fresh kernel or names a different method.
    ax[0, 1].set_title('PLS mor')

    # PLS x-loadings, each point labelled with its x-grid value.
    ax[1, 1].scatter(pls_binary.x_loadings_[:, pc_1-1], pls_binary.x_loadings_[:, pc_2-1])
    for i, label in enumerate(new_df_x):
        ax[1, 1].text(pls_binary.x_loadings_[i, pc_1-1], pls_binary.x_loadings_[i, pc_2-1], f'{label:.0f}', fontsize=3)

    plt.show()

In [None]:
baseline_fitter = Baseline(df_x, check_finite=False)

# baselines = [
#     baseline_fitter.imodpoly,
#     baseline_fitter.penalized_poly,
#     baseline_fitter.loess,
#     baseline_fitter.quant_reg,
#     baseline_fitter.goldindec, 
#     baseline_fitter.asls,
#     baseline_fitter.iasls, 
#     baseline_fitter.airpls, 
#     baseline_fitter.arpls,
#     baseline_fitter.drpls, 
#     baseline_fitter.iarpls, 
#     baseline_fitter.aspls,
#     baseline_fitter.psalsa, 
#     baseline_fitter.derpsalsa,
#     baseline_fitter.brpls, 
#     baseline_fitter.lsrpls,
#     baseline_fitter.mpls,
#     baseline_fitter.mor, 
#     baseline_fitter.imor, 
#     baseline_fitter.mormol, 
#     baseline_fitter.amormol, 
#     baseline_fitter.rolling_ball, 
#     baseline_fitter.mwmv, 
#     baseline_fitter.tophat, 
#     baseline_fitter.mpspline, 
#     baseline_fitter.jbcd, 
#     baseline_fitter.mixture_model, 
#     baseline_fitter.irsqr,
#     baseline_fitter.corner_cutting, 
#     baseline_fitter.pspline_asls, 
#     baseline_fitter.pspline_iasls, 
#     baseline_fitter.pspline_airpls, 
#     baseline_fitter.pspline_arpls, 
#     baseline_fitter.pspline_drpls, 
#     baseline_fitter.pspline_iarpls, 
#     baseline_fitter.pspline_aspls, 
#     baseline_fitter.pspline_psalsa, 
#     baseline_fitter.pspline_derpsalsa, 
#     baseline_fitter.pspline_mpls, 
#     baseline_fitter.pspline_brpls, 
#     baseline_fitter.pspline_lsrpls, 
#     baseline_fitter.noise_median, 
#     baseline_fitter.snip, 
#     baseline_fitter.swima, 
#     baseline_fitter.ipsa, 
#     baseline_fitter.ria, 
#     baseline_fitter.peak_filling, 
#     baseline_fitter.dietrich, 
#     baseline_fitter.golotvin, 
#     baseline_fitter.std_distribution, 
#     baseline_fitter.fastchrom, 
#     baseline_fitter.cwt_br, 
#     baseline_fitter.fabc, 
#     baseline_fitter.rubberband, 
# ]
# lamds = [1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,20,30,40,50,100,1e3,1e4,1e5,1e6,1e7,1e8,1e9,1e10,1e11,1e12]
# lamds = [100,1e3,1e4,1e5,1e6,1e7,1e8,1e9,1e10,1e11,1e12]
# NOTE(review): `la` only ends up in the PCA panel title. `mor` below is called with
# half_window=None, so the value 100 has no effect on the baseline itself — confirm
# whether `half_window=la` was intended.
hw = [100]

# Baseline algorithm applied to every spectrum. Its name also labels the PLS panel, so
# the title always matches the method actually used (previously the title read a
# `baseline` variable that is not defined in this cell).
baseline_method = baseline_fitter.mor

for la in hw:
    df_bg = pd.DataFrame()
    df_f = pd.DataFrame()
    df_norm = pd.DataFrame()
    df_y_mod = pd.DataFrame()

    # 1) Estimate the baseline of every spectrum (column of df_y) and subtract it.
    for i in range(df_y.shape[1]):
        df_bg[samle_names[i]] = baseline_method(df_y.iloc[:, i], half_window=None)[0]
        df_f[samle_names[i]] = df_y.iloc[:, i] - df_bg.iloc[:, i]

    # Negative intensities left after the subtraction are clipped to zero.
    df_f[df_f<0] = 0

    # 2) Gaussian smoothing of every spectrum (sigma = 10 points).
    for i in range(df_f.shape[1]):
        df_f.iloc[:, i] = gaussian_filter1d(df_f.iloc[:, i], 10)

    # 3) Per-spectrum standardisation: subtract the mean, divide by the std.
    for i in range(df_f.shape[1]):
        df_norm[samle_names[i]] = (df_f.iloc[:, i] - df_f.iloc[:, i].mean()) / df_f.iloc[:, i].std()

    # 4) Resample every spectrum onto 501 evenly spaced x values (cubic interpolation).
    # NOTE(review): `interp1d` (scipy.interpolate) is not among the imports of the
    # notebook's first cell — make sure it is imported before this cell runs.
    for i in range(df_norm.shape[1]):
        y =  df_norm.iloc[:, i]
        x = df_x
        f_interp = interp1d(x, y, kind='cubic')
        new_x = np.linspace(x.iloc[0], x.iloc[-1], 501)
        new_y = f_interp(new_x)
        df_y_mod[df_norm.columns[i]] = new_y

    new_df_x = new_x.copy()
    new_df_norm = df_y_mod.copy()

    # 2x2 figure: scores on the top row, loadings on the bottom row; PCA left, PLS right.
    fig, ax = plt.subplots(figsize=(18,10), nrows=2, ncols=2)

    # PCA on autoscaled spectra. The last 11 columns (samples) are left out of both
    # models — presumably a separate sample group; confirm.
    pca = PCA(n_components=20)
    pca_1 = pca.fit_transform(StandardScaler().fit_transform(new_df_norm.iloc[:, :-11].T))

    pc_1 = 1
    pc_2 = 2

    ax[0, 0].scatter(pca_1[:, pc_1-1], pca_1[:, pc_2-1], c=colors_for_points[:-11] , s=100, edgecolors='k')

    ax[0, 0].set_title(f'{la}')
    # Raw f-strings: `\ ` is a mathtext space, not a Python escape sequence, so the
    # rendered label is unchanged while the invalid-escape warning disappears.
    ax[0, 0].set_xlabel(rf'${pc_1}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_1-1]*100:.2f}%', fontsize=15)
    ax[0, 0].set_ylabel(rf'${pc_2}\ главная\ компонента:$ {pca.explained_variance_ratio_[pc_2-1]*100:.2f}%', fontsize=15)

    # PCA loadings, every point annotated with its (rounded) x value.
    ax[1, 0].scatter(pca.components_[pc_1-1, :], pca.components_[pc_2-1, :])
    for i, label in enumerate(new_df_x):
        ax[1, 0].text(pca.components_[pc_1-1, i], pca.components_[pc_2-1, i], f'{label:.0f}', fontsize=3)

    # PLS on the same autoscaled spectra against `classes_mw`; [0] keeps the X scores.
    pls_binary = PLSRegression(n_components=10)
    X_pls = pls_binary.fit_transform(StandardScaler().fit_transform(df_y_mod.iloc[:, :-11].T), classes_mw[:-11])[0]

    pc_1 = 1
    pc_2 = 2

    ax[0, 1].scatter(X_pls[:, pc_1-1], X_pls[:, pc_2-1], c=colors_for_points[:-11] , s=100, edgecolors='k')

    ax[0, 1].set_xlabel('Latent Variable 1')
    ax[0, 1].set_ylabel('Latent Variable 2')
    ax[0, 1].set_title(f'PLS {baseline_method.__name__}')

    # PLS X-loadings, annotated the same way as the PCA loadings.
    ax[1, 1].scatter(pls_binary.x_loadings_[:, pc_1-1], pls_binary.x_loadings_[:, pc_2-1])
    for i, label in enumerate(new_df_x):
        ax[1, 1].text(pls_binary.x_loadings_[i, pc_1-1], pls_binary.x_loadings_[i, pc_2-1], f'{label:.0f}', fontsize=3)

    plt.show()

In [None]:
import csv
import math
import os
from collections import Counter, defaultdict


def read_dataset(path):
    """Load a ';'-separated table of samples.

    Layout consumed by this function: one header row, then one row per sample with
    the class label in column 0, the sample name in column 1 and numeric feature
    values from column 2 onwards. Blank lines are ignored.

    Parameters
    ----------
    path : str or path-like
        File to read; decoded as UTF-8.

    Returns
    -------
    feature_names : list of str
        Header cells from column 2 onwards.
    classes : list of str
        Class label of every sample (kept as text).
    names : list of str
        Name of every sample.
    X : list of list of float
        Feature values, one inner list per sample.

    Raises
    ------
    ValueError
        If the file contains no header row, or a feature cell is not a valid float.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation asks for.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=';')
        header = next(reader, None)
        if header is None:
            raise ValueError(f'{path}: file is empty, expected a header row')
        feature_names = header[2:]
        classes = []
        names = []
        X = []
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing empty line);
            # indexing such a row would raise IndexError.
            if not row:
                continue
            classes.append(row[0])
            names.append(row[1])
            X.append([float(v) for v in row[2:]])
    return feature_names, classes, names, X


def copy_matrix(A):
    """Return a new list containing a slice-copy of every row of ``A``.

    The rows are copied one level deep, so editing the result does not touch ``A``.
    """
    duplicate = []
    for source_row in A:
        duplicate.append(source_row[:])
    return duplicate


def transpose(A):
    """Return the transpose of a list-of-rows matrix as a new list of lists."""
    return [list(column) for column in zip(*A)]


def matmul(A, B):
    """Multiply two list-of-rows matrices and return the product as a new matrix.

    ``A`` is (rows x inner) and ``B`` is (inner x cols); the sizes are taken from
    ``len(A)``, ``len(A[0])`` and ``len(B[0])``. Every output entry starts at 0.0
    and accumulates its terms in order of increasing inner index.
    """
    rows, inner, cols = len(A), len(A[0]), len(B[0])
    product = []
    for r in range(rows):
        left_row = A[r]
        acc = [0.0] * cols
        for k in range(inner):
            coeff = left_row[k]
            right_row = B[k]
            # one rank-1 contribution: acc += coeff * B[k]
            acc = [acc[c] + coeff * right_row[c] for c in range(cols)]
        product.append(acc)
    return product


def matvec(A, x):
    """Return the matrix-vector product ``A @ x`` as a list with one float per row of ``A``.

    Each entry is accumulated left to right starting from 0.0.
    """
    result = []
    for matrix_row in A:
        total = 0.0
        for col in range(len(matrix_row)):
            total += matrix_row[col] * x[col]
        result.append(total)
    return result


def dot(a, b):
    """Return the inner product of ``a`` and ``b``.

    Pairing stops at the shorter of the two inputs.
    """
    return sum(map(lambda left, right: left * right, a, b))


def norm(v):
    """Return the Euclidean length of ``v`` (square root of its inner product with itself)."""
    squared_length = dot(v, v)
    return math.sqrt(squared_length)


def center_scale(X):
    """Autoscale the columns of ``X``: subtract the column mean, divide by the column std.

    The standard deviation uses the ``n - 1`` denominator. A column whose sum of
    squared deviations is not positive gets a std of 1.0, so it is centred only.

    Returns
    -------
    (Xs, means, stds)
        The scaled matrix (new list of lists) and the per-column means and stds used.
    """
    n_rows = len(X)
    n_cols = len(X[0])

    # Column sums, visiting the samples in order, then the means.
    totals = [0.0] * n_cols
    for sample in X:
        for col, value in enumerate(sample):
            totals[col] += value
    means = [total / n_rows for total in totals]

    # Sum of squared deviations from the mean, per column.
    sq_dev = [0.0] * n_cols
    for sample in X:
        for col, value in enumerate(sample):
            delta = value - means[col]
            sq_dev[col] += delta * delta

    stds = []
    for ss in sq_dev:
        if ss > 0:
            stds.append(math.sqrt(ss / (n_rows - 1)))
        else:
            stds.append(1.0)

    Xs = []
    for sample in X:
        Xs.append([(sample[col] - means[col]) / stds[col] for col in range(n_cols)])
    return Xs, means, stds


def snv(X):
    """Standard normal variate: centre and scale every row of ``X`` on its own.

    Each row has its mean subtracted and is divided by its standard deviation
    (``len(row) - 1`` denominator, at least 1). A row with non-positive variance
    is only centred (divisor 1.0).
    """
    transformed = []
    for spectrum in X:
        count = len(spectrum)
        mean = sum(spectrum) / count
        variance = sum((v - mean) ** 2 for v in spectrum) / max(1, (count - 1))
        if variance > 0:
            spread = math.sqrt(variance)
        else:
            spread = 1.0
        transformed.append([(v - mean) / spread for v in spectrum])
    return transformed


def gram_matrix(X):
    """Return the symmetric matrix of pairwise inner products between the rows of ``X``.

    Only the diagonal and the upper triangle are computed; every off-diagonal value
    is mirrored into the lower triangle.
    """
    size = len(X)
    G = [[0.0] * size for _ in range(size)]
    for a, row_a in enumerate(X):
        G[a][a] = dot(row_a, row_a)
        for b in range(a + 1, size):
            G[a][b] = G[b][a] = dot(row_a, X[b])
    return G


def power_eigen_symmetric(A, k=2, max_iter=300, tol=1e-9):
    """Leading eigenpairs of a symmetric matrix by power iteration with deflation.

    Component ``c`` starts from the canonical basis vector ``e[c % n]`` with the
    already found eigenvectors projected out, and is re-orthogonalised against them
    after every multiplication by ``A``.

    Compared with a plain "multiply, deflate, normalise" loop this version
    * orthogonalises the *starting* vector too, so the vector that is finally
      reported is always orthogonal to the earlier ones;
    * treats a deflated product that is tiny relative to the size of ``A`` as zero
      instead of normalising rounding noise, so rank-deficient matrices (or ``k``
      above the rank) yield an eigenvalue of ~0 rather than a duplicate of an
      earlier eigenpair.

    Intended for positive semi-definite input such as a Gram matrix: with a dominant
    negative eigenvalue the iterate flips sign every step and the loop only ends at
    ``max_iter``. Because the start vectors are basis vectors, a start that is
    exactly orthogonal to the dominant eigenvector (e.g. a diagonal ``A``) can
    return the eigenvalues out of order.

    Parameters
    ----------
    A : list of list of float
        Symmetric n x n matrix.
    k : int
        Number of eigenpairs to extract.
    max_iter : int
        Maximum number of power iterations per component.
    tol : float
        Stop when the Euclidean change of the iterate falls below this value.

    Returns
    -------
    (values, vectors)
        ``values[c]`` is the Rayleigh quotient ``x^T A x`` of ``vectors[c]``, a
        unit-length list. If no direction independent of the earlier eigenvectors
        is left, a zero vector with value 0.0 is reported for that component.
    """
    n = len(A)
    # The infinity norm bounds the spectral radius; residuals far below it are noise.
    scale = max((sum(abs(v) for v in row) for row in A), default=0.0)
    null_tol = 1e-12 * scale

    vectors = []
    values = []

    def deflate(v):
        """Subtract from ``v`` (in place) its components along the eigenvectors found so far."""
        for q in vectors:
            proj = dot(v, q)
            for i in range(n):
                v[i] -= proj * q[i]
        return v

    for comp in range(k):
        # Preferred start is e[comp % n]; if it lies (numerically) in the span of
        # the eigenvectors already found, try the following basis vectors instead.
        x = None
        for shift in range(n):
            candidate = [0.0] * n
            candidate[(comp + shift) % n] = 1.0
            length = norm(deflate(candidate))
            if length > 1e-6:
                x = [v / length for v in candidate]
                break
        if x is None:
            # More components requested than independent directions exist.
            vectors.append([0.0] * n)
            values.append(0.0)
            continue

        for _ in range(max_iter):
            y = deflate(matvec(A, x))
            ny = norm(y)
            if ny <= null_tol:
                # A maps x to (numerically) zero in the remaining subspace:
                # x already is an eigenvector with eigenvalue ~0.
                break
            y = [v / ny for v in y]
            diff = math.sqrt(sum((y[i] - x[i]) ** 2 for i in range(n)))
            x = y
            if diff < tol:
                break

        values.append(dot(x, matvec(A, x)))
        vectors.append(x)
    return values, vectors


def pca_scores(X, n_components=2):
    """PCA scores of the rows of ``X`` computed from the Gram matrix of the rows.

    The eigenpairs of the Gram matrix are obtained with ``power_eigen_symmetric``;
    the scores of component ``c`` are its eigenvector scaled by the square root of
    its eigenvalue (negative eigenvalues are clamped to 0).

    Returns
    -------
    (scores, explained)
        ``scores[i][c]`` is the score of sample ``i`` on component ``c``;
        ``explained[c]`` is the eigenvalue divided by the trace of the Gram matrix
        (0.0 when the trace is not positive).
    """
    n_samples = len(X)
    gram = gram_matrix(X)
    eigvals, eigvecs = power_eigen_symmetric(gram, k=n_components)

    scalings = [math.sqrt(max(eigvals[c], 0.0)) for c in range(n_components)]
    scores = [
        [eigvecs[c][i] * scalings[c] for c in range(n_components)]
        for i in range(n_samples)
    ]

    trace = sum(gram[i][i] for i in range(len(gram)))
    explained = []
    for value in eigvals:
        if trace > 0:
            explained.append(max(value, 0.0) / trace)
        else:
            explained.append(0.0)
    return scores, explained


def one_hot(labels):
    """One-hot encode ``labels``.

    The distinct labels are ordered by their numeric value (``float(label)``).

    Returns
    -------
    (Y, uniq)
        ``Y`` has one row per label with 1.0 in the column of its class and 0.0
        elsewhere; ``uniq`` is the ordered list of distinct labels (column order).
    """
    uniq = sorted(set(labels), key=float)
    position = dict(zip(uniq, range(len(uniq))))
    width = len(uniq)
    Y = [
        [1.0 if col == position[lab] else 0.0 for col in range(width)]
        for lab in labels
    ]
    return Y, uniq


def matrix_inverse(A):
    """Invert a square matrix by Gauss-Jordan elimination with partial pivoting.

    Works on an augmented copy ``[A | I]``; ``A`` itself is not modified.

    Raises
    ------
    ValueError
        ``'Singular matrix'`` when the best available pivot in a column is smaller
        than 1e-12 in absolute value.
    """
    size = len(A)
    width = 2 * size

    # Augmented matrix [A | I].
    aug = []
    for r, source_row in enumerate(A):
        unit = [0.0] * size
        unit[r] = 1.0
        aug.append(source_row[:] + unit)

    for col in range(size):
        # Partial pivoting: first row at or below the diagonal with the largest |entry|.
        pivot = col
        for r in range(col + 1, size):
            if abs(aug[r][col]) > abs(aug[pivot][col]):
                pivot = r
        if abs(aug[pivot][col]) < 1e-12:
            raise ValueError('Singular matrix')
        if pivot != col:
            aug[col], aug[pivot] = aug[pivot], aug[col]

        # Scale the pivot row so that the pivot becomes 1.
        pivot_row = aug[col]
        lead = pivot_row[col]
        for j in range(width):
            pivot_row[j] /= lead

        # Clear the pivot column in every other row.
        for r in range(size):
            if r != col:
                other = aug[r]
                factor = other[col]
                for j in range(width):
                    other[j] -= factor * pivot_row[j]

    return [full_row[size:] for full_row in aug]


def pls_nipals(X, Y, n_components=2, max_iter=200, tol=1e-8):
    """Fit a PLS model with the NIPALS algorithm on list-of-rows matrices.

    For every component the inner loop alternates between X weights ``w``
    (unit length), X scores ``t``, Y weights ``c`` (unit length) and Y scores ``u``
    until ``u`` changes by less than ``tol``; X and Y are then deflated by the
    component. ``X`` and ``Y`` are not modified (the function works on copies).

    The Y score vector is initialised with the first column of the current Y
    residual that is not entirely zero. If no such column exists, or a weight or
    score vector degenerates to zero, extraction stops and the remaining columns
    of all returned matrices stay zero. (Starting unconditionally from column 0
    failed with an unbound ``t`` when that column was zero for the first component
    and silently reused the previous component's vectors for later ones.)

    Parameters
    ----------
    X : list of list of float
        n x p predictor matrix.
    Y : list of list of float
        n x q response matrix.
    n_components : int
        Number of latent variables to extract.
    max_iter : int
        Maximum number of inner iterations per component.
    tol : float
        Convergence threshold on the Euclidean change of ``u``.

    Returns
    -------
    dict
        ``'W'`` (p x a X weights), ``'P'`` (p x a X loadings), ``'Q'`` (q x a
        Y loadings) and ``'T'`` (n x a X scores), with a = ``n_components``.
    """
    Xh = copy_matrix(X)
    Yh = copy_matrix(Y)
    n = len(X)
    p = len(X[0])
    q = len(Y[0])

    W = [[0.0] * n_components for _ in range(p)]
    P = [[0.0] * n_components for _ in range(p)]
    Q = [[0.0] * n_components for _ in range(q)]
    T = [[0.0] * n_components for _ in range(n)]

    for a in range(n_components):
        # Starting Y score: first residual column that still carries any signal.
        u = None
        for j in range(q):
            column = [row[j] for row in Yh]
            if any(v != 0.0 for v in column):
                u = column
                break
        if u is None:
            break  # Y residual is exactly zero: nothing left to model

        w = None
        t = None  # stays/gets None when the component turns out to be degenerate
        for _ in range(max_iter):
            # X weights: w = Xh^T u, normalised to unit length.
            w = [0.0] * p
            for j in range(p):
                s = 0.0
                for i in range(n):
                    s += Xh[i][j] * u[i]
                w[j] = s
            nw = norm(w)
            if nw == 0:
                t = None
                break
            w = [v / nw for v in w]

            # X scores.
            t = [dot(row, w) for row in Xh]
            if norm(t) == 0:
                t = None
                break

            # Y weights: c = Yh^T t, normalised to unit length.
            c = [0.0] * q
            for j in range(q):
                s = 0.0
                for i in range(n):
                    s += Yh[i][j] * t[i]
                c[j] = s
            nc = norm(c)
            if nc == 0:
                t = None
                break
            c = [v / nc for v in c]

            # Y scores; converged once they stop moving.
            u_new = [dot(row, c) for row in Yh]
            if norm([u_new[i] - u[i] for i in range(n)]) < tol:
                break
            u = u_new

        if t is None:
            break  # degenerate component (or max_iter < 1): stop extracting

        tt = dot(t, t)
        denom = tt if tt != 0 else 1.0
        # Loadings: regression of the X / Y residuals on the scores t.
        p_vec = [sum(Xh[i][j] * t[i] for i in range(n)) / denom for j in range(p)]
        q_vec = [sum(Yh[i][j] * t[i] for i in range(n)) / denom for j in range(q)]

        for i in range(n):
            T[i][a] = t[i]
        for j in range(p):
            W[j][a] = w[j]
            P[j][a] = p_vec[j]
        for j in range(q):
            Q[j][a] = q_vec[j]

        # Deflation: remove the part of X and Y explained by this component.
        for i in range(n):
            ti = t[i]
            for j in range(p):
                Xh[i][j] -= ti * p_vec[j]
            for j in range(q):
                Yh[i][j] -= ti * q_vec[j]

    return {'W': W, 'P': P, 'Q': Q, 'T': T}


def pls_regression_coeffs(model):
    """Return the PLS regression coefficient matrix ``B = W (P^T W)^-1 Q^T``.

    ``model`` is a dict with the keys ``'W'``, ``'P'`` and ``'Q'``. Multiplying a
    preprocessed X matrix by ``B`` gives the predicted (preprocessed) Y.
    """
    weights = model['W']
    x_loadings = model['P']
    y_loadings = model['Q']

    # Rotation W* = W (P^T W)^-1 maps X directly onto the scores.
    rotation = matmul(weights, matrix_inverse(matmul(transpose(x_loadings), weights)))
    return matmul(rotation, transpose(y_loadings))


def apply_preprocess(X, means, stds):
    """Scale the rows of ``X`` with previously computed column ``means`` and ``stds``.

    Returns a new matrix with ``(value - mean) / std`` applied column by column.
    """
    scaled = []
    for sample in X:
        scaled_row = []
        for col in range(len(sample)):
            scaled_row.append((sample[col] - means[col]) / stds[col])
        scaled.append(scaled_row)
    return scaled


def split_train_test_by_class(X, labels, exclude_label):
    """Split samples into those not labelled ``exclude_label`` and those that are.

    Returns
    -------
    (Xt, yt, Xe, ye)
        Rows and labels of the kept samples, followed by rows and labels of the
        samples whose label equals ``exclude_label``. Input order is preserved.
    """
    Xt, yt, Xe, ye = [], [], [], []
    for sample, label in zip(X, labels):
        rows, tags = (Xe, ye) if label == exclude_label else (Xt, yt)
        rows.append(sample)
        tags.append(label)
    return Xt, yt, Xe, ye


def class_palette(classes):
    """Map every distinct class label to a colour.

    ``classes`` may contain repeats (e.g. the per-sample label list). The labels
    are de-duplicated before colours are assigned, so a class's colour depends only
    on its rank among the distinct labels — not on how many samples each class has.
    (Enumerating the raw list made the colour index grow with the sample count, so
    e.g. classes with 10 samples each all received the same colour.)

    Labels are ranked by numeric value (``float(label)``), ties broken by their
    text so the result is deterministic. The 10-colour base palette is reused
    cyclically when there are more than 10 classes.

    Returns
    -------
    dict
        ``{label: '#rrggbb'}`` in ranked order.
    """
    base = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2','#17becf','#bcbd22','#7f7f7f']
    ordered = sorted(set(classes), key=lambda c: (float(c), str(c)))
    return {c: base[i % len(base)] for i, c in enumerate(ordered)}


def save_scatter_svg(path, points, labels, title, xlabel, ylabel, palette, extra_text=None, highlight_label=None):
    """Write a labelled 2-D scatter plot to ``path`` as a standalone SVG file.

    Parameters
    ----------
    path : str
        Output file (written as UTF-8).
    points : sequence of (x, y)
        Point coordinates; may be empty, in which case only the frame is drawn.
    labels : sequence
        Class label of every point; also used for the legend, sorted by
        ``float(label)``.
    title, xlabel, ylabel : str
        Plot title and axis captions.
    palette : dict
        ``{label: colour}``; labels missing from it are drawn in ``'#000'``.
    extra_text : str, optional
        Note printed below the legend.
    highlight_label : optional
        Points with this label are drawn larger and with a black outline.

    All caller-supplied text is XML-escaped, so characters such as ``&`` or ``<``
    in titles or labels no longer produce an invalid SVG document.
    """
    # Local import keeps the function self-contained within this notebook cell.
    from xml.sax.saxutils import escape

    def esc(value):
        """Escape ``value`` for use as SVG text content."""
        return escape(str(value))

    def esc_attr(value):
        """Escape ``value`` for use inside a double-quoted SVG attribute."""
        return escape(str(value), {'"': '&quot;'})

    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    if xs:
        minx, maxx = min(xs), max(xs)
        miny, maxy = min(ys), max(ys)
    else:
        # No data: fall back to a dummy range so an empty frame can still be drawn.
        minx = maxx = miny = maxy = 0.0
    # 10 % padding; a zero-width range falls back to a padding of 1.0.
    padx = (maxx - minx) * 0.1 or 1.0
    pady = (maxy - miny) * 0.1 or 1.0
    minx -= padx; maxx += padx; miny -= pady; maxy += pady

    # Canvas size, margins (the wide right margin holds the legend) and plot area.
    W,H = 1000,700
    lm,rm,tm,bm = 90,220,70,80
    pw,ph = W-lm-rm, H-tm-bm

    # Data -> pixel transforms (the SVG y axis points down, hence the flip).
    def sx(x): return lm + (x-minx)/(maxx-minx)*pw
    def sy(y): return tm + ph - (y-miny)/(maxy-miny)*ph

    lines = [f'<svg xmlns="http://www.w3.org/2000/svg" width="{W}" height="{H}">',
             '<rect width="100%" height="100%" fill="white"/>',
             f'<text x="{W//2}" y="35" text-anchor="middle" font-size="24" font-family="Arial">{esc(title)}</text>',
             f'<line x1="{lm}" y1="{tm+ph}" x2="{lm+pw}" y2="{tm+ph}" stroke="black"/>',
             f'<line x1="{lm}" y1="{tm}" x2="{lm}" y2="{tm+ph}" stroke="black"/>']

    # Six evenly spaced ticks with value labels on both axes.
    for i in range(6):
        tx = minx + (maxx-minx)*i/5
        ty = miny + (maxy-miny)*i/5
        x = sx(tx); y = sy(ty)
        lines.append(f'<line x1="{x:.1f}" y1="{tm+ph}" x2="{x:.1f}" y2="{tm+ph+6}" stroke="black"/>')
        lines.append(f'<text x="{x:.1f}" y="{tm+ph+24}" text-anchor="middle" font-size="12">{tx:.2f}</text>')
        lines.append(f'<line x1="{lm-6}" y1="{y:.1f}" x2="{lm}" y2="{y:.1f}" stroke="black"/>')
        lines.append(f'<text x="{lm-10}" y="{y+4:.1f}" text-anchor="end" font-size="12">{ty:.2f}</text>')

    # Data points, coloured by class.
    for (x,y),lab in zip(points,labels):
        color = esc_attr(palette.get(lab,'#000'))
        r = 7 if lab == highlight_label else 5
        stroke = 'black' if lab == highlight_label else 'none'
        lines.append(f'<circle cx="{sx(x):.2f}" cy="{sy(y):.2f}" r="{r}" fill="{color}" stroke="{stroke}" opacity="0.82"/>')

    lines.append(f'<text x="{lm+pw/2:.1f}" y="{H-25}" text-anchor="middle" font-size="16">{esc(xlabel)}</text>')
    lines.append(f'<text x="30" y="{tm+ph/2:.1f}" text-anchor="middle" font-size="16" transform="rotate(-90 30 {tm+ph/2:.1f})">{esc(ylabel)}</text>')

    # Legend in the right margin: one swatch per distinct label.
    legend_y = 90
    lines.append(f'<text x="{W-rm+10}" y="{legend_y-20}" font-size="16">Классы</text>')
    for i,lab in enumerate(sorted(set(labels), key=lambda x: float(x))):
        y = legend_y + i*26
        swatch = esc_attr(palette.get(lab, '#000'))
        lines.append(f'<rect x="{W-rm+10}" y="{y-12}" width="14" height="14" fill="{swatch}"/>')
        lines.append(f'<text x="{W-rm+34}" y="{y}" font-size="14">{esc(lab)}</text>')

    if extra_text:
        lines.append(f'<text x="{W-rm+10}" y="{H-90}" font-size="13">{esc(extra_text)}</text>')

    lines.append('</svg>')
    with open(path,'w',encoding='utf-8') as f:
        f.write('\n'.join(lines))


def save_pred_plot(path, true_vals, pred_vals, labels, excluded_label):
    """Save a "true vs. predicted" scatter plot for the left-out-class experiment.

    The x axis carries the true numeric class value, the y axis the PLS prediction.
    Points are coloured by class; the points of ``excluded_label`` are highlighted.
    """
    scatter_points = [pair for pair in zip(true_vals, pred_vals)]
    colours = class_palette(labels)
    plot_title = f'PLS прогноз класса (модель без класса {excluded_label})'
    save_scatter_svg(
        path,
        scatter_points,
        labels,
        plot_title,
        'Истинный класс (числовое значение)',
        'Предсказание PLS',
        colours,
        extra_text='Черная обводка: исключенный из обучения класс',
        highlight_label=excluded_label,
    )


def argmax(row):
    """Return the index of the largest element of ``row``; the first one wins on ties."""
    winner = 0
    top = row[0]
    for pos in range(1, len(row)):
        candidate = row[pos]
        if candidate > top:
            winner = pos
            top = candidate
    return winner


def main(data_path=r'C:\Users\gusen\Downloads\аспер\5сем\data\pmma_init.csv', out_dir='outputs'):
    """Run the whole analysis and write figures plus a text report to ``out_dir``.

    Steps: load the dataset, apply SNV per spectrum and autoscaling per feature,
    then produce
    * a PCA score plot,
    * a PLS score plot against one-hot encoded classes,
    * a PLS-DA score plot annotated with the resubstitution accuracy,
    * a "left-out class" experiment: a PLS regression on the numeric class value is
      trained without the rarest class and then applied to all samples,
    * ``report.txt`` summarising the run.

    Parameters
    ----------
    data_path : str or path-like
        Dataset in the format expected by ``read_dataset``. The default keeps the
        original hard-coded location; pass a path to run on another machine.
    out_dir : str
        Directory for all output files; created if missing.
    """
    os.makedirs(out_dir, exist_ok=True)

    def out(name):
        """Path of an output file inside ``out_dir``."""
        return os.path.join(out_dir, name)

    _, labels, _, Xraw = read_dataset(data_path)

    X = snv(Xraw)
    Xz, _, _ = center_scale(X)

    # PCA
    pca_t, pca_var = pca_scores(Xz, n_components=2)
    palette = class_palette(labels)
    save_scatter_svg(
        out('pca_scores.svg'),
        pca_t,
        labels,
        title='PCA после SNV + autoscaling',
        xlabel=f'PC1 ({pca_var[0]*100:.1f}% для первых 2 ПК)',
        ylabel=f'PC2 ({pca_var[1]*100:.1f}% для первых 2 ПК)',
        palette=palette,
    )

    # PLS (multi-class one-hot)
    Y, class_order = one_hot(labels)
    Yz, y_mean, y_std = center_scale(Y)
    pls_model = pls_nipals(Xz, Yz, n_components=2)
    T = pls_model['T']
    save_scatter_svg(
        out('pls_scores.svg'),
        [[row[0], row[1]] for row in T],
        labels,
        title='PLS score plot (2 латентные переменные)',
        xlabel='LV1',
        ylabel='LV2',
        palette=palette,
    )

    # PLS-DA prediction on all data: back-transform the predicted indicator
    # columns and assign every sample to the class with the largest value.
    B = pls_regression_coeffs(pls_model)
    Yhat_z = matmul(Xz, B)
    Yhat = [[Yhat_z[i][j] * y_std[j] + y_mean[j] for j in range(len(y_mean))] for i in range(len(Yhat_z))]
    pred_idx = [argmax(r) for r in Yhat]
    true_idx = [class_order.index(c) for c in labels]
    acc = sum(1 for a, b in zip(pred_idx, true_idx) if a == b) / len(labels)

    # PLS-DA score plot (same scores, but keep separate figure and include accuracy note)
    save_scatter_svg(
        out('pls_da_scores.svg'),
        [[row[0], row[1]] for row in T],
        labels,
        title='PLS-DA: score plot',
        xlabel='LV1',
        ylabel='LV2',
        palette=palette,
        extra_text=f'Точность на всем наборе: {acc*100:.1f}%'
    )

    # "Intermediate class" scenario: leave one class out for training and predict
    # continuous class value. The rarest class is left out; ties are broken by the
    # smallest numeric label so the choice is the same on every run (iterating a
    # set of strings has no stable order between interpreter sessions).
    counts = Counter(labels)
    excluded = min(counts, key=lambda c: (counts[c], float(c)))
    Xt, yt, _, _ = split_train_test_by_class(X, labels, excluded)
    Xt_z, xm_t, xs_t = center_scale(Xt)

    # Autoscale the numeric class value of the training samples.
    y_scalar_t = [[float(c)] for c in yt]
    ys_t_mean = [sum(v[0] for v in y_scalar_t) / len(y_scalar_t)]
    ys_t_std = [math.sqrt(sum((v[0]-ys_t_mean[0])**2 for v in y_scalar_t)/(len(y_scalar_t)-1))]
    Yt_z = [[(v[0]-ys_t_mean[0])/(ys_t_std[0] if ys_t_std[0] else 1.0)] for v in y_scalar_t]

    pls_missing = pls_nipals(Xt_z, Yt_z, n_components=2)
    Bm = pls_regression_coeffs(pls_missing)

    # Predict ALL samples (including the left-out class) with the training scaling.
    Xall_tz = apply_preprocess(X, xm_t, xs_t)
    yhat_z = [row[0] for row in matmul(Xall_tz, Bm)]
    yhat = [v * ys_t_std[0] + ys_t_mean[0] for v in yhat_z]
    ytrue = [float(c) for c in labels]

    save_pred_plot(out('pls_missing_class_prediction.svg'), ytrue, yhat, labels, excluded)

    # File name only, accepting both '\' and '/' as separators on any platform.
    dataset_name = str(data_path).replace('\\', '/').rsplit('/', 1)[-1]
    with open(out('report.txt'), 'w', encoding='utf-8') as f:
        f.write(f'Датасет: {dataset_name}\n')
        f.write(f'Объектов: {len(labels)}\n')
        f.write(f'Классы: {dict(counts)}\n')
        f.write('Предобработка: SNV по спектру + autoscaling по признакам.\n')
        f.write(f'PCA (2 компоненты): вклад внутри первых 2 компонент = {sum(pca_var)*100:.2f}%\n')
        f.write(f'PLS-DA точность (resubstitution): {acc*100:.2f}%\n')
        f.write(f'Эксперимент с исключенным классом для PLS-регрессии: исключен класс {excluded}.\n')


# Entry-point guard: run the full pipeline only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == '__main__':
    main()
