In [96]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

# Вспомогательные функции

## Функция для отрисовки графика размеченных данных

In [69]:
def plot_annotated_data(df, gt_df, selected_file):
    events = gt_df[gt_df["file"] == selected_file][["recovery", "drop"]].values[0]
    recovery_intervals = eval(events[0]) if isinstance(events[0], str) else []
    drop_intervals = eval(events[1]) if isinstance(events[1], str) else []

    fig = px.line(df, x="time", y="pressure", title=f"Visualization of {selected_file} (Reduced Data)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    for start, end in recovery_intervals:
        fig.add_vrect(x0=start, x1=end, fillcolor="green", opacity=0.3, layer="below", line_width=0, name="Recovery")
        fig.add_trace(go.Scatter(x=[start, end], y=[df["pressure"].max(), df["pressure"].max()],
                                    mode="lines", line=dict(color="green", width=4), name="Recovery (Bold)"))

    for start, end in drop_intervals:
        fig.add_vrect(x0=start, x1=end, fillcolor="red", opacity=0.3, layer="below", line_width=0, name="Drop")
        fig.add_trace(go.Scatter(x=[start, end], y=[df["pressure"].min(), df["pressure"].min()],
                                    mode="lines", line=dict(color="red", width=4), name="Drop (Bold)"))

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для отрисовки графика неразмеченных данных

In [151]:
def plot_data(df, selected_file):
    fig = px.line(df, x="time", y="pressure", title=f"Visualization of {selected_file} (Reduced Data)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для вычисления MAE по частотному спектру

In [97]:
def fft_mae(df, df_reduced):
    fft_original = np.abs(np.fft.fft(df["pressure"]))
    fft_reduced = np.abs(np.fft.fft(df_reduced["pressure"]))
    
    min_len = min(len(fft_original), len(fft_reduced))
    return mean_absolute_error(fft_original[:min_len], fft_reduced[:min_len])

# EDA

## Анализ размеченных данных

In [71]:
train_dir = "../src/raw_data/train"
ground_truth_path = "../src/raw_data/ground_truth.csv"

gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].tolist()

In [110]:
selected_file = annotated_files[4]
file_path = os.path.join(train_dir, selected_file)

In [111]:
df = pd.read_csv(file_path, sep="\\s+", header=None, names=["time", "pressure"])

### Понижение размерности

Отклонение давления соседних записей:

In [112]:
pressure_diffs = df["pressure"].diff().abs().dropna()

max_pressure_diff = pressure_diffs.max()
min_pressure_diff = pressure_diffs.min()

print(f"Max diff: {max_pressure_diff}")
print(f"Min diff: {min_pressure_diff}")

Max diff: 62.909517
Min diff: 0.0


In [146]:
pressure_diffs.describe()

count    16928.000000
mean         0.527282
std          0.921540
min          0.006111
25%          0.025834
50%          0.642361
75%          0.906389
max         80.460556
Name: time, dtype: float64

Отклонение времени соседних записей:

In [147]:
pressure_diffs = df["time"].diff().dropna()

max_pressure_diff = pressure_diffs.max()
min_pressure_diff = pressure_diffs.min()

mean_time_diff = pressure_diffs.mean()
print(f"Max diff: {max_pressure_diff}")
print(f"Min diff: {min_pressure_diff}")

Max diff: 80.460556
Min diff: 0.006110999999691558


In [116]:
df

Unnamed: 0,time,pressure
0,0.000000,38.335304
1,0.031111,38.335304
2,0.823889,38.335304
3,0.840000,38.335304
4,1.606944,38.335304
...,...,...
16924,8923.243333,36.399626
16925,8924.030556,37.367465
16926,8924.867222,37.367465
16927,8924.885556,37.367465


Записи, значение pressure которых отличаются от значения предыдущей записи меньше чем на threshold, будут удалены

In [149]:
threshold = 0.01

thresholds = []
errors = []

while threshold <= mean_time_diff:
    df_reduced = df.loc[(df["pressure"].diff().abs() > threshold).fillna(True)]
    error = fft_mae(df, df_reduced)
    
    thresholds.append(threshold)
    errors.append(error)
    
    threshold *= 1.1

df_plot = pd.DataFrame({"Threshold": thresholds, "FFT MAE": errors})
fig = px.line(df_plot, x="Threshold", y="FFT MAE", markers=True,
              title="Изменение ошибки FFT MAE от threshold")
fig.update_layout(xaxis_title="Threshold", yaxis_title="FFT MAE", template="plotly_white")
fig.show()

In [138]:
threshold = 0.15
df_reduced = df.loc[(df["pressure"].diff().abs() > threshold).fillna(True)]

Анализ графиков

In [152]:
plot_annotated_data(df, gt_df, selected_file)

In [135]:
plot_annotated_data(df_reduced, gt_df, selected_file)

In [139]:
df_reduced

Unnamed: 0,time,pressure
290,132.229167,39.303142
318,144.786111,38.335304
334,153.322778,39.303142
370,169.757222,38.335304
407,186.985833,39.303142
...,...,...
16895,8908.046389,36.399626
16902,8910.796389,37.367465
16924,8923.243333,36.399626
16925,8924.030556,37.367465


In [140]:
df

Unnamed: 0,time,pressure
0,0.000000,38.335304
1,0.031111,38.335304
2,0.823889,38.335304
3,0.840000,38.335304
4,1.606944,38.335304
...,...,...
16924,8923.243333,36.399626
16925,8924.030556,37.367465
16926,8924.867222,37.367465
16927,8924.885556,37.367465
