In [178]:
import ast
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

# Вспомогательные функции

## Функция для отрисовки графика размеченных данных

In [179]:
def _parse_intervals(raw):
    """Convert one ground-truth cell into a list of ``(start, end)`` pairs.

    Non-string cells (such as a missing value) yield an empty list. String
    cells are parsed with ``ast.literal_eval``, so only Python literals are
    accepted and nothing read from the CSV is executed as code.
    """
    if not isinstance(raw, str):
        return []
    return ast.literal_eval(raw)


def plot_annotated_data(df, gt_df, selected_file):
    """Plot pressure over time and highlight the annotated event intervals.

    Args:
        df: DataFrame with "time" and "pressure" columns.
        gt_df: Ground-truth table with "file", "recovery" and "drop" columns.
            The "recovery"/"drop" cells are expected to hold a string with a
            literal list of ``(start, end)`` pairs; any non-string cell is
            treated as "no intervals".
        selected_file: Value looked up in ``gt_df["file"]``; also shown in the
            figure title.

    Raises:
        IndexError: If ``gt_df`` has no row for ``selected_file``.
        ValueError, SyntaxError: If an interval cell is not a valid literal.

    Recovery intervals are drawn as green bands with a thick line at the
    maximum pressure; drop intervals as red bands with a thick line at the
    minimum pressure. Only the first interval of each kind gets a legend
    entry, so the legend does not repeat once per interval.
    """
    annotation = gt_df.loc[gt_df["file"] == selected_file, ["recovery", "drop"]]
    if annotation.empty:
        raise IndexError(f"no ground-truth row for file {selected_file!r}")

    # Only the first matching row is used, as before.
    recovery_raw, drop_raw = annotation.iloc[0]
    recovery_intervals = _parse_intervals(recovery_raw)
    drop_intervals = _parse_intervals(drop_raw)

    fig = px.line(df, x="time", y="pressure",
                  title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # The marker lines sit at the extremes of the plotted signal; compute them once.
    pressure_top = df["pressure"].max()
    pressure_bottom = df["pressure"].min()

    event_styles = (
        (recovery_intervals, "green", "Recovery", pressure_top),
        (drop_intervals, "red", "Drop", pressure_bottom),
    )
    for intervals, color, label, level in event_styles:
        for position, (start, end) in enumerate(intervals):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3,
                          layer="below", line_width=0, name=label)
            fig.add_trace(go.Scatter(
                x=[start, end], y=[level, level],
                mode="lines", line=dict(color=color, width=4),
                name=f"{label} (Bold)",
                legendgroup=label,             # toggle all intervals of a kind together
                showlegend=(position == 0),    # one legend entry per event kind
            ))

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для отрисовки графика неразмеченных данных

In [180]:
def plot_data(df, selected_file):
    """Show an interactive pressure-vs-time line chart without annotations.

    Args:
        df: DataFrame with "time" and "pressure" columns.
        selected_file: Name shown in the figure title.
    """
    figure = px.line(
        df,
        x="time",
        y="pressure",
        title=f"Визуализация {selected_file}",
        markers=True,
    )

    # Axis captions and theme.
    figure.update_layout(
        xaxis_title="Time",
        yaxis_title="Pressure",
        template="plotly_white",
    )

    # Range slider under the x axis, zoomable y axis, legend in the top-right corner.
    legend_box = {"title": "Legend", "x": 0.99, "y": 0.99, "xanchor": "right", "yanchor": "top"}
    figure.update_layout(
        xaxis={"rangeslider": {"visible": True}, "type": "linear"},
        yaxis={"fixedrange": False},
        legend=legend_box,
    )

    figure.show()

## Функция для вычисления MAE по частотному спектру

In [181]:
def fft_mae(df, df_reduced):
    """Mean absolute error between the FFT magnitude spectra of two signals.

    Args:
        df: DataFrame whose "pressure" column is the reference signal.
        df_reduced: DataFrame whose "pressure" column is the signal to compare.

    Returns:
        MAE between the two magnitude spectra, each truncated to the length
        of the shorter one (bins are compared by position).
    """
    spectra = [np.abs(np.fft.fft(frame["pressure"])) for frame in (df, df_reduced)]

    # The spectra generally differ in length; compare only the common prefix.
    common_len = min(map(len, spectra))
    reference, candidate = (spectrum[:common_len] for spectrum in spectra)
    return mean_absolute_error(reference, candidate)

# EDA

## Анализ размеченных данных

In [182]:
# Raw inputs: directory with the training recordings and the annotation table.
train_dir = "../src/raw_data/train"
ground_truth_path = "../src/raw_data/ground_truth.csv"

# Annotation table and the list of file names taken from its "file" column.
gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].to_list()

In [183]:
# Recording analysed in the rest of the notebook (position in the annotated list).
selected_index = 1
selected_file = annotated_files[selected_index]
file_path = os.path.join(train_dir, selected_file)

In [184]:
# Whitespace-separated file without a header row: first column is time, second is pressure.
df = pd.read_csv(
    file_path,
    sep="\\s+",
    header=None,
    names=["time", "pressure"],
)

### Понижение размерности

Разности давления и времени между соседними записями:

In [185]:
# Absolute pressure change and elapsed time between consecutive rows.
# The first row has no predecessor, so its NaN difference is dropped.
pressure_diffs = df["pressure"].diff().abs().dropna()
time_diffs = df["time"].diff().dropna()

max_pressure_diff = pressure_diffs.max()
min_pressure_diff = pressure_diffs.min()
max_time_diffs = time_diffs.max()
min_time_diffs = time_diffs.min()

mean_pressure_diff = pressure_diffs.mean()

# Smallest non-zero pressure step. Series.min() is vectorised and yields NaN
# (instead of raising) when the signal has no non-zero steps.
min_positive_pressure_diff = pressure_diffs[pressure_diffs > 0].min()

print(f"Max p diff: {max_pressure_diff}")
print(f"Min p diff: {min_positive_pressure_diff}")

print(f"Max time diff: {max_time_diffs}")
print(f"Min time diff: {min_time_diffs}")

Max p diff: 53.231129
Min p diff: 0.9678379999999933
Max time diff: 536.1244450000004
Min time diff: 0.0016669999995428952


Записи, значение pressure которых отличается от значения предыдущей записи не более чем на threshold, будут удалены.

In [186]:
# Sweep candidate thresholds geometrically (x1.1 per step), starting from the
# smallest non-zero pressure step, and record the FFT MAE of the reduced signal.
# The sweep stops once the threshold exceeds the mean step or the error exceeds 1000.

# Absolute step to the previous row; it does not depend on the threshold,
# so it is computed once instead of on every iteration.
abs_pressure_step = df["pressure"].diff().abs()
# Rows with an undefined step (the first row) must be kept explicitly:
# `NaN > threshold` is already False, so a trailing `.fillna(True)` on the
# comparison result never fires and the first row would be dropped.
undefined_step = abs_pressure_step.isna()

threshold = pressure_diffs[pressure_diffs > 0].min()
error = 0

thresholds = []
errors = []

while threshold <= mean_pressure_diff and error <= 1000:
    df_reduced = df.loc[undefined_step | (abs_pressure_step > threshold)]
    error = fft_mae(df, df_reduced)

    thresholds.append(threshold)
    errors.append(error)

    threshold *= 1.1

df_plot = pd.DataFrame({"Threshold": thresholds, "FFT MAE": errors})
fig = px.line(df_plot, x="Threshold", y="FFT MAE", markers=True,
              title="Изменение ошибки FFT MAE от threshold")
fig.update_layout(xaxis_title="Threshold", yaxis_title="FFT MAE", template="plotly_white")
fig.show()

In [187]:
def calc_threshold(df, max_error=1000, growth=1.1):
    """Pick a pressure-step threshold for row reduction via an FFT-MAE sweep.

    Candidate thresholds start at the smallest non-zero absolute pressure step
    and are multiplied by ``growth`` each iteration. For every candidate, rows
    whose step to the previous row exceeds the candidate are kept and the
    result is scored with ``fft_mae`` against the full signal. The sweep stops
    when the candidate exceeds the mean absolute step or the last score
    exceeds ``max_error``.

    Args:
        df: DataFrame with a "pressure" column.
        max_error: Largest FFT MAE that lets the sweep continue.
        growth: Multiplicative step between candidates; must be greater than 1.

    Returns:
        The second-to-last candidate that was evaluated, or the smallest
        non-zero step when fewer than two candidates were evaluated.

    Raises:
        ValueError: If ``growth`` is not greater than 1, or if the signal has
            no non-zero pressure steps.
    """
    if growth <= 1:
        raise ValueError("growth must be greater than 1, otherwise the sweep never terminates")

    # Step to the previous row; independent of the candidate, so computed once.
    abs_step = df["pressure"].diff().abs()
    positive_steps = abs_step[abs_step > 0]
    if positive_steps.empty:
        raise ValueError("cannot derive a threshold: the signal has no non-zero pressure steps")

    smallest_step = positive_steps.min()
    mean_step = abs_step.mean()  # NaN of the first row is skipped by default

    # `NaN > threshold` evaluates to False, so rows with an undefined step
    # (the first row) have to be kept explicitly; a `.fillna(True)` applied
    # after the comparison would never trigger.
    undefined_step = abs_step.isna()

    threshold = smallest_step
    error = 0
    thresholds = []

    while threshold <= mean_step and error <= max_error:
        df_reduced = df.loc[undefined_step | (abs_step > threshold)]
        error = fft_mae(df, df_reduced)
        thresholds.append(threshold)
        threshold *= growth

    if len(thresholds) < 2:
        return smallest_step
    # NOTE(review): when the sweep ends because the candidate passed the mean
    # step (not because of the error bound), the last evaluated candidate was
    # still within max_error, yet the previous one is returned — confirm that
    # this extra conservatism is intended.
    return thresholds[-2]

In [188]:
# Keep rows whose pressure step to the previous row exceeds the chosen threshold.
# The first row has an undefined (NaN) step and is kept explicitly: `NaN > x`
# is False, so `.fillna(True)` after the comparison would not preserve it.
reduction_step = df["pressure"].diff().abs()
df_reduced = df.loc[reduction_step.isna() | (reduction_step > calc_threshold(df))]

#### Анализ графиков

In [189]:
# Full-resolution signal with annotated intervals (baseline for comparison).
plot_annotated_data(df=df, gt_df=gt_df, selected_file=selected_file)

In [190]:
# Reduced signal with the same annotations, to compare against the full one.
plot_annotated_data(df=df_reduced, gt_df=gt_df, selected_file=selected_file)

Алгоритм понижения размерности в полной мере реализован в utils/reducer.py

### Избавление от шумов

In [191]:
from scipy.signal import savgol_filter

In [218]:
# New frame for the smoothed signal; it shares the time axis (and index) of df.
df1 = pd.DataFrame({"time": df["time"]})

In [219]:
# Savitzky-Golay smoothing of the raw pressure: 20-sample window, quadratic fit.
# NOTE(review): window_length is even; older SciPy releases required an odd
# window — confirm against the SciPy version in use.
smoothed_pressure = savgol_filter(df["pressure"], window_length=20, polyorder=2)
df1["pressure"] = smoothed_pressure

In [220]:
# Smoothed signal with annotated intervals.
plot_annotated_data(df=df1, gt_df=gt_df, selected_file=selected_file)

In [212]:
# Apply the same threshold-based reduction to the smoothed signal.
# The first row has an undefined (NaN) step and is kept explicitly: `NaN > x`
# is False, so `.fillna(True)` after the comparison would not preserve it.
smoothed_step = df1["pressure"].diff().abs()
df_reduced1 = df1.loc[smoothed_step.isna() | (smoothed_step > calc_threshold(df1))]
plot_annotated_data(df_reduced1, gt_df, selected_file)

In [221]:
# Original signal once more, for side-by-side comparison with the smoothed plots.
plot_annotated_data(df=df, gt_df=gt_df, selected_file=selected_file)

In [222]:
# New frame for the differenced signal; it reuses the time axis of the reduced smoothed data.
df2 = pd.DataFrame({"time": df_reduced1["time"]})

In [223]:
# Second-order difference of the reduced smoothed pressure, taken row to row
# (not divided by the time step); the first two rows are NaN.
df2["pressure"] = df_reduced1["pressure"].diff().diff()

In [224]:
# Second difference of the pressure against the annotated intervals.
plot_annotated_data(df=df2, gt_df=gt_df, selected_file=selected_file)