In [231]:
import ast
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

# Вспомогательные функции

## Функция для отрисовки графика размеченных данных

In [282]:
def _parse_intervals(cell, column):
    """Parse one ground-truth cell into its interval collection.

    Non-string cells (for example NaN for an empty CSV field) mean "no
    intervals" and yield an empty list. String cells are parsed with
    ``ast.literal_eval`` so that only Python literals are accepted; arbitrary
    expressions stored in the CSV are rejected instead of being executed.
    """
    if not isinstance(cell, str):
        return []
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"column {column!r} does not hold a valid interval literal: {cell!r}"
        ) from exc


def plot_annotated_data(df, gt_df, selected_file):
    """Plot pressure over time and highlight the annotated recovery/drop intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Series to draw; must provide ``time`` and ``pressure`` columns.
    gt_df : pandas.DataFrame
        Ground-truth table with ``file``, ``recovery`` and ``drop`` columns.
        A string cell must be a Python literal that iterates as
        ``(start, end)`` pairs; any non-string cell is treated as "no intervals".
    selected_file : str
        Value looked up in ``gt_df["file"]``; also shown in the figure title.

    Raises
    ------
    IndexError
        If ``selected_file`` has no row in ``gt_df``.
    ValueError
        If an interval cell is a string that is not a valid Python literal.
    """
    rows = gt_df.loc[gt_df["file"] == selected_file, ["recovery", "drop"]]
    if rows.empty:
        raise IndexError(f"no ground-truth row for file {selected_file!r}")

    # Only the first matching row is used.
    recovery_cell, drop_cell = rows.values[0]
    recovery_intervals = _parse_intervals(recovery_cell, "recovery")
    drop_intervals = _parse_intervals(drop_cell, "drop")

    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # Recovery markers sit on the maximum pressure, drop markers on the minimum;
    # both levels are constant for the figure, so compute them once.
    bands = (
        (recovery_intervals, "green", "Recovery", df["pressure"].max()),
        (drop_intervals, "red", "Drop", df["pressure"].min()),
    )
    for intervals, color, label, level in bands:
        for start, end in intervals:
            # Shaded background band plus a thick horizontal bar over the same span.
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=label)
            fig.add_trace(go.Scatter(x=[start, end], y=[level, level],
                                     mode="lines", line=dict(color=color, width=4), name=f"{label} (Bold)"))

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для отрисовки графика неразмеченных данных

In [233]:
def plot_data(df, selected_file):
    """Draw an interactive pressure-over-time line chart for an unannotated series.

    ``df`` must provide ``time`` and ``pressure`` columns; ``selected_file`` is
    only used in the figure title. The figure is shown and nothing is returned.
    """
    axis_and_theme = dict(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")
    # Range slider on the x axis, zoomable y axis, legend pinned to the top-right corner.
    interaction = dict(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top"),
    )

    figure = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file}", markers=True)
    for layout_options in (axis_and_theme, interaction):
        figure.update_layout(**layout_options)
    figure.show()

## Функция для вычисления MAE по частотному спектру

In [234]:
def fft_mae(df, df_reduced):
    """Mean absolute error between the FFT magnitude spectra of two pressure series.

    Both frames must provide a ``pressure`` column. The two magnitude spectra
    are truncated to the length of the shorter one and compared index by index.

    NOTE(review): the spectra come from series of different lengths, so equal
    indices do not correspond to equal frequencies; the result looks intended
    as a relative score between candidate reductions - confirm before using it
    as an absolute spectral error.
    """
    magnitudes = [np.abs(np.fft.fft(frame["pressure"])) for frame in (df, df_reduced)]
    shared_len = min(len(spectrum) for spectrum in magnitudes)
    reference, candidate = (spectrum[:shared_len] for spectrum in magnitudes)
    return mean_absolute_error(reference, candidate)

# EDA

## Анализ размеченных данных

In [235]:
# Relative locations of the raw training series and of the ground-truth
# annotation table.
train_dir = "../src/raw_data/train"
ground_truth_path = "../src/raw_data/ground_truth.csv"

# Every file named in the ground-truth table is treated as annotated.
gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].tolist()

In [236]:
# Use the first annotated file as the example series for the analysis below.
selected_file = annotated_files[0]
file_path = os.path.join(train_dir, selected_file)

In [237]:
# The file is read as headerless, whitespace-separated text with two columns
# that are named here: time and pressure.
df = pd.read_csv(file_path, sep="\\s+", header=None, names=["time", "pressure"])

### Понижение размерности

Отклонение давления и времени соседних записей:

In [238]:
# Absolute pressure change and time step between consecutive records
# (the leading NaN produced by diff() is dropped).
pressure_diffs = df["pressure"].diff().abs().dropna()
time_diffs = df["time"].diff().dropna()

max_pressure_diff, min_pressure_diff = pressure_diffs.max(), pressure_diffs.min()
max_time_diffs, min_time_diffs = time_diffs.max(), time_diffs.min()
mean_pressure_diff = pressure_diffs.mean()

# The reported minimum pressure step ignores zero differences (repeated values).
print(f"Max p diff: {max_pressure_diff}")
print(f"Min p diff: {min(pressure_diffs[pressure_diffs > 0])}")

print(f"Max time diff: {max_time_diffs}")
print(f"Min time diff: {min_time_diffs}")

Max p diff: 36.216525000000004
Min p diff: 0.009677999999979647
Max time diff: 69.17666699999972
Min time diff: 0.0002769999991869554


Записи, значение pressure которых отличается от значения предыдущей записи не более чем на threshold, будут удалены

In [239]:
# Sweep the threshold from the smallest non-zero pressure step up to the mean
# step (growing it by 10% per iteration) and record the FFT MAE of each
# reduced series; stop early once the error exceeds 1000.
threshold = min(pressure_diffs[pressure_diffs > 0])
error = 0

thresholds = []
errors = []

# The diff is loop-invariant, so compute it once. Its leading NaN is replaced
# by +inf BEFORE the comparison so the first sample is always kept: a NaN
# compares as False, so filling after the comparison has no effect.
abs_pressure_diff = df["pressure"].diff().abs().fillna(np.inf)

while threshold <= mean_pressure_diff and error <= 1000:
    df_reduced = df.loc[abs_pressure_diff > threshold]
    error = fft_mae(df, df_reduced)

    thresholds.append(threshold)
    errors.append(error)

    threshold *= 1.1

df_plot = pd.DataFrame({"Threshold": thresholds, "FFT MAE": errors})
fig = px.line(df_plot, x="Threshold", y="FFT MAE", markers=True,
              title="Изменение ошибки FFT MAE от threshold")
fig.update_layout(xaxis_title="Threshold", yaxis_title="FFT MAE", template="plotly_white")
fig.show()

In [288]:
def calc_threshold(df, max_error=1000, growth=1.1):
    """Pick a pressure-difference threshold for dropping near-duplicate samples.

    Starting from the smallest non-zero absolute pressure step, the threshold
    is multiplied by ``growth`` while it stays at or below the mean absolute
    step and the ``fft_mae`` of the previously tried reduction is at most
    ``max_error``. The second-to-last tried threshold is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``pressure`` column.
    max_error : float, default 1000
        Largest ``fft_mae`` value that still lets the sweep continue.
    growth : float, default 1.1
        Multiplicative step of the sweep; must be greater than 1.

    Returns
    -------
    float
        The chosen threshold. If fewer than two thresholds were tried, the
        smallest non-zero step is returned. If the series has no non-zero step
        at all (constant or shorter than two rows), ``0.0`` is returned.

    Raises
    ------
    ValueError
        If ``growth`` is not greater than 1 (the sweep would never terminate).
    """
    if growth <= 1:
        raise ValueError("growth must be greater than 1")

    abs_diffs = df["pressure"].diff().abs()
    pressure_diffs = abs_diffs.dropna()
    positive_diffs = pressure_diffs[pressure_diffs > 0]
    if positive_diffs.empty:
        # Nothing ever changes, so there is no meaningful step to start from.
        return 0.0

    smallest_step = positive_diffs.min()
    mean_pressure_diff = pressure_diffs.mean()
    # Rows without a predecessor have a NaN diff; NaN compares as False, so
    # they must be kept explicitly or the first sample is silently dropped.
    keep_always = abs_diffs.isna()

    threshold = smallest_step
    error = 0
    thresholds = []
    while threshold <= mean_pressure_diff and error <= max_error:
        df_reduced = df.loc[keep_always | (abs_diffs > threshold)]
        error = fft_mae(df, df_reduced)
        thresholds.append(threshold)
        threshold *= growth

    if len(thresholds) < 2:
        return smallest_step
    return thresholds[-2]

In [241]:
# Keep only samples whose pressure differs from the previous one by more than
# the chosen threshold. The leading NaN diff is filled with +inf before the
# comparison so the first sample is always kept (a NaN compares as False, so
# filling after the comparison has no effect).
df_reduced = df.loc[df["pressure"].diff().abs().fillna(np.inf) > calc_threshold(df)]

#### Анализ графиков

In [242]:
# Full-resolution series with the annotated intervals.
plot_annotated_data(df, gt_df, selected_file)

In [243]:
# Same view for the reduced series, to compare visually with the plot of df.
plot_annotated_data(df_reduced, gt_df, selected_file)

Алгоритм понижения размерности в полной мере реализован в utils/reducer.py

### Избавление от шумов

#### Фильтр Савицкого-Голея

In [244]:
from scipy.signal import savgol_filter

In [245]:
# Start the smoothed frame from the original time axis; the pressure column is
# added separately.
filtered_df = pd.DataFrame({"time": df["time"]})

In [246]:
# Smooth the pressure with a Savitzky-Golay filter (20-sample window,
# second-order polynomial).
# NOTE(review): window_length is even here; some SciPy releases only accept an
# odd window - confirm against the installed version.
filtered_df["pressure"] = savgol_filter(df["pressure"], window_length=20, polyorder=2)

In [247]:
# Smoothed series with the annotated intervals.
plot_annotated_data(filtered_df, gt_df, selected_file)

In [248]:
# Apply the same threshold-based reduction to the smoothed series. The leading
# NaN diff is filled with +inf before the comparison so the first sample is
# always kept (a NaN compares as False, so filling after the comparison has no
# effect).
df_reduced1 = filtered_df.loc[
    filtered_df["pressure"].diff().abs().fillna(np.inf) > calc_threshold(filtered_df)
]
plot_annotated_data(df_reduced1, gt_df, selected_file)

#### Анализ скорости изменения давления

In [297]:
def calc_dif_threshold(df, quantile=0.90, max_error=1000, growth=1.1):
    """Pick a reduction threshold, sweeping up to a quantile of the absolute steps.

    Starting from the smallest non-zero absolute step of ``df["pressure"]``,
    the threshold is multiplied by ``growth`` while it stays at or below the
    ``quantile`` of the absolute steps and the ``fft_mae`` of the previously
    tried reduction is at most ``max_error``. The second-to-last tried
    threshold is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``pressure`` column.
    quantile : float, default 0.90
        Quantile of the absolute steps used as the upper bound of the sweep.
    max_error : float, default 1000
        Largest ``fft_mae`` value that still lets the sweep continue.
    growth : float, default 1.1
        Multiplicative step of the sweep; must be greater than 1.

    Returns
    -------
    float
        The chosen threshold. If fewer than two thresholds were tried, the
        smallest non-zero step is returned. If the series has no non-zero step
        at all (constant or shorter than two rows), ``0.0`` is returned.

    Raises
    ------
    ValueError
        If ``growth`` is not greater than 1 (the sweep would never terminate).
    """
    if growth <= 1:
        raise ValueError("growth must be greater than 1")

    abs_diffs = df["pressure"].diff().abs()
    pressure_diffs = abs_diffs.dropna()
    positive_diffs = pressure_diffs[pressure_diffs > 0]
    if positive_diffs.empty:
        # Nothing ever changes, so there is no meaningful step to start from.
        return 0.0

    smallest_step = positive_diffs.min()
    upper_bound = pressure_diffs.quantile(quantile)
    # Rows without a predecessor have a NaN diff; NaN compares as False, so
    # they must be kept explicitly or the first sample is silently dropped.
    keep_always = abs_diffs.isna()

    threshold = smallest_step
    error = 0
    thresholds = []
    while threshold <= upper_bound and error <= max_error:
        df_reduced = df.loc[keep_always | (abs_diffs > threshold)]
        error = fft_mae(df, df_reduced)
        thresholds.append(threshold)
        threshold *= growth

    if len(thresholds) < 2:
        return smallest_step
    return thresholds[-2]

In [293]:
# First difference of the pressure, kept under the same "pressure" column name
# so the plotting helpers can be reused. The row without a predecessor (NaN
# difference) is dropped.
# NOTE(review): this is a per-sample difference and is not divided by the time
# step - confirm that is the intended "rate of change".
df_dif = pd.DataFrame({"time": df["time"], "pressure": df["pressure"].diff()}).dropna()

In [273]:
# Differenced series with the annotated intervals.
plot_annotated_data(df_dif, gt_df, selected_file)

In [299]:
# Threshold-based reduction of the differenced series. The leading NaN diff is
# filled with +inf before the comparison so the first sample is always kept
# (a NaN compares as False, so filling after the comparison has no effect).
df_dif_reduced = df_dif.loc[
    df_dif["pressure"].diff().abs().fillna(np.inf) > calc_dif_threshold(df_dif)
]

In [306]:
# Reduced differenced series with the annotated intervals.
plot_annotated_data(df_dif_reduced, gt_df, selected_file)

In [307]:
# Savitzky-Golay smoothing of the differenced series (10-sample window,
# second-order polynomial), on the time axis of df_dif.
filtered_df_dif = pd.DataFrame({
    "time": df_dif["time"],
    "pressure": savgol_filter(df_dif["pressure"], window_length=10, polyorder=2),
})

plot_annotated_data(filtered_df_dif, gt_df, selected_file)