In [1]:
import ast
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error

# Вспомогательные функции

## Функция для отрисовки графика размеченных данных

In [2]:
def plot_annotated_data(df, gt_df, selected_file):
    """Plot a pressure series and highlight its labelled recovery/drop intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Series to draw; read through its ``time`` and ``pressure`` columns.
    gt_df : pandas.DataFrame
        Ground-truth table with ``file``, ``recovery`` and ``drop`` columns.
        String cells in ``recovery``/``drop`` are parsed as Python literals
        holding ``(start, end)`` pairs; non-string cells mean "no intervals".
    selected_file : str
        Value of the ``file`` column selecting the row of ``gt_df`` to use.
        The first matching row is used; an ``IndexError`` is raised by the
        row lookup if there is none.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """

    def parse_intervals(cell):
        # literal_eval accepts only Python literals, so the content of the CSV
        # cell cannot execute code the way eval() would.
        return ast.literal_eval(cell) if isinstance(cell, str) else []

    def highlight(intervals, label, color, level):
        # Shade each interval and draw a thick marker line at `level`.
        for position, (start, end) in enumerate(intervals):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=label)
            fig.add_trace(go.Scatter(
                x=[start, end],
                y=[level, level],
                mode="lines",
                line=dict(color=color, width=4),
                name=label,
                # Group the traces so the legend lists each event type once
                # instead of once per interval.
                legendgroup=label,
                showlegend=position == 0,
            ))

    recovery_cell, drop_cell = gt_df[gt_df["file"] == selected_file][["recovery", "drop"]].values[0]
    recovery_intervals = parse_intervals(recovery_cell)
    drop_intervals = parse_intervals(drop_cell)

    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # Recovery markers sit at the top of the pressure range, drop markers at the
    # bottom; both levels are computed once rather than once per interval.
    highlight(recovery_intervals, "Recovery", "green", df["pressure"].max())
    highlight(drop_intervals, "Drop", "red", df["pressure"].min())

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для отрисовки графика неразмеченных данных

In [3]:
def plot_data(df, selected_file):
    """Show an interactive pressure-over-time line chart for one file.

    Parameters
    ----------
    df : pandas.DataFrame
        Series to draw; read through its ``time`` and ``pressure`` columns.
    selected_file : str
        File identifier, used only in the chart title.

    Returns
    -------
    None
        The figure is displayed with ``show()``.
    """
    chart_title = f"Визуализация {selected_file}"
    # Range slider on x, zoomable y, legend pinned to the top-right corner.
    axes_and_legend = {
        "xaxis": {"rangeslider": {"visible": True}, "type": "linear"},
        "yaxis": {"fixedrange": False},
        "legend": {"title": "Legend", "x": 0.99, "y": 0.99, "xanchor": "right", "yanchor": "top"},
    }

    figure = px.line(df, x="time", y="pressure", title=chart_title, markers=True)
    figure.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")
    figure.update_layout(**axes_and_legend)
    figure.show()

## Функция для вычисления MAE по частотному спектру

In [4]:
def fft_mae(df, df_reduced):
    """Mean absolute error between the FFT magnitude spectra of two series.

    The ``pressure`` column of each frame is transformed with ``np.fft.fft``
    and its magnitude taken. Both spectra are cut to the length of the shorter
    one and compared element by element, i.e. bin ``k`` of one spectrum is
    compared with bin ``k`` of the other.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference series with a ``pressure`` column.
    df_reduced : pandas.DataFrame
        Series to compare against the reference, with a ``pressure`` column.

    Returns
    -------
    float
        Result of ``mean_absolute_error`` on the truncated magnitude spectra.
    """
    spectra = [np.abs(np.fft.fft(frame["pressure"])) for frame in (df, df_reduced)]
    common_length = min(map(len, spectra))
    reference, candidate = (spectrum[:common_length] for spectrum in spectra)
    return mean_absolute_error(reference, candidate)

# EDA

## Анализ размеченных данных

In [5]:
# Input locations, relative to the notebook's working directory.
train_dir = "../src/raw_data/train"
test_dir = "../src/raw_data/test"
ground_truth_path = "../src/raw_data/ground_truth.csv"

# One row per annotated file; the "file" column holds the file names.
gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].to_list()

Эти размеченные файлы из теста, а не из трейна:

In [6]:
# Report every annotated file that has no counterpart in the train directory.
missing_from_train = [
    name for name in annotated_files
    if not os.path.exists(os.path.join(train_dir, name))
]
for name in missing_from_train:
    print(f"File not found: {name}")

File not found: 1c0e8d10-ba4a-499f-8159-bde6dc70b1c8
File not found: 1c9db047-e335-46ac-8039-effd8589b25b
File not found: 1cbce6e5-9f0b-419f-9527-7add4e255217


### Проверка гипотезы: drop и recovery чередуются

In [7]:
# Flag every annotated file whose time-ordered events contain two consecutive
# events of the same type, i.e. where drop and recovery do not strictly alternate.
for idx, file in enumerate(annotated_files):
    events = gt_df[gt_df["file"] == file][["recovery", "drop"]].values[0]

    # literal_eval parses the stored interval lists without executing arbitrary
    # code; non-string cells are treated as "no intervals".
    recovery_intervals = ast.literal_eval(events[0]) if isinstance(events[0], str) else []
    drop_intervals = ast.literal_eval(events[1]) if isinstance(events[1], str) else []

    # Tuples sort by start, then end, then type name.
    all_events = sorted(
        [(s, e, "recovery") for s, e in recovery_intervals]
        + [(s, e, "drop") for s, e in drop_intervals]
    )

    event_types = [event_type for _, _, event_type in all_events]
    # One report per file is enough, however many repetitions it contains.
    if any(current == previous for previous, current in zip(event_types, event_types[1:])):
        print(f"Chek annotated_files[{idx}] - {file}")


Chek annotated_files[0] - 00e03657-8e1e-4c8c-a724-1d3c77b48510
Chek annotated_files[11] - 0a497cb3-4a85-4df9-9dc2-b814f4ad33b4
Chek annotated_files[21] - 0bb24bd2-b325-40ae-8de6-a60782494f72
Chek annotated_files[24] - 0c2c3798-bf13-4d0b-8af1-c097862caa17
Chek annotated_files[29] - 0cd8b373-f6a5-4769-95aa-cbb8459b9395
Chek annotated_files[31] - 0cf09d9d-2504-4989-ad68-62d41d151eff
Chek annotated_files[38] - 0d4db2c6-1795-4e37-bb05-f67f238bb483
Chek annotated_files[39] - 0d993ced-dbd8-4079-bacf-a0b300480da8
Chek annotated_files[75] - 1c17a84f-a6b4-4df6-924e-f96690180ebf
Chek annotated_files[77] - 1c565fd4-b9ec-4c59-a2c0-1a5b8c1361d0
Chek annotated_files[84] - 1cbce6e5-9f0b-419f-9527-7add4e255217
Chek annotated_files[88] - 1ced674f-111f-4f68-920b-906f87ade09f
Chek annotated_files[94] - 1deeb502-bbec-439a-a049-1601bc3cfcd8
Chek annotated_files[98] - 1e4b4c18-1e32-45eb-917a-5760e33fbaca


### Оценка изменения давления во время drop/recovery

In [8]:
# Per-file averages, one entry for every file that has at least one interval of
# the given type with data inside it.
recovery_avg_changes = []
drop_avg_changes = []
recovery_avg_rates = []
drop_avg_rates = []
recovery_avg_percentages = []
drop_avg_percentages = []


def analyze_intervals(event_type, intervals, avg_changes_list, avg_rates_list, avg_percentages_list, data=None):
    """Print and accumulate pressure-change statistics for one file's intervals.

    For each ``(start, end)`` interval the rows with ``start <= time <= end``
    are selected and three values are computed from the first and last row:

    * change: last pressure minus first pressure;
    * rate: change divided by the elapsed time (0 when the elapsed time is
      not positive);
    * percentage: change relative to the first pressure, times 100 (0 when the
      first pressure is 0).

    Intervals that select no rows are skipped. If at least one interval had
    data, the means of the three values are printed and appended to the
    corresponding output lists; otherwise ``"<event_type> : no data"`` is printed.

    Parameters
    ----------
    event_type : str
        Label used in the printed summary line.
    intervals : iterable of (start, end)
        Intervals to analyse.
    avg_changes_list, avg_rates_list, avg_percentages_list : list
        Lists that receive this file's mean change, rate and percentage.
    data : pandas.DataFrame, optional
        Frame with ``time`` and ``pressure`` columns. Defaults to the
        notebook-level ``df`` at call time.
    """
    frame = df if data is None else data
    changes = []
    rates = []
    percentages = []

    for number, (start, end) in enumerate(intervals, start=1):
        event_df = frame[(frame["time"] >= start) & (frame["time"] <= end)]
        if event_df.empty:
            continue

        pressure_start = event_df["pressure"].iloc[0]
        pressure_end = event_df["pressure"].iloc[-1]
        pressure_change = pressure_end - pressure_start
        duration = event_df["time"].iloc[-1] - event_df["time"].iloc[0]
        # Guard the two divisions: a single-row interval has zero duration,
        # and a zero starting pressure has no meaningful relative change.
        rate_of_change = pressure_change / duration if duration > 0 else 0
        percentage_change = (pressure_change / pressure_start * 100) if pressure_start != 0 else 0

        changes.append(pressure_change)
        rates.append(rate_of_change)
        percentages.append(percentage_change)

        print(f"{number}) {pressure_change:.3f} | {rate_of_change:.3f} | {percentage_change:.2f}%")

    if changes:
        avg_change = sum(changes) / len(changes)
        avg_rate = sum(rates) / len(rates)
        avg_percentage = sum(percentages) / len(percentages)

        avg_changes_list.append(avg_change)
        avg_rates_list.append(avg_rate)
        avg_percentages_list.append(avg_percentage)

        print(f"{event_type} : {avg_change:.3f} | {avg_rate:.3f} | {avg_percentage:.2f}%")
    else:
        print(f"{event_type} : no data")


for idx, file in enumerate(annotated_files):
    file_path = os.path.join(train_dir, file)

    if not os.path.exists(file_path):
        # Annotated files missing from the train directory are read from test.
        file_path = os.path.join(test_dir, file)

    df = pd.read_csv(file_path, sep="\\s+", header=None, names=["time", "pressure"])

    events = gt_df[gt_df["file"] == file][["recovery", "drop"]].values[0]
    # literal_eval parses the stored interval lists without executing arbitrary
    # code; non-string cells are treated as "no intervals".
    recovery_intervals = ast.literal_eval(events[0]) if isinstance(events[0], str) else []
    drop_intervals = ast.literal_eval(events[1]) if isinstance(events[1], str) else []

    print(f"{idx} {file}")

    analyze_intervals("recovery", recovery_intervals, recovery_avg_changes, recovery_avg_rates, recovery_avg_percentages, data=df)
    analyze_intervals("drop", drop_intervals, drop_avg_changes, drop_avg_rates, drop_avg_percentages, data=df)
    print("\n------------------------------\n")




0 00e03657-8e1e-4c8c-a724-1d3c77b48510
1) 26.499 | 1.616 | 26.56%
2) 45.508 | 0.165 | 38.77%
3) 0.842 | 0.152 | 0.70%
recovery : 24.283 | 0.644 | 22.01%
1) -43.020 | -0.139 | -26.84%
drop : -43.020 | -0.139 | -26.84%

------------------------------

1 00e4dba2-36d2-42b4-beb1-c55aed75f506
recovery : no data
1) -34.842 | -0.006 | -41.27%
drop : -34.842 | -0.006 | -41.27%

------------------------------

2 00f035b7-ad7a-4f30-9081-522a3c10805b
recovery : no data
1) -227.099 | -5.312 | -81.24%
drop : -227.099 | -5.312 | -81.24%

------------------------------

3 01a0c034-6afc-4e73-95fa-621f702a0b7d
recovery : no data
1) -70.101 | -0.146 | -46.07%
drop : -70.101 | -0.146 | -46.07%

------------------------------

4 01a530d3-6496-4515-9fbb-4f44e298fd29
recovery : no data
1) -58.070 | -0.045 | -54.25%
drop : -58.070 | -0.045 | -54.25%

------------------------------

5 01aaea9a-ad84-4e6d-9945-48c8b12437a7
recovery : no data
1) -38.714 | -0.017 | -25.46%
drop : -38.714 | -0.017 | -25.46%

-----

In [9]:
# Average the per-file averages (each file contributes one value per metric).
overall_recovery_avg_change, overall_recovery_avg_rate, overall_recovery_avg_percentage = (
    sum(per_file) / len(per_file)
    for per_file in (recovery_avg_changes, recovery_avg_rates, recovery_avg_percentages)
)
print(f"Avg recovery: {overall_recovery_avg_change:.3f} | {overall_recovery_avg_rate:.3f} | {overall_recovery_avg_percentage:.2f}%")

overall_drop_avg_change, overall_drop_avg_rate, overall_drop_avg_percentage = (
    sum(per_file) / len(per_file)
    for per_file in (drop_avg_changes, drop_avg_rates, drop_avg_percentages)
)
print(f"Avg drop: {overall_drop_avg_change:.3f} | {overall_drop_avg_rate:.3f} | {overall_drop_avg_percentage:.2f}%")

Avg recovery: 83.007 | 0.807 | 170.84%
Avg drop: -77.690 | -0.620 | -48.38%


### Выбор файла

In [10]:
# Index 75 is one of the files reported by the alternation check in this
# notebook's saved output.
selected_file = annotated_files[75]
file_path = os.path.join(train_dir, selected_file)
if not os.path.exists(file_path):
    # Some annotated files are stored in the test directory instead of train,
    # so fall back to it to keep this cell working for any chosen index.
    file_path = os.path.join(test_dir, selected_file)

In [11]:
# Read the selected file as whitespace-separated, header-less text with two
# columns named "time" and "pressure".
df = pd.read_csv(file_path, sep="\\s+", header=None, names=["time", "pressure"])

### Понижение количества записей

Отклонение давления и времени соседних записей:

In [12]:
# Absolute pressure step and time step between consecutive records
# (the first record has no predecessor and is dropped).
pressure_diffs = df["pressure"].diff().abs().dropna()
time_diffs = df["time"].diff().dropna()

max_pressure_diff = pressure_diffs.max()
min_pressure_diff = pressure_diffs.min()
# Smallest non-zero step, computed with the vectorised Series.min() instead of
# the builtin min(), which iterates the Series element by element.
min_positive_pressure_diff = pressure_diffs[pressure_diffs > 0].min()
max_time_diffs = time_diffs.max()
min_time_diffs = time_diffs.min()

mean_pressure_diff = pressure_diffs.mean()

print(f"Max p diff: {max_pressure_diff}")
print(f"Min p diff: {min_positive_pressure_diff}")

print(f"Max time diff: {max_time_diffs}")
print(f"Min time diff: {min_time_diffs}")

Max p diff: 238.37867599999998
Min p diff: 0.009677999999979647
Max time diff: 484.14527700000053
Min time diff: 0.04944499999874097


Записи, значение pressure которых отличается от значения предыдущей записи не более чем на threshold, будут удалены

In [13]:
# Sweep the drop threshold geometrically (x1.1 per step), starting at the
# smallest non-zero pressure step, and record the FFT MAE of the reduced series.
# The sweep stops once the threshold passes the mean step or the error exceeds 1000.
pressure_steps = df["pressure"].diff().abs()
# The first record has no predecessor, so its step is NaN. `NaN > threshold` is
# already False, which made a trailing `.fillna(True)` on the comparison a no-op
# and silently dropped that record; keep such rows explicitly instead.
undefined_step = pressure_steps.isna()

threshold = pressure_diffs[pressure_diffs > 0].min()
error = 0

thresholds = []
errors = []

while threshold <= mean_pressure_diff and error <= 1000:
    df_reduced = df.loc[undefined_step | (pressure_steps > threshold)]
    error = fft_mae(df, df_reduced)

    thresholds.append(threshold)
    errors.append(error)

    threshold *= 1.1

df_plot = pd.DataFrame({"Threshold": thresholds, "FFT MAE": errors})
fig = px.line(df_plot, x="Threshold", y="FFT MAE", markers=True,
              title="Изменение ошибки FFT MAE от threshold")
fig.update_layout(xaxis_title="Threshold", yaxis_title="FFT MAE", template="plotly_white")
fig.show()

In [14]:
def calc_threshold(df):
    """Pick a pressure-step threshold for dropping near-duplicate records.

    Starting from the smallest non-zero absolute pressure step, the threshold
    is multiplied by 1.1 per iteration. At each step the series is reduced to
    the rows whose step exceeds the threshold (rows with an undefined step,
    such as the first record, are always kept) and ``fft_mae`` is evaluated
    against the full series. The sweep stops once the threshold exceeds the
    mean step or the error exceeds 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        Series with a ``pressure`` column.

    Returns
    -------
    float
        The second-to-last threshold tried, or the smallest non-zero step when
        fewer than two thresholds were tried. The last threshold tried is
        discarded whichever condition ended the sweep.

    Raises
    ------
    ValueError
        If the pressure never changes between consecutive records.
    """
    pressure_steps = df["pressure"].diff().abs()
    # `NaN > threshold` is False, so a `.fillna(True)` applied after the
    # comparison never fires and the first record would be dropped; keep rows
    # with an undefined step explicitly.
    undefined_step = pressure_steps.isna()

    valid_steps = pressure_steps.dropna()
    positive_steps = valid_steps[valid_steps > 0]
    if positive_steps.empty:
        raise ValueError("pressure never changes between consecutive records; cannot derive a threshold")

    smallest_step = positive_steps.min()
    mean_pressure_diff = valid_steps.mean()

    threshold = smallest_step
    error = 0
    thresholds = []

    while threshold <= mean_pressure_diff and error <= 1000:
        df_reduced = df.loc[undefined_step | (pressure_steps > threshold)]
        error = fft_mae(df, df_reduced)
        thresholds.append(threshold)
        threshold *= 1.1

    if len(thresholds) < 2:
        return smallest_step
    return thresholds[-2]

In [15]:
# Keep the rows whose absolute pressure step exceeds the tuned threshold.
# The first record has an undefined (NaN) step and `NaN > threshold` is False,
# so it is kept through an explicit isna() test rather than a `.fillna(True)`
# on the comparison result, which has no effect.
pressure_steps = df["pressure"].diff().abs()
df_reduced = df.loc[pressure_steps.isna() | (pressure_steps > calc_threshold(df))]

#### Анализ графиков

In [19]:
# Full-resolution series with the labelled intervals, for comparison with the reduced one.
plot_annotated_data(df, gt_df, selected_file)

In [20]:
# Same file after dropping the records below the step threshold.
plot_annotated_data(df_reduced, gt_df, selected_file)

Алгоритм понижения количества записей в полной мере реализован в utils/reducer.py

### Избавление от шумов

#### Фильтр Савицкого-Голея

In [21]:
from scipy.signal import savgol_filter

In [22]:
# Start the smoothed frame from the original time axis (index preserved).
filtered_df = pd.DataFrame({"time": df["time"]})

In [23]:
# Smooth the pressure with a Savitzky-Golay filter: 20-sample window, quadratic fit.
# NOTE(review): older SciPy releases required an odd window_length — confirm the
# installed version accepts an even value.
filtered_df["pressure"] = savgol_filter(df["pressure"], window_length=20, polyorder=2)

In [24]:
# Smoothed series with the labelled intervals.
plot_annotated_data(filtered_df, gt_df, selected_file)

In [25]:
# Reduce the smoothed series with its own tuned threshold.
# The first record has an undefined (NaN) step and `NaN > threshold` is False,
# so it is kept through an explicit isna() test rather than a `.fillna(True)`
# on the comparison result, which has no effect.
filtered_steps = filtered_df["pressure"].diff().abs()
filtered_df_reduced = filtered_df.loc[filtered_steps.isna() | (filtered_steps > calc_threshold(filtered_df))]
plot_annotated_data(filtered_df_reduced, gt_df, selected_file)

#### Анализ скорости изменения давления

In [26]:
def calc_dif_threshold(df):
    """Pick a step threshold for reducing a series, bounded by the 90th percentile.

    Starting from the smallest non-zero absolute step of the ``pressure``
    column, the threshold is multiplied by 1.1 per iteration. At each step the
    series is reduced to the rows whose step exceeds the threshold (rows with
    an undefined step, such as the first record, are always kept) and
    ``fft_mae`` is evaluated against the full series. The sweep stops once the
    threshold exceeds the 0.90 quantile of the steps or the error exceeds 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        Series with a ``pressure`` column.

    Returns
    -------
    float
        The second-to-last threshold tried, or the smallest non-zero step when
        fewer than two thresholds were tried. The last threshold tried is
        discarded whichever condition ended the sweep.

    Raises
    ------
    ValueError
        If the column never changes between consecutive records.
    """
    pressure_steps = df["pressure"].diff().abs()
    # `NaN > threshold` is False, so a `.fillna(True)` applied after the
    # comparison never fires and the first record would be dropped; keep rows
    # with an undefined step explicitly.
    undefined_step = pressure_steps.isna()

    valid_steps = pressure_steps.dropna()
    positive_steps = valid_steps[valid_steps > 0]
    if positive_steps.empty:
        raise ValueError("pressure never changes between consecutive records; cannot derive a threshold")

    smallest_step = positive_steps.min()
    q90 = valid_steps.quantile(0.90)

    threshold = smallest_step
    error = 0
    thresholds = []

    while threshold <= q90 and error <= 1000:
        df_reduced = df.loc[undefined_step | (pressure_steps > threshold)]
        error = fft_mae(df, df_reduced)
        thresholds.append(threshold)
        threshold *= 1.1

    if len(thresholds) < 2:
        return smallest_step
    return thresholds[-2]

In [27]:
# Difference series: each row holds the pressure change from the previous record;
# the first record has no predecessor and is removed by dropna().
# NOTE(review): this is a plain difference, not divided by the time step —
# confirm that is intended given the uneven sampling intervals.
df_dif = pd.DataFrame({
    "time": df["time"],
    "pressure": df["pressure"].diff(),
}).dropna()

In [28]:
# Pressure-difference series with the labelled intervals.
plot_annotated_data(df_dif, gt_df, selected_file)

In [29]:
# Reduce the difference series with the quantile-bounded threshold.
# The first record has an undefined (NaN) step and `NaN > threshold` is False,
# so it is kept through an explicit isna() test rather than a `.fillna(True)`
# on the comparison result, which has no effect.
dif_steps = df_dif["pressure"].diff().abs()
df_dif_reduced = df_dif.loc[dif_steps.isna() | (dif_steps > calc_dif_threshold(df_dif))]

In [30]:
# Reduced pressure-difference series with the labelled intervals.
plot_annotated_data(df_dif_reduced, gt_df, selected_file)

In [31]:
# Smooth the difference series with a Savitzky-Golay filter (10-sample window,
# quadratic fit) and pair it with the matching time axis.
# NOTE(review): older SciPy releases required an odd window_length — confirm the
# installed version accepts an even value.
smoothed_dif = savgol_filter(df_dif["pressure"], window_length=10, polyorder=2)
filtered_df_dif = pd.DataFrame({"time": df_dif["time"], "pressure": smoothed_dif})

plot_annotated_data(filtered_df_dif, gt_df, selected_file)