In [68]:
import ast
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Вспомогательные функции

## Функция для отрисовки графика размеченных данных

In [69]:
def plot_annotated_data(df, gt_df, selected_file):
    """Plot the pressure curve of one file and highlight its annotated intervals.

    Args:
        df: DataFrame with "time" and "pressure" columns to draw.
        gt_df: Ground-truth DataFrame with "file", "recovery" and "drop" columns.
            The "recovery"/"drop" cells are expected to be strings holding a
            Python literal list of (start, end) pairs; non-string cells
            (e.g. NaN for an empty field) are treated as "no intervals".
        selected_file: Value looked up in gt_df["file"]; also shown in the title.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Raises:
        IndexError: if gt_df has no row for ``selected_file``.
        ValueError, SyntaxError: if an interval cell is not a valid Python literal.
    """

    def _parse_intervals(cell):
        # Non-string cells carry no annotation.
        if not isinstance(cell, str):
            return []
        # literal_eval only accepts Python literals, so text coming from the CSV
        # cannot execute arbitrary code (unlike eval).
        return ast.literal_eval(cell)

    matches = gt_df.loc[gt_df["file"] == selected_file, ["recovery", "drop"]]
    if matches.empty:
        raise IndexError(f"No ground-truth row found for file {selected_file!r}")
    # If several rows match, the first one is used (same as before).
    recovery_cell, drop_cell = matches.iloc[0]
    recovery_intervals = _parse_intervals(recovery_cell)
    drop_intervals = _parse_intervals(drop_cell)

    fig = px.line(df, x="time", y="pressure", title=f"Visualization of {selected_file} (Reduced Data)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # Loop-invariant levels: recovery bars sit at the top of the curve, drop bars at the bottom.
    pressure_max = df["pressure"].max()
    pressure_min = df["pressure"].min()
    event_styles = (
        (recovery_intervals, "green", "Recovery", pressure_max),
        (drop_intervals, "red", "Drop", pressure_min),
    )

    for intervals, color, label, y_level in event_styles:
        for idx, (start, end) in enumerate(intervals):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=label)
            fig.add_trace(go.Scatter(
                x=[start, end],
                y=[y_level, y_level],
                mode="lines",
                line=dict(color=color, width=4),
                name=f"{label} (Bold)",
                # One legend entry per event type instead of one per interval;
                # toggling it shows/hides every bar of that type.
                legendgroup=label,
                showlegend=(idx == 0),
            ))

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

## Функция для отрисовки графика неразмеченных данных

In [70]:
def plot_data(df, selected_file=None):
    """Plot the pressure curve of a data set without annotations.

    Args:
        df: DataFrame with "time" and "pressure" columns to draw.
        selected_file: Optional name shown in the figure title. When omitted,
            a notebook-level ``selected_file`` variable is used if one exists
            (this keeps existing ``plot_data(df)`` calls rendering the same
            title); otherwise a generic title is used.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if selected_file is None:
        # The function used to read the global `selected_file` implicitly and
        # raised NameError when it was not defined; look it up defensively.
        selected_file = globals().get("selected_file")

    if selected_file is not None:
        title = f"Visualization of {selected_file} (Reduced Data)"
    else:
        title = "Visualization (Reduced Data)"

    fig = px.line(df, x="time", y="pressure", title=title, markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
        )

    fig.show()

# EDA

## Анализ размеченных данных

In [71]:
# Locations of the annotation table and of the directory with the training files.
ground_truth_path = "../src/raw_data/ground_truth.csv"
train_dir = "../src/raw_data/train"

# Load the annotation table; its "file" column is used below as a file name
# inside train_dir.
gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].to_list()

In [72]:
# Use the first file listed in the ground truth as the working example
# and build its path inside the training directory.
selected_file = annotated_files[0]
file_path = os.path.join(
    train_dir,
    selected_file,
)

In [73]:
# Read the file as a headerless table split on runs of whitespace and
# name its two columns explicitly.
df = pd.read_csv(
    file_path,
    sep="\\s+",
    header=None,
    names=["time", "pressure"],
)

### Понижение размерности

Отклонение соседних записей:

In [74]:
# Absolute pressure change between each record and the one before it.
# diff() leaves NaN where there is no predecessor (the first record);
# dropna() removes those entries before taking the extremes.
pressure_diffs = df["pressure"].diff().abs().dropna()

min_pressure_diff, max_pressure_diff = pressure_diffs.min(), pressure_diffs.max()

print(f"Max diff: {max_pressure_diff}", f"Min diff: {min_pressure_diff}", sep="\n")

Max diff: 36.216525000000004
Min diff: 0.0


Записи, значение pressure которых отличается от значения предыдущей записи не более чем на threshold, будут удалены.

In [75]:
# A record is kept only if its pressure differs from the previous record's
# pressure by more than this amount.
threshold = 0.15

# Absolute pressure change relative to the preceding record of the original data.
pressure_step = df["pressure"].diff().abs()

# A comparison with NaN evaluates to False (it does not stay NaN), so calling
# .fillna(True) on the comparison result never fired and the first record,
# which has no predecessor, was always dropped. Keep NaN steps explicitly.
keep_mask = pressure_step.isna() | (pressure_step > threshold)

df_reduced = df.loc[keep_mask]

Анализ графиков

In [77]:
# Draw the reduced series of the selected file together with its annotated intervals.
plot_annotated_data(df=df_reduced, gt_df=gt_df, selected_file=selected_file)