In [2]:
import os
import ast
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

import warnings 
warnings.filterwarnings('ignore')

# Пути и основные датасеты

In [3]:
# Locations of the stored models and of the raw data, relative to the notebook.
models_dir = "../models"
train_dir = "../src/raw_data/train"
test_dir = "../src/raw_data/test"
ground_truth_path = "../src/raw_data/ground_truth.csv"

# Ground-truth table (semicolon-separated); its "file" column names the annotated files.
gt_df = pd.read_csv(ground_truth_path, sep=";")
annotated_files = gt_df["file"].to_list()

# Вспомогательные функции

## Функция отрисовки размеченных данных

In [4]:
def plot_annotated_data(df, selected_file):
    """Plot the pressure curve of one file and highlight its labelled intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``time``, ``pressure`` and ``pattern``.
        Consecutive rows with ``pattern == 1`` are shaded as "Recovery",
        consecutive rows with ``pattern == 2`` as "Drop"; any other value
        is left unshaded.
    selected_file : str
        File name; used only in the figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    times = df["time"].tolist()
    patterns = df["pattern"].tolist()

    def intervals_for(label):
        """Return (start, end) time pairs for every consecutive run of `label`."""
        intervals = []
        run_start = None
        for moment, value in zip(times, patterns):
            if value == label:
                if run_start is None:
                    run_start = moment
            elif run_start is not None:
                # A run ends at the time of the first row carrying another label.
                intervals.append((run_start, moment))
                run_start = None
        if run_start is not None:
            # The run lasts until the final row: close it at the last timestamp
            # so that it is drawn as well instead of being silently dropped.
            intervals.append((run_start, times[-1]))
        return intervals

    def highlight(intervals, level, color, name):
        """Shade each interval and mark it with a horizontal bar at `level`."""
        for position, (start, end) in enumerate(intervals):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=name)
            fig.add_trace(go.Scatter(x=[start, end], y=[level, level],
                                     mode="lines", line=dict(color=color, width=4), name=name,
                                     # One legend entry per interval type instead of one per interval.
                                     legendgroup=name, showlegend=(position == 0)))

    # Recovery bars sit at the top of the curve, drop bars at the bottom.
    highlight(intervals_for(1), df["pressure"].max(), "green", "Recovery")
    highlight(intervals_for(2), df["pressure"].min(), "red", "Drop")

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
    )

    fig.show()

## Функция отрисовки предсказанных данных

In [5]:
def plot_predicted_data(df, file_name):
    """Plot the pressure curve with both labelled and predicted intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``time``, ``pressure``, ``pattern`` (ground
        truth) and ``prediction`` (model output). In both label columns the
        value 1 is drawn as a recovery interval and 2 as a drop interval;
        any other value is left unshaded.
    file_name : str
        File name; used only in the figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {file_name} (предсказанные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    times = df["time"].tolist()

    def intervals_for(column, label):
        """Return (start, end) time pairs for every consecutive run of `label` in `column`."""
        intervals = []
        run_start = None
        for moment, value in zip(times, df[column].tolist()):
            if value == label:
                if run_start is None:
                    run_start = moment
            elif run_start is not None:
                # A run ends at the time of the first row carrying another label.
                intervals.append((run_start, moment))
                run_start = None
        if run_start is not None:
            # The run lasts until the final row: close it at the last timestamp
            # so that it is drawn as well instead of being silently dropped.
            intervals.append((run_start, times[-1]))
        return intervals

    def highlight(intervals, level, color, name, line_style):
        """Shade each interval and mark it with a horizontal bar at `level`."""
        for position, (start, end) in enumerate(intervals):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=name)
            fig.add_trace(go.Scatter(x=[start, end], y=[level, level],
                                     mode="lines", line=dict(color=color, **line_style), name=name,
                                     # One legend entry per interval type instead of one per interval.
                                     legendgroup=name, showlegend=(position == 0)))

    top = df["pressure"].max()
    bottom = df["pressure"].min()
    solid = dict(width=4)                 # ground truth
    dashed = dict(width=2, dash="dash")   # model prediction

    # Same drawing order as before: ground truth first, predictions on top.
    highlight(intervals_for("pattern", 1), top, "lightgreen", "Recovery", solid)
    highlight(intervals_for("pattern", 2), bottom, "red", "Drop", solid)
    highlight(intervals_for("prediction", 1), top, "darkgreen", "Predicted Recovery", dashed)
    highlight(intervals_for("prediction", 2), bottom, "orange", "Predicted Drop", dashed)

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
    )

    fig.show()

# Импорт моделей

In [6]:
from catboost import CatBoostClassifier

# Restore the stored CatBoost classifier from the models directory.
cb_model_path = os.path.join(models_dir, "catboost_model_2.bin")
cb_model = CatBoostClassifier()
cb_model.load_model(cb_model_path)

<catboost.core.CatBoostClassifier at 0x10a423c40>

# Анализ предсказаний

In [7]:
test_files = []
test_dfs = []
window = 5  # number of lagged pressure columns added to every frame

# Prepare one labelled feature frame for every ground-truth file found in test_dir.
for file, recovery_raw, drop_raw in zip(gt_df["file"], gt_df["recovery"], gt_df["drop"]):
    path = os.path.join(test_dir, file)
    if not os.path.exists(path):
        continue

    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(path, sep=r"\s+", header=None, names=["time", "pressure"])

    # Interval lists are stored as Python literals; an empty CSV cell is read
    # as NaN (not a string) and is treated as "no intervals".
    recovery_range = ast.literal_eval(recovery_raw) if isinstance(recovery_raw, str) else []
    drop_range = ast.literal_eval(drop_raw) if isinstance(drop_raw, str) else []

    # Pressure is rescaled to [0, 1] separately for every file.
    scaler = MinMaxScaler()
    df[['pressure']] = scaler.fit_transform(df[['pressure']])

    # 0 = no pattern, 1 = recovery, 2 = drop (drop wins where intervals overlap).
    df['pattern'] = 0

    try:
        for start, end in recovery_range:
            df.loc[(df['time'] >= start) & (df['time'] <= end), 'pattern'] = 1
    except ValueError:
        # Raised when an entry cannot be unpacked into (start, end).
        print("No recovery pattern")

    try:
        for start, end in drop_range:
            df.loc[(df['time'] >= start) & (df['time'] <= end), 'pattern'] = 2
    except ValueError:
        print("No drop pattern")

    # Lag features; `lag` no longer reuses the name of an enclosing loop variable.
    for lag in range(1, window + 1):
        df[f'pressure_lag_{lag}'] = df['pressure'].shift(lag, fill_value=0)

    # Append name and frame together so both lists always stay aligned.
    test_files.append(file)
    test_dfs.append(df)

## CatBoost

In [8]:
predicted_dfs = []

# Score every prepared test frame with the CatBoost model.
for file_name, df in zip(test_files, test_dfs):
    x = df.drop(columns="pattern")
    y = df["pattern"]

    # Flattened in case predict returns a column-shaped (n, 1) array.
    y_pred = cb_model.predict(x).ravel()

    # Label each report so it can be matched to its file.
    print(f"=== {file_name} ===")
    print(classification_report(y, y_pred))

    # Store the predictions on a copy: the frames in test_dfs stay untouched, so
    # re-running this cell does not feed an old "prediction" column to the model.
    predicted_dfs.append(df.assign(prediction=y_pred))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1233
           1       0.00      0.00      0.00         0
           2       1.00      0.17      0.29       107

    accuracy                           0.93      1340
   macro avg       0.66      0.39      0.42      1340
weighted avg       0.97      0.93      0.93      1340

              precision    recall  f1-score   support

           0       0.82      1.00      0.90     45059
           1       0.89      0.90      0.90       224
           2       0.56      0.00      0.00      9846

    accuracy                           0.82     55129
   macro avg       0.76      0.63      0.60     55129
weighted avg       0.77      0.82      0.74     55129

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     49605
           1       0.98      0.83      0.90      1961
           2       0.31      0.36      0.33      2649

    accuracy        

In [9]:
# Draw one figure per evaluated file, pairing each frame with its file name.
for shown_name, predicted_df in zip(test_files, predicted_dfs):
    plot_predicted_data(predicted_df, shown_name)

In [10]:
# Display the name of the third processed test file (index 2 of test_files).
test_files[2]

'1cbce6e5-9f0b-419f-9527-7add4e255217'