In [27]:
import os
import ast
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

import warnings 
warnings.filterwarnings('ignore')

# Пути и основные датасеты

In [28]:
# Locations of the trained models and of the raw train/test series
models_dir = "../models"
test_dir = "../src/raw_data/test"
train_dir = "../src/raw_data/train"
ground_truth_path = "../src/raw_data/ground_truth.csv"

# Ground-truth table; its "file" column holds the names of the annotated files
gt_df = pd.read_csv(ground_truth_path)
annotated_files = gt_df["file"].to_list()

# Вспомогательные функции

## Функция отрисовки размеченных данных

In [29]:
def plot_annotated_data(df, selected_file):
    """Plot a pressure series and highlight its annotated recovery/drop intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "time", "pressure" and "pattern". Consecutive
        rows with ``pattern == 1`` are highlighted as recovery, rows with
        ``pattern == 2`` as drop; any other value is left unhighlighted.
    selected_file : str
        File name inserted into the figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # Plain lists: positional iteration is independent of the frame's index labels
    # and avoids one .loc lookup per row.
    times = df["time"].tolist()
    patterns = df["pattern"].tolist()

    def intervals_for(label):
        """Return (start, end) time spans of consecutive rows carrying `label`."""
        spans = []
        run_start = None
        for t, value in zip(times, patterns):
            if value == label:
                if run_start is None:
                    run_start = t
            elif run_start is not None:
                # A run ends at the time of the first row with a different label.
                spans.append((run_start, t))
                run_start = None
        if run_start is not None:
            # The run lasts until the end of the series: close it at the last
            # sample, otherwise the trailing interval would never be drawn.
            spans.append((run_start, times[-1]))
        return spans

    def highlight(spans, color, y_level, name):
        """Shade every span and draw a marker line at `y_level` for it."""
        for idx, (start, end) in enumerate(spans):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=name)
            fig.add_trace(go.Scatter(x=[start, end], y=[y_level, y_level],
                                     mode="lines", line=dict(color=color, width=4), name=name,
                                     # One legend entry per class instead of one per interval.
                                     legendgroup=name, showlegend=(idx == 0)))

    # Recovery markers sit at the top of the pressure range, drop markers at the bottom.
    highlight(intervals_for(1), "green", df["pressure"].max(), "Recovery")
    highlight(intervals_for(2), "red", df["pressure"].min(), "Drop")

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
    )

    fig.show()

## Функция отрисовки предсказанных данных

In [30]:
def plot_predicted_data(df, file_name):
    """Plot a pressure series with annotated and predicted intervals overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "time", "pressure", "pattern" (ground truth)
        and "prediction" (model output). In both label columns the value 1 is
        drawn as recovery and 2 as drop; any other value is left unhighlighted.
    file_name : str
        File name inserted into the figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {file_name} (предсказанные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")

    # Plain lists: positional iteration is independent of the frame's index labels
    # and avoids one .loc lookup per row.
    times = df["time"].tolist()

    def intervals_for(column, label):
        """Return (start, end) time spans of consecutive rows where df[column] == label."""
        spans = []
        run_start = None
        for t, value in zip(times, df[column].tolist()):
            if value == label:
                if run_start is None:
                    run_start = t
            elif run_start is not None:
                # A run ends at the time of the first row with a different label.
                spans.append((run_start, t))
                run_start = None
        if run_start is not None:
            # The run lasts until the end of the series: close it at the last
            # sample, otherwise the trailing interval would never be drawn.
            spans.append((run_start, times[-1]))
        return spans

    def highlight(spans, color, y_level, name, line_style):
        """Shade every span and draw a marker line at `y_level` for it."""
        for idx, (start, end) in enumerate(spans):
            fig.add_vrect(x0=start, x1=end, fillcolor=color, opacity=0.3, layer="below", line_width=0, name=name)
            fig.add_trace(go.Scatter(x=[start, end], y=[y_level, y_level],
                                     mode="lines", line=dict(color=color, **line_style), name=name,
                                     # One legend entry per class instead of one per interval.
                                     legendgroup=name, showlegend=(idx == 0)))

    # Recovery markers sit at the top of the pressure range, drop markers at the bottom.
    top = df["pressure"].max()
    bottom = df["pressure"].min()

    # Ground truth is drawn with thick solid lines, predictions with thin dashed ones.
    truth_style = dict(width=4)
    predicted_style = dict(width=2, dash="dash")

    highlight(intervals_for("pattern", 1), "lightgreen", top, "Recovery", truth_style)
    highlight(intervals_for("pattern", 2), "red", bottom, "Drop", truth_style)
    highlight(intervals_for("prediction", 1), "darkgreen", top, "Predicted Recovery", predicted_style)
    highlight(intervals_for("prediction", 2), "orange", bottom, "Predicted Drop", predicted_style)

    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
    )

    fig.show()

# Импорт моделей

In [31]:
from catboost import CatBoostClassifier

# Restore the trained classifier from the serialized model file in models_dir
catboost_model_path = os.path.join(models_dir, "catboost_model.bin")
cb_model = CatBoostClassifier()
# Kept as the cell's last expression so the loaded model is echoed as the cell output
cb_model.load_model(catboost_model_path)

<catboost.core.CatBoostClassifier at 0x179282a30>

# Анализ предсказаний

In [32]:
def _parse_intervals(raw):
    """Parse one ground-truth cell into a list of (start, end) pairs.

    pandas reads an empty CSV cell as NaN, which ``ast.literal_eval`` rejects
    with a ValueError; such a cell is treated as "no intervals".
    """
    if isinstance(raw, float) and pd.isna(raw):
        return []
    return ast.literal_eval(raw)


def _mark_pattern(df, scaler, intervals, label):
    """Set ``df["pattern"] = label`` on rows whose time lies inside any interval.

    `intervals` holds (start, end) pairs on the original time scale; they are
    mapped onto the normalized scale with the scaler fitted on this file.
    Both interval ends are inclusive.
    """
    for start, end in intervals:
        # Only the time column is used. The pressure value is a placeholder so the
        # input has the same two named features the scaler was fitted on.
        bounds = pd.DataFrame({"time": [start, end], "pressure": [0.0, 0.0]})
        norm_start, norm_end = scaler.transform(bounds)[:, 0]
        df.loc[df["time"].between(norm_start, norm_end), "pattern"] = label


# Parallel lists: test_dfs[k] is the labelled, normalized series of test_files[k].
test_files = []
test_dfs = []

for file, raw_recovery, raw_drop in zip(gt_df["file"], gt_df["recovery"], gt_df["drop"]):
    path = os.path.join(test_dir, file)

    # Skip ground-truth entries that have no matching file in test_dir.
    if not os.path.exists(path):
        continue

    df = pd.read_csv(path, sep=r"\s+", header=None, names=["time", "pressure"])

    recovery_range = _parse_intervals(raw_recovery)
    drop_range = _parse_intervals(raw_drop)

    # Each file is scaled independently to [0, 1] on both axes.
    scaler = MinMaxScaler()
    df[["time", "pressure"]] = scaler.fit_transform(df[["time", "pressure"]])

    # 0 = no pattern, 1 = recovery, 2 = drop (drop is applied last and wins on overlap).
    df["pattern"] = 0

    try:
        _mark_pattern(df, scaler, recovery_range, 1)
    except ValueError:
        print("No recovery pattern")

    try:
        _mark_pattern(df, scaler, drop_range, 2)
    except ValueError:
        print("No drop pattern")

    # Append both at the very end so the two lists can never get out of step.
    test_files.append(file)
    test_dfs.append(df)

In [33]:
# test_dfs only holds files found in test_dir, so the matching name must come from
# test_files (built in lockstep with test_dfs), not from the full ground-truth list.
plot_annotated_data(test_dfs[1], test_files[1])

## CatBoost

In [34]:
# Per-file CatBoost predictions; predicted_dfs[k] corresponds to test_files[k].
predicted_dfs = []

for file_name, df in zip(test_files, test_dfs):
    x = df[["time", "pressure"]]
    y = df["pattern"]

    # predict() may return a column vector of shape (n, 1) for multiclass models;
    # flatten it so the report and the new column both get a plain 1-D label vector.
    y_pred = cb_model.predict(x).ravel()

    # Label each report, otherwise the printed tables cannot be attributed to a file.
    print(f"=== {file_name} ===")
    print(classification_report(y, y_pred))

    df["prediction"] = y_pred
    # Store a snapshot: if "prediction" is later overwritten on the test_dfs frames,
    # the results collected here stay untouched.
    predicted_dfs.append(df.copy())


              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1233
           2       0.33      0.02      0.04       107

    accuracy                           0.92      1340
   macro avg       0.63      0.51      0.50      1340
weighted avg       0.87      0.92      0.88      1340

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45058
           1       0.94      0.77      0.85       222
           2       0.96      0.43      0.60      9849

    accuracy                           0.89     55129
   macro avg       0.93      0.73      0.80     55129
weighted avg       0.90      0.89      0.88     55129

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     49608
           1       0.97      0.92      0.94      1961
           2       0.91      0.47      0.62      2646

    accuracy                           0.97     54215
   macro avg       0

In [35]:
# predicted_dfs follows the order of test_files (files actually found in test_dir);
# indexing the full ground-truth list instead can put the wrong file name in the title.
for file_name, predicted_df in zip(test_files, predicted_dfs):
    plot_predicted_data(predicted_df, file_name)