In [14]:
import os
import warnings
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import ruptures as rpt
import ast
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [61]:
# NOTE(review): this silences every warning category for the whole session,
# including pandas/sklearn warnings that may point at real problems — confirm
# that a blanket filter is intended.
warnings.filterwarnings('ignore')

# CSV whose 'file' column lists the data files to process.
SUBMIT_INPUT_PATH = "../src/raw_data/submit.csv"
# CSV the detected recovery/drop intervals are written to.
SUBMIT_OUTPUT_PATH = "../src/submit3.csv"
# Directory holding the per-file time/pressure data.
DATA_PATH = "../src/test_denoised"

In [68]:
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def _merge_intervals(segments, gap_threshold=50):
    """Merge consecutive ``[start, end]`` intervals that lie close together.

    Intervals are taken in the order given. An interval is folded into the
    previously merged one unless its start lies more than ``gap_threshold``
    after that one's end. The input lists are not modified.

    Args:
        segments: Iterable of ``[start, end]`` pairs.
        gap_threshold: Largest gap (in the same units as the interval bounds)
            that is still bridged.

    Returns:
        A new list of ``[start, end]`` lists.
    """
    merged = []
    for start, end in segments:
        if not merged or start - merged[-1][1] > gap_threshold:
            # Fresh list so the caller's interval objects are never aliased.
            merged.append([start, end])
        else:
            # max() keeps the merged end from ever moving backwards.
            merged[-1][1] = max(merged[-1][1], end)
    return merged


def process_file(filename, data_path, eps=0.085, min_samples=20,
                 min_noise_run=40, min_label_run=5, gap_threshold=50):
    """Detect pressure-drop and pressure-recovery intervals in one data file.

    Pipeline:
      1. Read whitespace-separated ``time pressure`` rows and standardise
         both columns.
      2. Run DBSCAN on the standardised points; points it labels ``-1``
         (noise) get ``cluster == 1``, all others ``0``.
      3. Keep noise only where it forms a consecutive run of at least
         ``min_noise_run`` rows (``clean_cluster``).
      4. Label the kept rows by the sign of the smoothed pressure difference:
         ``1`` for negative (drop), ``2`` for positive (recovery), ``0``
         otherwise (``final_cluster``). Runs of equal labels shorter than
         ``min_label_run`` rows are reset to ``0``.
      5. For every consecutive run of ``cluster``, report
         ``[first time, last time]`` as a drop or recovery interval according
         to the ``final_cluster`` of the run's first row, then merge
         neighbouring intervals.

    Args:
        filename: Name of the data file inside ``data_path``.
        data_path: Directory that contains ``filename``.
        eps: DBSCAN neighbourhood radius (in standardised units).
        min_samples: DBSCAN ``min_samples``.
        min_noise_run: Minimum length of a consecutive noise run to keep.
        min_label_run: Minimum length of a consecutive label run to keep.
        gap_threshold: Largest time gap bridged when merging intervals.

    Returns:
        Tuple ``(filename, recovery_intervals, drop_intervals, df)`` where the
        intervals are lists of ``[start, end]`` native Python numbers and
        ``df`` is the annotated DataFrame (columns ``time``, ``pressure``,
        ``time_scaled``, ``pressure_scaled``, ``cluster``, ``segment_id``,
        ``clean_cluster``, ``delta_pressure``, ``final_cluster``).
    """
    df = pd.read_csv(f'{data_path}/{filename}', names=['time', 'pressure'], sep='\\s+')

    scaler = StandardScaler()
    df[['time_scaled', 'pressure_scaled']] = scaler.fit_transform(df[['time', 'pressure']])

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(df[['time_scaled', 'pressure_scaled']])
    # 1 marks DBSCAN noise points (label -1), 0 marks everything else.
    df['cluster'] = (labels == -1).astype(int)

    # Consecutive rows with the same flag share one segment id.
    df['segment_id'] = (df['cluster'] != df['cluster'].shift()).cumsum()
    # 'cluster' is 0/1, so the per-run sum is the run length for noise runs
    # and 0 for non-noise runs.
    noise_run_sizes = df.groupby('segment_id')['cluster'].transform('sum')
    df['clean_cluster'] = np.where(
        (df['cluster'] == 1) & (noise_run_sizes >= min_noise_run), 1, 0
    )

    # Centered 3-point mean of the first difference smooths the sign signal.
    df['delta_pressure'] = df['pressure'].diff().rolling(3, center=True).mean()
    is_kept_noise = df['clean_cluster'] == 1
    df['final_cluster'] = 0
    df.loc[is_kept_noise & (df['delta_pressure'] < 0), 'final_cluster'] = 1
    df.loc[is_kept_noise & (df['delta_pressure'] > 0), 'final_cluster'] = 2

    # Reset label runs that are too short to be meaningful.
    df['segment_id'] = (df['final_cluster'] != df['final_cluster'].shift()).cumsum()
    label_run_sizes = df.groupby('segment_id')['final_cluster'].transform('count')
    df.loc[label_run_sizes < min_label_run, 'final_cluster'] = 0

    # NOTE(review): runs are formed on 'cluster', not 'final_cluster', so a
    # noise run whose label changes part-way is reported whole under the label
    # of its first row — confirm this is intended.
    df['segment_id'] = (df['cluster'] != df['cluster'].shift()).cumsum()
    drop_segments = []
    recovery_segments = []

    for _, group in df.groupby('segment_id'):
        cluster_type = group['final_cluster'].iloc[0]
        # tolist() yields native Python numbers, so the intervals serialise as
        # plain literals regardless of the installed NumPy version.
        start_time, end_time = group['time'].iloc[[0, -1]].tolist()

        if cluster_type == 1:
            drop_segments.append([start_time, end_time])
        elif cluster_type == 2:
            recovery_segments.append([start_time, end_time])

    merged_drop_segments = _merge_intervals(drop_segments, gap_threshold)
    merged_recovery_segments = _merge_intervals(recovery_segments, gap_threshold)

    return filename, merged_recovery_segments, merged_drop_segments, df

def process_all_files(input_path, data_path, output_path):
    """Run ``process_file`` on every listed file and save the intervals as CSV.

    Args:
        input_path: CSV with a ``file`` column naming the files to process.
        data_path: Directory passed through to ``process_file``.
        output_path: Destination CSV with columns ``file``, ``recovery``
            and ``drop`` (written without the index).
    """
    file_names = pd.read_csv(input_path)['file']

    rows = []
    for name in file_names:
        # The annotated DataFrame (4th element) is not needed here.
        processed_name, recovery_intervals, drop_intervals, _ = process_file(name, data_path)
        rows.append({
            'file': processed_name,
            'recovery': recovery_intervals,
            'drop': drop_intervals,
        })

    pd.DataFrame(rows).to_csv(output_path, index=False)

In [69]:
# Process every file listed in the input CSV and write the result CSV.
process_all_files(
    input_path=SUBMIT_INPUT_PATH,
    data_path=DATA_PATH,
    output_path=SUBMIT_OUTPUT_PATH,
)

[[2339.244444, 2365.675]]


# Визуализация 

In [54]:
def plot_data(clust_df, filename):
    """Show an interactive scatter of pressure over time, coloured by label.

    Labels belonging to ``segment_id`` runs shorter than 5 rows are shown as
    0. Label 0 is drawn with blue "x" markers, 1 with red circles, 2 with
    green circles; any other label falls back to black circles.

    Args:
        clust_df: DataFrame with ``time``, ``pressure``, ``final_cluster`` and
            ``segment_id`` columns. It is not modified.
        filename: Used only in the figure title.
    """
    run_sizes = clust_df.groupby('segment_id')['final_cluster'].transform('count')

    # rename() returns a new frame, so the filtering below never touches the
    # caller's DataFrame and does not write into a column slice.
    plot_df = clust_df[['time', 'pressure', 'final_cluster']].rename(
        columns={'final_cluster': 'cluster'}
    )
    plot_df.loc[run_sizes < 5, 'cluster'] = 0

    fig = go.Figure()
    colors = {0: "blue", 1: "red", 2: "green"}

    # One trace per label so each gets its own legend entry and marker style.
    for cluster in plot_df["cluster"].unique():
        cluster_data = plot_df[plot_df["cluster"] == cluster]
        marker_symbol = "circle" if cluster != 0 else "x"
        fig.add_trace(go.Scatter(
            x=cluster_data["time"],
            y=cluster_data["pressure"],
            mode="markers",
            marker=dict(size=6, symbol=marker_symbol, color=colors.get(cluster, "black")),
            name=f"Cluster {cluster}"
        ))

    fig.update_layout(
        title=f"DBSCAN Clustering ({filename})",
        xaxis_title="Time",
        yaxis_title="Pressure",
        showlegend=True
    )

    fig.show()

In [64]:
# Pick the second file listed in the input CSV as a sample for visual inspection.
# Paths come from the config constants so this cell resolves the same files as
# the batch run, whatever the working directory.
labels_df = pd.read_csv(SUBMIT_INPUT_PATH, sep=',')
filename = labels_df['file'].iloc[1]

df = pd.read_csv(f'{DATA_PATH}/{filename}', names=['time', 'pressure'], sep='\\s+')

In [70]:
# Run the detection on the selected file, then plot its labelled points.
filename, recovery, drop, clust_df = process_file(filename=filename, data_path=DATA_PATH)
plot_data(clust_df=clust_df, filename=filename)

[[2339.244444, 2365.675]]
