In [2]:
import ast
import warnings
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [3]:
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so pandas/sklearn warnings raised by
# the cells below will not be shown.
warnings.filterwarnings('ignore')

# CSV whose 'file' column lists the data files to process.
SUBMIT_INPUT_PATH = "../src/raw_data/submit.csv"
# CSV written by process_all_files with the detected intervals per file.
SUBMIT_OUTPUT_PATH = "../src/submit4.csv"
# Directory containing the per-file time/pressure series read by process_file.
DATA_PATH = "../src/test_denoised"

In [4]:
def _bridge_short_gaps(labels, gap_threshold):
    """Fill short unlabelled gaps between rows that carry the same non-zero label.

    For every pair of consecutive non-zero rows that share a label and are at
    most ``gap_threshold`` rows apart, the zero rows between them receive that
    label. The input is not modified; a new array is returned.
    """
    labels = np.asarray(labels)
    bridged = labels.copy()
    marked = np.flatnonzero(labels)
    if marked.size < 2:
        return bridged

    left, right = marked[:-1], marked[1:]
    span = right - left
    # span == 1 means the rows are adjacent, so there is nothing to fill.
    fillable = (span > 1) & (span <= gap_threshold) & (labels[left] == labels[right])
    for start, stop in zip(left[fillable], right[fillable]):
        bridged[start:stop] = labels[stop]
    return bridged


def _merge_intervals(segments, gap_threshold):
    """Join consecutive ``[start, end]`` intervals whose gap is at most ``gap_threshold``.

    ``segments`` is expected in ascending order; the gap is measured in the
    same units as the interval bounds. A new list of new pairs is returned.
    """
    merged = []
    for start, end in segments:
        if merged and start - merged[-1][1] <= gap_threshold:
            merged[-1][1] = end
        else:
            merged.append([start, end])
    return merged


def process_file(filename, data_path, *, eps=0.0005, min_samples=20,
                 min_noise_run=40, slope_window=15, min_segment_len=50,
                 row_gap=50, edge_margin=20, pressure_level=0.75, time_gap=50):
    """Detect pressure "drop" and "recovery" intervals in one time/pressure file.

    Pipeline:
      1. Read the whitespace-separated file as columns ``time`` and ``pressure``.
      2. Standardise both columns and run DBSCAN; rows DBSCAN labels ``-1``
         (noise) become candidates (``cluster == 1``).
      3. Keep only candidate runs of at least ``min_noise_run`` consecutive rows.
      4. Label candidates by the sign of the smoothed pressure difference:
         ``1`` where it is negative, ``2`` where it is positive.
      5. Zero out runs shorter than ``min_segment_len`` rows, bridge gaps of at
         most ``row_gap`` rows between equal labels, and extend a labelled run
         to the start/end of the file when it lies within ``edge_margin`` rows
         of it.
      6. Emit a run as a drop (label 1) if pressure starts above and ends below
         ``pressure_level``; as a recovery (label 2) for the opposite crossing.
      7. Merge emitted intervals separated by at most ``time_gap`` time units.

    Parameters
    ----------
    filename : str
        File name inside ``data_path``.
    data_path : str
        Directory containing the file.
    eps, min_samples : float, int
        Passed to ``DBSCAN``.
    min_noise_run, slope_window, min_segment_len, row_gap, edge_margin : int
        Row-count thresholds described above.
    pressure_level : float
        Pressure value a segment must cross to be reported.
    time_gap : float
        Maximum gap, in ``time`` units, for merging reported intervals.

    Returns
    -------
    tuple
        ``(filename, recovery_intervals, drop_intervals, df)`` where the
        interval lists hold ``[start_time, end_time]`` pairs of plain Python
        numbers and ``df`` is the working frame including ``final_cluster``.
    """
    df = pd.read_csv(f'{data_path}/{filename}', names=['time', 'pressure'], sep='\\s+')

    scaler = StandardScaler()
    df[['time_scaled', 'pressure_scaled']] = scaler.fit_transform(df[['time', 'pressure']])

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    raw_labels = dbscan.fit_predict(df[['time_scaled', 'pressure_scaled']])
    # DBSCAN marks noise with -1; those rows are the candidates of interest.
    df['cluster'] = (np.asarray(raw_labels) == -1).astype('int64')

    # Consecutive rows with the same flag share a segment id.
    df['segment_id'] = (df['cluster'] != df['cluster'].shift()).cumsum()
    # Summing the 0/1 flag per segment gives the run length for candidate runs.
    candidate_run_len = df.groupby('segment_id')['cluster'].transform('sum')
    df['clean_cluster'] = np.where(
        (df['cluster'] == 1) & (candidate_run_len >= min_noise_run), 1, 0
    )

    df['delta_pressure'] = df['pressure'].diff().rolling(slope_window, center=True).mean()
    is_candidate = df['clean_cluster'] == 1
    df['final_cluster'] = 0
    df.loc[is_candidate & (df['delta_pressure'] < 0), 'final_cluster'] = 1
    df.loc[is_candidate & (df['delta_pressure'] > 0), 'final_cluster'] = 2

    df['segment_id'] = (df['final_cluster'] != df['final_cluster'].shift()).cumsum()
    run_len = df.groupby('segment_id')['final_cluster'].transform('count')
    df.loc[run_len < min_segment_len, 'final_cluster'] = 0

    df['final_cluster'] = _bridge_short_gaps(df['final_cluster'].to_numpy(), row_gap)
    df['segment_id'] = (df['final_cluster'] != df['final_cluster'].shift()).cumsum()

    # Extend labelled runs that almost touch the file boundaries. The frame has
    # the default 0..n-1 index from read_csv, so positions equal index labels.
    labelled_rows = np.flatnonzero(df['final_cluster'].to_numpy())
    if labelled_rows.size:
        first, last = int(labelled_rows[0]), int(labelled_rows[-1])
        if 0 < first <= edge_margin:
            df.loc[:first - 1, 'final_cluster'] = df.loc[first, 'final_cluster']
            df.loc[:first - 1, 'segment_id'] = df.loc[first, 'segment_id']
        if last >= len(df) - edge_margin:
            df.loc[last + 1:, 'final_cluster'] = df.loc[last, 'final_cluster']
            df.loc[last + 1:, 'segment_id'] = df.loc[last, 'segment_id']

    drop_segments = []
    recovery_segments = []

    for _, group in df.groupby('segment_id'):
        cluster_type = group['final_cluster'].iloc[0]
        if cluster_type == 0:
            continue

        # Read the boundary values from the segment's own first/last rows and
        # convert them to plain Python numbers so the intervals serialise as
        # bare literals regardless of the installed NumPy version.
        start_time, end_time = group['time'].iloc[[0, -1]].tolist()
        start_pressure, end_pressure = group['pressure'].iloc[[0, -1]].tolist()

        if cluster_type == 1 and start_pressure > pressure_level and end_pressure < pressure_level:
            drop_segments.append([start_time, end_time])
        elif cluster_type == 2 and end_pressure > pressure_level and start_pressure < pressure_level:
            recovery_segments.append([start_time, end_time])

    merged_drop_segments = _merge_intervals(drop_segments, time_gap)
    merged_recovery_segments = _merge_intervals(recovery_segments, time_gap)

    return filename, merged_recovery_segments, merged_drop_segments, df

def process_all_files(input_path, data_path, output_path):
    """Run ``process_file`` on every file listed in a CSV and save the intervals.

    Parameters
    ----------
    input_path : str
        CSV with a ``file`` column naming the files to process.
    data_path : str
        Directory passed through to ``process_file``.
    output_path : str
        Destination CSV with columns ``file``, ``recovery`` and ``drop``; the
        latter two hold the textual form of lists of ``[start, end]`` pairs.
    """
    def _plain(value):
        # numpy scalars print as e.g. ``np.float64(1.5)`` under NumPy >= 2,
        # which ast.literal_eval cannot read back; store Python numbers instead.
        return value.item() if isinstance(value, np.generic) else value

    def _plain_intervals(intervals):
        return [[_plain(bound) for bound in interval] for interval in intervals]

    submit_df = pd.read_csv(input_path)
    results = []

    for file in submit_df['file']:
        # The per-row working frame is only needed for plotting, not here.
        filename, recovery, drop, _ = process_file(file, data_path)
        results.append({
            'file': filename,
            'recovery': _plain_intervals(recovery),
            'drop': _plain_intervals(drop),
        })

    output_df = pd.DataFrame(results)
    output_df.to_csv(output_path, index=False)

In [5]:
# Process every file listed in the input CSV and write the detected intervals.
process_all_files(
    input_path=SUBMIT_INPUT_PATH,
    data_path=DATA_PATH,
    output_path=SUBMIT_OUTPUT_PATH,
)

# Визуализация 

## Функции отрисовки

In [6]:
def plot_data(clust_df, filename):
    """Scatter-plot pressure over time, coloured by the ``final_cluster`` label.

    Parameters
    ----------
    clust_df : pandas.DataFrame
        Frame with ``time``, ``pressure`` and ``final_cluster`` columns. It is
        not modified.
    filename : str
        Name shown in the figure title.
    """
    # rename() without inplace returns a new frame, so the caller's data (and
    # the column slice taken from it) is never mutated.
    points = clust_df[['time', 'pressure', 'final_cluster']].rename(
        columns={'final_cluster': 'cluster'}
    )

    fig = go.Figure()
    colors = {0: "blue", 1: "red", 2: "green"}

    # One trace per label, in order of first appearance.
    for cluster in points["cluster"].unique():
        cluster_data = points[points["cluster"] == cluster]
        marker_symbol = "circle" if cluster != 0 else "x"
        fig.add_trace(go.Scatter(
            x=cluster_data["time"],
            y=cluster_data["pressure"],
            mode="markers",
            marker=dict(size=6, symbol=marker_symbol, color=colors.get(cluster, "black")),
            name=f"Cluster {cluster}"
        ))

    fig.update_layout(
        title=f"DBSCAN Clustering ({filename})",
        xaxis_title="Time",
        yaxis_title="Pressure",
        showlegend=True
    )

    fig.show()

In [7]:
def plot_data_from_submit(clust_df, filename, submit_file):
    """Scatter-plot pressure over time, coloured by the intervals saved in a CSV.

    Parameters
    ----------
    clust_df : pandas.DataFrame
        Frame with ``time`` and ``pressure`` columns. It is not modified.
    filename : str
        Value to look up in the ``file`` column of ``submit_file``; also shown
        in the figure title.
    submit_file : str
        CSV with columns ``file``, ``recovery`` and ``drop``, where the latter
        two contain Python-literal lists of ``[start, end]`` pairs.

    Raises
    ------
    IndexError
        If ``filename`` has no row in ``submit_file``.
    """
    submit_df = pd.read_csv(submit_file)
    file_entry = submit_df[submit_df['file'] == filename]
    if file_entry.empty:
        raise IndexError(f"{filename!r} not found in the 'file' column of {submit_file!r}")

    row = file_entry.iloc[0]
    recovery_intervals = ast.literal_eval(row['recovery'])
    drop_intervals = ast.literal_eval(row['drop'])

    # Work on an explicit copy so adding a column never touches the caller's frame.
    points = clust_df[['time', 'pressure']].copy()
    points['cluster'] = 0

    for start, end in drop_intervals:
        points.loc[(points['time'] >= start) & (points['time'] <= end), 'cluster'] = 1

    # Recovery is applied last, so it wins where intervals overlap.
    for start, end in recovery_intervals:
        points.loc[(points['time'] >= start) & (points['time'] <= end), 'cluster'] = 2

    fig = go.Figure()
    colors = {0: "blue", 1: "red", 2: "green"}

    # One trace per label, in order of first appearance.
    for cluster in points["cluster"].unique():
        cluster_data = points[points["cluster"] == cluster]
        marker_symbol = "circle" if cluster != 0 else "x"
        fig.add_trace(go.Scatter(
            x=cluster_data["time"],
            y=cluster_data["pressure"],
            mode="markers",
            marker=dict(size=6, symbol=marker_symbol, color=colors.get(cluster, "black")),
            name=f"Cluster {cluster}"
        ))

    fig.update_layout(
        title=f"Intervals from submit ({filename})",
        xaxis_title="Time",
        yaxis_title="Pressure",
        showlegend=True
    )

    fig.show()

## Загрузка файла

Интересные файлы:

In [8]:
# Indices of interesting files in submit.csv: 0, 13, 37

In [9]:
# Pick one file from the input CSV by its row position and load its raw series.
labels_df = pd.read_csv(SUBMIT_INPUT_PATH, sep=',')
filename = labels_df['file'].iloc[37]  # 0 - 37

# Use the configured data directory instead of repeating the literal path.
df = pd.read_csv(f'{DATA_PATH}/{filename}', names=['time', 'pressure'], sep='\\s+')

## Сырая кластеризация

In [10]:
# Run the detection pipeline on the selected file and plot its final_cluster labels.
filename, recovery, drop, clust_df = process_file(filename=filename, data_path=DATA_PATH)
plot_data(clust_df=clust_df, filename=filename)

## Обработанная кластеризация

In [11]:
# Plot the same series coloured by the intervals stored in the output CSV.
plot_data_from_submit(
    clust_df=clust_df,
    filename=filename,
    submit_file=SUBMIT_OUTPUT_PATH,
)

## Все файлы:

In [None]:
# for filename in labels_df['file']:
#    df = pd.read_csv(f'../src/test_denoised/{filename}', names = ['time', 'pressure'], sep = '\\s+')
#    plot_data_from_submit(df, filename, SUBMIT_OUTPUT_PATH)
#    break