In [7]:
import ast
import warnings
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [8]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment / SettingWithCopy
# warnings raised further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

# CSV whose 'file' column lists the files to process (read by build_submit).
SUBMIT_INPUT_PATH = "../src/raw_data/submit.csv"
# CSV that build_submit writes the detected intervals to.
SUBMIT_OUTPUT_PATH = "../src/submits/submit.csv"
# Directory holding the per-file whitespace-separated time/pressure series.
DATA_PATH = "../src/test_denoised"

# Кластеризация

In [111]:
def preprocess_data(filename, data_path, eps_value = 0.0005):
    """Load one pressure series and flag the points DBSCAN labels as noise.

    The file ``{data_path}/{filename}`` is read as header-less,
    whitespace-separated ``time pressure`` rows. Both columns are
    standardised and clustered with DBSCAN (``min_samples=20``).

    Args:
        filename: Name of the file inside ``data_path``.
        data_path: Directory containing the series files.
        eps_value: DBSCAN ``eps`` radius, in standardised units.

    Returns:
        DataFrame with columns ``time``, ``pressure``, ``time_scaled``,
        ``pressure_scaled`` and ``cluster``; ``cluster`` is 1 where DBSCAN
        returned the noise label (-1) and 0 everywhere else.
    """
    raw_columns = ['time', 'pressure']
    scaled_columns = ['time_scaled', 'pressure_scaled']

    df = pd.read_csv(f'{data_path}/{filename}', names=raw_columns, sep='\\s+')
    df[scaled_columns] = StandardScaler().fit_transform(df[raw_columns])

    labels = DBSCAN(eps=eps_value, min_samples=20).fit_predict(df[scaled_columns])
    # DBSCAN marks noise with -1; those are the points of interest downstream.
    df['cluster'] = (labels == -1).astype('int64')

    return df

## Функции постобработки:

In [123]:
# Cluster filtering.
# Note from the original author: the idea of also cutting every segment whose
# pressure change is below 0.2 (a couple of files have such segments) was
# tried here, but it does not work.
def filter_clusters(df):
    """Keep long runs of flagged points and label them by pressure direction.

    Expects ``cluster`` (0/1) and ``pressure`` columns. The frame is modified
    in place and also returned, with these columns added:

    * ``clean_cluster``  - 1 where ``cluster`` is 1 and the row belongs to a
      run of at least 40 consecutive equal ``cluster`` values, else 0.
    * ``delta_pressure`` - first difference of ``pressure``.
    * ``final_cluster``  - 1 for clean rows with a negative delta, 2 for clean
      rows with a positive delta, 0 otherwise (including zero/NaN deltas).
    * ``segment_id``     - running id of consecutive equal ``final_cluster``
      values.
    """
    flagged = df['cluster']

    # A new run starts wherever the flag differs from the previous row.
    df['segment_id'] = flagged.ne(flagged.shift()).cumsum()
    run_length = df.groupby('segment_id')['cluster'].transform('count')

    long_flagged_run = flagged.eq(1) & run_length.ge(40)
    df['clean_cluster'] = np.where(long_flagged_run, 1, 0)

    df['delta_pressure'] = df['pressure'].diff()
    is_clean = df['clean_cluster'].eq(1)
    falling = is_clean & df['delta_pressure'].lt(0)
    rising = is_clean & df['delta_pressure'].gt(0)

    df['final_cluster'] = 0
    df.loc[falling, 'final_cluster'] = 1
    df.loc[rising, 'final_cluster'] = 2

    direction = df['final_cluster']
    df['segment_id'] = direction.ne(direction.shift()).cumsum()

    return df



# Merge nearby clusters of the same kind; gaps longer than the row/time limits
# are not bridged, and small segments are discarded afterwards.
def merge_clusters(df, gap_threshold=50, max_gap_hours=100, min_cluster_size=20,
                   edge_tolerance=0.00015):
    """Bridge short gaps between same-type clusters, drop small ones, extend edges.

    The frame is modified in place and also returned. Rows are addressed by
    position, so any index works (the original required a default RangeIndex).

    Steps:
      1. For every pair of consecutive non-zero rows with the same label, fill
         the rows between them with that label when they are at most
         ``gap_threshold`` rows and ``max_gap_hours`` time units apart.
      2. Reset to 0 every run of equal labels shorter than ``min_cluster_size``.
      3. Walk forward once: a 0 row directly after a label-2 row joins it while
         pressure does not fall by more than ``edge_tolerance``; a 0 row
         directly after a label-1 row joins it while pressure does not rise by
         more than ``edge_tolerance``. The extension is sequential, so a
         cluster can grow by several rows.

    Args:
        df: Frame with ``time``, ``pressure`` and ``final_cluster`` (0/1/2).
        gap_threshold: Maximum distance, in rows, that may be bridged.
        max_gap_hours: Maximum distance, in units of ``time``, that may be
            bridged (presumably hours, judging by the name - confirm).
        min_cluster_size: Minimum number of rows a run must have to survive.
        edge_tolerance: Pressure slack used by the edge extension in step 3.

    Returns:
        The same frame with ``final_cluster`` and ``segment_id`` updated.
    """
    times = df['time'].to_numpy()
    pressure = df['pressure'].to_numpy()
    labels = df['final_cluster'].to_numpy()
    merged = labels.copy()

    # --- 1. bridge gaps between consecutive same-type labelled rows ---------
    prev_cluster = None
    prev_index = None
    for i in range(len(labels)):
        current_cluster = labels[i]
        if current_cluster == 0:
            continue

        # prev_index is always the last non-zero row, so every row strictly
        # between prev_index and i is 0; no further check of the gap is needed.
        if (prev_index is not None
                and prev_cluster == current_cluster
                and (i - prev_index) <= gap_threshold
                and (times[i] - times[prev_index]) <= max_gap_hours):
            merged[prev_index:i] = current_cluster

        prev_cluster = current_cluster
        prev_index = i

    df['final_cluster'] = merged

    # --- 2. discard runs that are too short ---------------------------------
    df['segment_id'] = (df['final_cluster'] != df['final_cluster'].shift()).cumsum()
    segment_sizes = df.groupby('segment_id')['final_cluster'].transform('count')
    df.loc[segment_sizes < min_cluster_size, 'final_cluster'] = 0

    # --- 3. extend cluster edges forward ------------------------------------
    # Work on a private array and assign it back once: writing through
    # df['final_cluster'][i] is chained assignment, which pandas Copy-on-Write
    # silently ignores.
    extended = df['final_cluster'].to_numpy().copy()
    for i in range(1, len(extended)):
        if extended[i] != 0:
            continue

        prev_cluster = extended[i - 1]
        if prev_cluster == 2 and pressure[i] >= pressure[i - 1] - edge_tolerance:
            extended[i] = 2
        elif prev_cluster == 1 and pressure[i] <= pressure[i - 1] + edge_tolerance:
            extended[i] = 1

    df['final_cluster'] = extended
    df['segment_id'] = (df['final_cluster'] != df['final_cluster'].shift()).cumsum()

    return df


def remove_outliers(df, pressure_threshold=0.3, time_threshold=40, return_tolerance=0.07):
    """Reset ``final_cluster`` to 0 around short pressure spikes.

    A window starts at row ``i`` and covers rows ``i .. i + time_threshold``.
    It is treated as an outlier when, using a centred 5-row rolling max/min of
    ``pressure``:

    * the largest rolling max in the window exceeds the pressure at row ``i``
      by more than ``pressure_threshold``;
    * the largest rolling max exceeds the smallest rolling min in the window
      by more than ``pressure_threshold``;
    * the pressure at the last row of the window is within
      ``return_tolerance`` of the pressure at row ``i``.

    In other words: a sharp rise followed by a return to roughly the starting
    level within a short span. Every row whose ``time`` lies between the first
    and last time of an outlier window gets ``final_cluster = 0``.

    The frame is modified in place and also returned. No helper columns are
    written to it, so existing columns are never clobbered.

    Args:
        df: Frame with ``time``, ``pressure`` and ``final_cluster`` columns.
        pressure_threshold: Minimum rise that counts as a spike.
        time_threshold: Window length, in rows (despite the name).
        return_tolerance: Maximum difference between the first and last
            pressure of the window.

    Returns:
        The same frame with ``final_cluster`` updated.
    """
    n_rows = len(df)
    if n_rows <= time_threshold:
        # No complete window fits into the series; nothing can be flagged.
        return df

    pressure = df['pressure']
    smooth_max = pressure.rolling(window=5, center=True).max()
    smooth_min = pressure.rolling(window=5, center=True).min()

    # Forward-looking extrema over rows i .. i + time_threshold: a trailing
    # rolling window evaluated at row i + time_threshold, shifted back to i.
    # min_periods=1 skips the NaN edges of the centred rolling, as
    # Series.max()/min() on a slice would.
    span = time_threshold + 1
    window_max = smooth_max.rolling(window=span, min_periods=1).max().shift(-time_threshold)
    window_min = smooth_min.rolling(window=span, min_periods=1).min().shift(-time_threshold)
    end_pressure = pressure.shift(-time_threshold)

    # Rows without a complete window hold NaN after the shift, and every
    # comparison with NaN is False, so they are never flagged.
    is_spike_start = (
        (window_max - pressure > pressure_threshold)
        & (window_max - window_min > pressure_threshold)
        & ((end_pressure - pressure).abs() < return_tolerance)
    )

    spike_starts = np.flatnonzero(is_spike_start.to_numpy())
    if spike_starts.size:
        times = df['time'].to_numpy()
        inside_spike = np.zeros(n_rows, dtype=bool)
        for pos in spike_starts:
            inside_spike |= (times >= times[pos]) & (times <= times[pos + time_threshold])
        df.loc[inside_spike, 'final_cluster'] = 0

    return df


def extract_segments(df, min_pressure_range=0.2, pressure_level=0.7):
    """Collect the time intervals of pressure drops and recoveries.

    Segments (rows sharing a ``segment_id``) whose pressure range
    ``max - min`` is below ``min_pressure_range`` get ``final_cluster = 0``
    in place. Of the remaining segments:

    * type 1 that starts above ``pressure_level`` and ends below it is
      reported as a drop;
    * type 2 that starts below ``pressure_level`` and ends above it is
      reported as a recovery.

    Start/end pressures are taken from the segment's own first and last rows,
    so duplicated timestamps elsewhere in the frame cannot select a wrong row.
    Times are returned as native Python numbers so that the lists keep a
    plain ``[[1.0, 2.0]]`` text form when written to CSV (NumPy >= 2 scalars
    would render as ``np.float64(...)``, which ``ast.literal_eval`` rejects).

    Args:
        df: Frame with ``time``, ``pressure``, ``final_cluster`` and
            ``segment_id`` columns.
        min_pressure_range: Minimum pressure range a segment must span.
        pressure_level: Level the pressure has to cross within the segment.

    Returns:
        ``(drop_segments, recovery_segments)``, each a list of
        ``[start_time, end_time]`` pairs.
    """
    drop_segments, recovery_segments = [], []

    pressure_by_segment = df.groupby('segment_id')['pressure']
    segment_ranges = pressure_by_segment.max() - pressure_by_segment.min()

    small_segments = segment_ranges[segment_ranges < min_pressure_range].index
    df.loc[df['segment_id'].isin(small_segments), 'final_cluster'] = 0

    for _, group in df.groupby('segment_id'):
        cluster_type = group['final_cluster'].iloc[0]
        if cluster_type not in (1, 2):
            continue

        # tolist() converts NumPy scalars to native Python numbers.
        start_time, end_time = group['time'].iloc[[0, -1]].tolist()
        start_pressure = group['pressure'].iloc[0]
        end_pressure = group['pressure'].iloc[-1]

        if cluster_type == 1 and start_pressure > pressure_level and end_pressure < pressure_level:
            drop_segments.append([start_time, end_time])
        elif cluster_type == 2 and start_pressure < pressure_level and end_pressure > pressure_level:
            recovery_segments.append([start_time, end_time])

    return drop_segments, recovery_segments


## Обработка файла: кластеризация -> постобработка (фильтрация -> объединение кластеров -> обработка краевых точек -> выделение паттернов)

In [124]:
def process_file(filename, data_path):
    """Run the complete detection pipeline on one file.

    Stages, in order: ``preprocess_data`` -> ``filter_clusters`` ->
    ``merge_clusters`` (``gap_threshold=100``, ``max_gap_hours=200``) ->
    ``remove_outliers`` -> ``extract_segments``.

    Returns:
        ``(filename, recovery_segments, drop_segments, df)`` - note that
        recoveries come before drops; ``df`` is the fully labelled frame.
    """
    labelled = filter_clusters(preprocess_data(filename, data_path))
    merged = merge_clusters(labelled, gap_threshold=100, max_gap_hours=200)
    cleaned = remove_outliers(merged)
    drops, recoveries = extract_segments(cleaned)

    return filename, recoveries, drops, cleaned

In [125]:
def process_file_without_postfilters(filename, data_path):
    """Run the pipeline without the merge and outlier-removal stages.

    Only ``preprocess_data`` -> ``filter_clusters`` -> ``extract_segments``
    are applied, which shows the labelling before post-filtering.

    Returns:
        ``(filename, recovery_segments, drop_segments, df)`` - recoveries
        come before drops.
    """
    labelled = filter_clusters(preprocess_data(filename, data_path))
    drops, recoveries = extract_segments(labelled)

    return filename, recoveries, drops, labelled

# Инференс

In [126]:
def build_submit(input_path, data_path, output_path):
    """Process every file listed in the submission template and write the result.

    Args:
        input_path: CSV with a ``file`` column naming the files to process.
        data_path: Directory the files are read from.
        output_path: Destination CSV with columns ``file``, ``recovery`` and
            ``drop``; the two interval columns hold lists of
            ``[start_time, end_time]`` pairs.
    """
    submit_df = pd.read_csv(input_path)

    results = []
    for file in submit_df['file']:
        # The labelled frame returned last is not needed for the submission.
        filename, recovery, drop, _ = process_file(file, data_path)
        results.append({'file': filename, 'recovery': recovery, 'drop': drop})

    # Fixing the columns keeps the header intact even when no file was listed.
    output_df = pd.DataFrame(results, columns=['file', 'recovery', 'drop'])
    output_df.to_csv(output_path, index=False)

In [127]:
# Run the pipeline over every file listed in SUBMIT_INPUT_PATH and write the
# detected intervals to SUBMIT_OUTPUT_PATH.
build_submit(SUBMIT_INPUT_PATH, DATA_PATH, SUBMIT_OUTPUT_PATH)

# Визуализация 

## Функции отрисовки

In [128]:
def plot_data(clust_df, filename):
    """Scatter-plot pressure over time, coloured by ``final_cluster``.

    Label 0 is drawn as blue crosses, 1 as red circles and 2 as green
    circles; any other label is drawn in black.

    Args:
        clust_df: Frame with ``time``, ``pressure`` and ``final_cluster``
            columns; it is not modified.
        filename: Shown in the figure title.
    """
    # rename() without inplace builds a new frame, so the caller's data is
    # never touched and no SettingWithCopyWarning is raised.
    plot_df = clust_df[['time', 'pressure', 'final_cluster']].rename(
        columns={'final_cluster': 'cluster'}
    )

    fig = go.Figure()
    colors = {0: "blue", 1: "red", 2: "green"}

    for cluster in plot_df["cluster"].unique():
        cluster_data = plot_df[plot_df["cluster"] == cluster]
        marker_symbol = "circle" if cluster != 0 else "x"
        fig.add_trace(go.Scatter(
            x=cluster_data["time"],
            y=cluster_data["pressure"],
            mode="markers",
            marker=dict(size=6, symbol=marker_symbol, color=colors.get(cluster, "black")),
            name=f"Cluster {cluster}"
        ))

    fig.update_layout(
        title=f"DBSCAN Clustering ({filename})",
        xaxis_title="Time",
        yaxis_title="Pressure",
        showlegend=True
    )

    fig.show()

In [129]:
def plot_data_from_submit(clust_df, filename, submit_file):
    """Scatter-plot a series coloured by the intervals stored in a submission CSV.

    Rows whose ``time`` lies inside a ``drop`` interval are drawn as red
    circles (label 1), rows inside a ``recovery`` interval as green circles
    (label 2, which wins where intervals overlap), all other rows as blue
    crosses (label 0).

    Args:
        clust_df: Frame with ``time`` and ``pressure`` columns; it is not
            modified.
        filename: Value to look up in the ``file`` column of the submission;
            also shown in the figure title.
        submit_file: Path of the submission CSV. Its ``recovery`` and ``drop``
            cells are parsed with ``ast.literal_eval`` and must therefore be
            Python literals such as ``[[1.0, 2.0]]``.

    Raises:
        IndexError: If ``filename`` has no row in the submission file.
    """
    submit_df = pd.read_csv(submit_file)
    file_entry = submit_df[submit_df['file'] == filename]
    if file_entry.empty:
        # Same exception type as the bare .iloc[0] would raise, but readable.
        raise IndexError(f"{filename!r} not found in the 'file' column of {submit_file!r}")

    entry = file_entry.iloc[0]
    recovery_intervals = ast.literal_eval(entry['recovery'])
    drop_intervals = ast.literal_eval(entry['drop'])

    # assign() builds a new frame, so the column is never written onto a
    # slice of the caller's data.
    plot_df = clust_df[['time', 'pressure']].assign(cluster=0)

    for start, end in drop_intervals:
        plot_df.loc[plot_df['time'].between(start, end), 'cluster'] = 1

    for start, end in recovery_intervals:
        plot_df.loc[plot_df['time'].between(start, end), 'cluster'] = 2

    fig = go.Figure()
    colors = {0: "blue", 1: "red", 2: "green"}

    for cluster in plot_df["cluster"].unique():
        cluster_data = plot_df[plot_df["cluster"] == cluster]
        marker_symbol = "circle" if cluster != 0 else "x"
        fig.add_trace(go.Scatter(
            x=cluster_data["time"],
            y=cluster_data["pressure"],
            mode="markers",
            marker=dict(size=6, symbol=marker_symbol, color=colors.get(cluster, "black")),
            name=f"Cluster {cluster}"
        ))

    fig.update_layout(
        title=f"Intervals from submit ({filename})",
        xaxis_title="Time",
        yaxis_title="Pressure",
        showlegend=True
    )

    fig.show()

## Загрузка файла

In [130]:
# Reuse the notebook-level constants so the inspection cells always read the
# submission file that build_submit wrote and the data directory it used.
SUBMIT_PATH = SUBMIT_OUTPUT_PATH
labels_df = pd.read_csv(SUBMIT_PATH, sep = ',')
filename = labels_df['file'][27] # Range of file indexes: 0 - 37

# Raw series of the selected file (time, pressure), before any clustering.
df = pd.read_csv(f'{DATA_PATH}/{filename}', names = ['time', 'pressure'], sep = '\\s+')

## Сырая кластеризация

In [131]:
# Label the selected file without the merge / outlier post-filters and plot
# the resulting final_cluster values.
filename, recovery, drop, clust_df = process_file_without_postfilters(filename, DATA_PATH)
plot_data(clust_df, filename)

In [121]:
# Re-run the same file through the full pipeline (merge + outlier removal) and
# plot it; this overwrites recovery, drop and clust_df from the previous cell.
filename, recovery, drop, clust_df = process_file(filename, DATA_PATH)
plot_data(clust_df, filename)

## Обработанная кластеризация

In [122]:
# Colour the series by the intervals stored in the written submission file
# instead of the in-memory final_cluster labels.
plot_data_from_submit(clust_df, filename, SUBMIT_OUTPUT_PATH)

## Все файлы:

In [16]:
# for filename in labels_df['file']:
#    df = pd.read_csv(f'../src/test_denoised/{filename}', names = ['time', 'pressure'], sep = '\\s+')
#    plot_data_from_submit(df, filename, SUBMIT_OUTPUT_PATH)