In [None]:
import ast
import os
import warnings

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
warnings.filterwarnings('ignore')
os.chdir('C:/Users/shara/Projects/2025_Siam-ML-Hack')

In [None]:
test_dfs = []
test_scalers = []
window = 5

def add_feature(row, time_offset):
    base_path = "src/raw_data"

    file_name = row.iloc[0]
    recovery_range = ast.literal_eval(row.iloc[1])
    drop_range = ast.literal_eval(row.iloc[2])

    train_path = os.path.join(base_path, "train", file_name)
    test_path = os.path.join(base_path, "test", file_name)

    file_path = train_path if os.path.exists(train_path) else test_path

    df = pd.read_csv(file_path, names=["time", "pressure"], header=None, sep="\t")

    scaler = MinMaxScaler()
    df[['pressure']] = scaler.fit_transform(df[['pressure']])

    df["time"] += time_offset
    df['pattern'] = 0

    for start, end in recovery_range:
        df.loc[(df['time'] >= start + time_offset) & (df['time'] <= end + time_offset), 'pattern'] = 1

    for start, end in drop_range:
        df.loc[(df['time'] >= start + time_offset) & (df['time'] <= end + time_offset), 'pattern'] = 2

    for i in range(1, window + 1):
        df[f'pressure_lag_{i}'] = df['pressure'].shift(i, fill_value=0)
    
    return df, scaler

ground_truth = pd.read_csv('src/raw_data/ground_truth.csv', sep=";")
ground_truth = ground_truth.drop(labels = "mark", axis = 1)
result_df, scaler = add_feature(ground_truth.iloc[0], 0)

last_time = result_df["time"].max()

for i in range(1, ground_truth.shape[0]):
    row = ground_truth.iloc[i]
    if os.path.exists("src/raw_data/test/" + row.iloc[0]):
        print(row.iloc[0])
        test_df, test_scaler = add_feature(row, last_time)
        test_dfs.append(test_df)
        test_scalers.append(test_scaler)
    else:
        changed_df, scaler = add_feature(row, last_time)
        result_df = pd.concat([result_df, changed_df], ignore_index=True)

    last_time = result_df["time"].max()

result_df

1c0e8d10-ba4a-499f-8159-bde6dc70b1c8
1c9db047-e335-46ac-8039-effd8589b25b
1cbce6e5-9f0b-419f-9527-7add4e255217


Unnamed: 0,time,pressure,pattern,pressure_lag_1,pressure_lag_2,pressure_lag_3,pressure_lag_4,pressure_lag_5
0,0.000000,0.297505,0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.026111,0.297505,0,0.297505,0.000000,0.000000,0.000000,0.000000
2,0.555000,0.297931,0,0.297505,0.297505,0.000000,0.000000,0.000000
3,0.589444,0.297931,0,0.297931,0.297505,0.297505,0.000000,0.000000
4,1.262500,0.297931,0,0.297931,0.297931,0.297505,0.297505,0.000000
...,...,...,...,...,...,...,...,...
1835607,915618.414166,0.113267,0,0.113267,0.113267,0.113267,0.113267,0.113267
1835608,915618.744721,0.113267,0,0.113267,0.113267,0.113267,0.113267,0.113267
1835609,915618.957499,0.113267,0,0.113267,0.113267,0.113267,0.113267,0.113267
1835610,915619.897499,0.113267,0,0.113267,0.113267,0.113267,0.113267,0.113267


In [None]:
X = result_df.drop(labels="pattern", axis=1)
y = result_df['pattern'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

In [None]:
catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    random_seed=42,
    boosting_type='Plain',
    grow_policy='Lossguide',
    task_type="GPU",
    bootstrap_type='Bayesian',
    sampling_frequency='PerTree',
    verbose=0
)

param_grid = {
    'iterations': [1000, 2000], 
    'learning_rate': [0.01, 0.03], 
    'l2_leaf_reg': [1, 5, 10], 
    'depth': [6, 8, 10],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)

grid_search = GridSearchCV(catboost_model, param_grid, cv=cv, scoring='precision_macro', n_jobs=1, verbose=1, error_score='raise')
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Лучшие параметры: {'depth': 8, 'iterations': 2000, 'l2_leaf_reg': 1, 'learning_rate': 0.03}


In [7]:
# old params - 1000, 0.01, 1, 10

# best_params = grid_search.best_params_
final_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.03,
    l2_leaf_reg=1,
    depth=8,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    random_seed=52,
    boosting_type='Plain',
    grow_policy='Lossguide',
    task_type="GPU",
    bootstrap_type='Bayesian',
    sampling_frequency='PerTree',
    verbose=0
)

final_model.fit(
    X_train, 
    y_train,
    eval_set=(X_test, y_test),  
    use_best_model=True
)

y_pred = final_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print(f"Precision: {precision:.4f}")
print(classification_report(y_test, y_pred))

Precision: 0.9762
              precision    recall  f1-score   support

           0       0.98      0.99      0.99    324026
           1       0.99      0.98      0.98     13739
           2       0.89      0.82      0.85     29358

    accuracy                           0.98    367123
   macro avg       0.95      0.93      0.94    367123
weighted avg       0.98      0.98      0.98    367123



In [13]:
ground_truth

Unnamed: 0,file,recovery,drop
0,00e03657-8e1e-4c8c-a724-1d3c77b48510,"[[2420.9805555555554, 2438.4241666666667], [31...","[[3454.6875, 3764.9605555555554]]"
1,00e4dba2-36d2-42b4-beb1-c55aed75f506,[],"[[13285.465, 19439.800555555557]]"
2,00f035b7-ad7a-4f30-9081-522a3c10805b,[],"[[0.0, 42.75]]"
3,01a0c034-6afc-4e73-95fa-621f702a0b7d,[],"[[0.0, 491.98305555555555]]"
4,01a530d3-6496-4515-9fbb-4f44e298fd29,[],"[[4921.376666666667, 6209.231666666667]]"
...,...,...,...
95,1dfaf03c-e297-4d92-a0bf-40b1a829391f,[],[]
96,1e149fbd-41c6-4779-b87d-c5dc17fbb4c0,[],"[[0.0, 635.3127777777778]]"
97,1e19b77c-8a0e-4749-a384-9c1e679035bf,[],[]
98,1e4b4c18-1e32-45eb-917a-5760e33fbaca,"[[9541.77638888889, 10288.5075]]","[[10339.343055555555, 10739.613055555556], [13..."


In [14]:
df_filtered = ground_truth[ground_truth['file'].isin(['1c9db047-e335-46ac-8039-effd8589b25b', '1cbce6e5-9f0b-419f-9527-7add4e255217'])]
df_filtered

Unnamed: 0,file,recovery,drop
80,1c9db047-e335-46ac-8039-effd8589b25b,"[[329.5966666666667, 341.3513888888889], [2354...","[[341.3513888888889, 2087.836388888889], [2386..."
84,1cbce6e5-9f0b-419f-9527-7add4e255217,"[[3187.110277777778, 3637.1241666666665]]","[[3637.1241666666665, 3768.15], [7493.51666666..."


In [None]:
test_y = test_df['pattern']
test_df = test_df[['time', 'pressure']]

y_pred = final_model.predict(test_df)
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45058
           1       0.94      0.77      0.85       222
           2       0.96      0.43      0.60      9849

    accuracy                           0.89     55129
   macro avg       0.93      0.73      0.80     55129
weighted avg       0.90      0.89      0.88     55129



In [122]:
test_df['pattern'] = y_pred
test_df

Unnamed: 0,time,pressure,pattern
0,0.000000,0.107558,0
1,0.000016,0.107735,0
2,0.000032,0.107676,0
3,0.000062,0.106853,0
4,0.000079,0.106677,0
...,...,...,...
55124,0.999899,0.042494,0
55125,0.999941,0.042494,0
55126,0.999959,0.042553,0
55127,0.999981,0.042612,0


In [19]:
test_df = test_df[test_df['pattern'] != 0]

In [26]:
test_df

Unnamed: 0,time,pressure,pattern
0,0.000000,0.107558,0
1,0.000016,0.107735,0
2,0.000032,0.107676,0
3,0.000062,0.106853,0
4,0.000079,0.106677,0
...,...,...,...
55124,0.999899,0.042494,0
55125,0.999941,0.042494,0
55126,0.999959,0.042553,0
55127,0.999981,0.042612,0


# Интерпретация предсказаний

In [28]:
test_df_original = test_df.copy()
test_df_original[["time", "pressure"]] = test_scaler.inverse_transform(test_df[["time", "pressure"]])
test_df_original["pattern"] = test_df["pattern"]
test_df_original

Unnamed: 0,time,pressure,pattern
0,0.000000,55.096654,0
1,0.206389,55.125689,0
2,0.408889,55.116010,0
3,0.800278,54.980513,0
4,1.013333,54.951478,0
...,...,...,...
55124,12814.318611,44.382679,0
55125,12814.860000,44.382679,0
55126,12815.085833,44.392357,0
55127,12815.366389,44.402036,0


In [None]:
def plot_annotated_data(df, selected_file):
    fig = px.line(df, x="time", y="pressure", title=f"Визуализация {selected_file} (размеченные данные)", markers=True)
    fig.update_layout(xaxis_title="Time", yaxis_title="Pressure", template="plotly_white")
    
    recovery_intervals = []
    drop_intervals = []
    
    prev_pattern = None
    start_time = None
    
    for i in range(len(df)):
        current_pattern = df.loc[df.index[i], "pattern"]
        current_time = df.loc[df.index[i], "time"]
        
        if current_pattern != prev_pattern:
            if prev_pattern == 1: 
                recovery_intervals.append((start_time, current_time))
            elif prev_pattern == 2:  
                drop_intervals.append((start_time, current_time))
            
            start_time = current_time
        
        prev_pattern = current_pattern
    
    for start, end in recovery_intervals:
        fig.add_vrect(x0=start, x1=end, fillcolor="green", opacity=0.3, layer="below", line_width=0, name="Recovery")
        fig.add_trace(go.Scatter(x=[start, end], y=[df["pressure"].max(), df["pressure"].max()],
                                    mode="lines", line=dict(color="green", width=4), name="Recovery (Bold)"))
    
    for start, end in drop_intervals:
        fig.add_vrect(x0=start, x1=end, fillcolor="red", opacity=0.3, layer="below", line_width=0, name="Drop")
        fig.add_trace(go.Scatter(x=[start, end], y=[df["pressure"].min(), df["pressure"].min()],
                                    mode="lines", line=dict(color="red", width=4), name="Drop (Bold)"))
    
    fig.update_layout(
        xaxis=dict(rangeslider=dict(visible=True), type="linear"),
        yaxis=dict(fixedrange=False),
        legend=dict(title="Legend", x=0.99, y=0.99, xanchor="right", yanchor="top")
    )
    
    fig.show()

In [32]:
plot_annotated_data(test_df, "scaled 1c9db047-e335-46ac-8039-effd8589b25b")

In [9]:
final_model.save_model("models/catboost_model2.bin")