In [54]:
import pandas as pd 
import ast
from sklearn.preprocessing import MinMaxScaler
import os
import warnings 
warnings.filterwarnings('ignore')


os.chdir('C:/Users/shara/Projects/2025_Siam-ML-Hack')

In [110]:

def add_feature(row):
    base_path = "src/raw_data"
    
    file_name = row.iloc[0]
    recovery_range = ast.literal_eval(row.iloc[1])
    drop_range = ast.literal_eval(row.iloc[2])

    train_path = os.path.join(base_path, "train", file_name)
    test_path = os.path.join(base_path, "test", file_name)

    if os.path.exists(train_path):
        file_path = train_path
    else: 
        file_path = test_path

    df = pd.read_csv(file_path, names=["time", "pressure"], header=None, sep="\t")

    scaler = MinMaxScaler()
    df[['time', 'pressure']] = scaler.fit_transform(df[['time', 'pressure']])

    df['pattern'] = 0

    for start, end in recovery_range:
        norm_start, norm_end = scaler.transform([[start, df['pressure'].min()]])[0][0], scaler.transform([[end, df['pressure'].min()]])[0][0]
        df.loc[(df['time'] >= norm_start) & (df['time'] <= norm_end), 'pattern'] = 1

    for start, end in drop_range:
        norm_start, norm_end = scaler.transform([[start, df['pressure'].min()]])[0][0], scaler.transform([[end, df['pressure'].min()]])[0][0]
        df.loc[(df['time'] >= norm_start) & (df['time'] <= norm_end), 'pattern'] = 2

    #count_nonzero = (df["pattern"] != 0).sum()
    #print(f"Количество строк с ненулевым значением: {count_nonzero}")
    #print(df)
    return df


ground_truth = pd.read_csv('src/raw_data/ground_truth.csv')
result_df = add_feature(ground_truth.iloc[0])
for i in range(1, ground_truth.shape[0]):
    row = ground_truth.iloc[i]
    if (row.iloc[0] == '1c9db047-e335-46ac-8039-effd8589b25b'):
        test_df = add_feature(row)
    changed_df = add_feature(row)
    result_df = pd.concat([result_df, changed_df], ignore_index=True)

result_df

Unnamed: 0,time,pressure,pattern
0,0.000000,0.297505,0
1,0.000003,0.297505,0
2,0.000055,0.297931,0
3,0.000058,0.297931,0
4,0.000125,0.297931,0
...,...,...,...
1946291,0.999839,0.113267,0
1946292,0.999869,0.113267,0
1946293,0.999889,0.113267,0
1946294,0.999976,0.113267,0


In [78]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

X = result_df[['time', 'pressure']]
y = result_df['pattern'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)


In [87]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score

catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    random_seed=42,
    boosting_type='Plain',
    grow_policy='Lossguide',
    task_type="GPU",
    bootstrap_type='Bayesian',
    sampling_frequency='PerTree',
    verbose=0
)

param_grid = {
    'iterations': [1000, 2000], 
    'learning_rate': [0.01, 0.03], 
    'l2_leaf_reg': [1, 5, 10], 
    'depth': [6, 8, 10],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=52)


grid_search = GridSearchCV(catboost_model, param_grid, cv=cv, scoring='precision_macro', n_jobs=1, verbose=1, error_score='raise')
grid_search.fit(X_train, y_train)


print("Лучшие параметры:", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Лучшие параметры: {'depth': 8, 'iterations': 1000, 'l2_leaf_reg': 1, 'learning_rate': 0.03}


In [88]:
from sklearn.metrics import classification_report

best_params = grid_search.best_params_
final_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    depth=best_params['depth'],
    loss_function='MultiClass',
    eval_metric='TotalF1',
    random_seed=52,
    boosting_type='Plain',
    grow_policy='Lossguide',
    task_type="GPU",
    bootstrap_type='Bayesian',
    sampling_frequency='PerTree',
    verbose=0
)

final_model.fit(
    X_train, 
    y_train,
    eval_set=(X_test, y_test),  
    use_best_model=True
)

y_pred = final_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print(f"Precision: {precision:.4f}")
print(classification_report(y_test, y_pred))

Precision: 0.9316
              precision    recall  f1-score   support

           0       0.94      0.99      0.97    343212
           1       0.94      0.94      0.94     14112
           2       0.81      0.37      0.51     31936

    accuracy                           0.94    389260
   macro avg       0.90      0.77      0.81    389260
weighted avg       0.93      0.94      0.93    389260



In [89]:
ground_truth

Unnamed: 0,file,recovery,drop
0,00e03657-8e1e-4c8c-a724-1d3c77b48510,"[[2420.9805555555554, 2438.4241666666667], [31...","[[3454.6875, 3764.9605555555554]]"
1,00e4dba2-36d2-42b4-beb1-c55aed75f506,[],"[[13285.465, 19439.800555555557]]"
2,00f035b7-ad7a-4f30-9081-522a3c10805b,[],"[[0.0, 42.75]]"
3,01a0c034-6afc-4e73-95fa-621f702a0b7d,[],"[[0.0, 491.98305555555555]]"
4,01a530d3-6496-4515-9fbb-4f44e298fd29,[],"[[4921.376666666667, 6209.231666666667]]"
...,...,...,...
95,1dfaf03c-e297-4d92-a0bf-40b1a829391f,[],[]
96,1e149fbd-41c6-4779-b87d-c5dc17fbb4c0,[],"[[0.0, 635.3127777777778]]"
97,1e19b77c-8a0e-4749-a384-9c1e679035bf,[],[]
98,1e4b4c18-1e32-45eb-917a-5760e33fbaca,"[[9541.77638888889, 10288.5075]]","[[10339.343055555555, 10739.613055555556], [13..."


In [92]:
df_filtered = ground_truth[ground_truth['file'].isin(['1c9db047-e335-46ac-8039-effd8589b25b', '1cbce6e5-9f0b-419f-9527-7add4e255217'])]
df_filtered

Unnamed: 0,file,recovery,drop
80,1c9db047-e335-46ac-8039-effd8589b25b,"[[329.5966666666667, 341.3513888888889], [2354...","[[341.3513888888889, 2087.836388888889], [2386..."
84,1cbce6e5-9f0b-419f-9527-7add4e255217,"[[3187.110277777778, 3637.1241666666665]]","[[3637.1241666666665, 3768.15], [7493.51666666..."


In [105]:
test_df

Unnamed: 0,time,pressure,pattern
0,0.000000,0.107558,0
1,0.000016,0.107735,0
2,0.000032,0.107676,0
3,0.000062,0.106853,0
4,0.000079,0.106677,0
...,...,...,...
55124,0.999899,0.042494,0
55125,0.999941,0.042494,0
55126,0.999959,0.042553,0
55127,0.999981,0.042612,0


In [None]:
test_y = test_df['pattern']
test_df = test_df[['time', 'pressure']]


y_pred = final_model.predict(test_df)
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45058
           1       0.94      0.77      0.85       222
           2       0.96      0.43      0.60      9849

    accuracy                           0.89     55129
   macro avg       0.93      0.73      0.80     55129
weighted avg       0.90      0.89      0.88     55129



In [118]:
new_df = pd.DataFrame(y_pred)
new_df

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
55124,0
55125,0
55126,0
55127,0


In [119]:
sum(y_pred)

array([9075], dtype=int64)

In [121]:
new_df[new_df[0] != 0]

Unnamed: 0,0
1583,2
1584,2
1585,2
1586,2
1587,2
...,...
16269,2
16270,2
26370,2
45215,2


In [122]:
test_df['pattern'] = y_pred
test_df

Unnamed: 0,time,pressure,pattern
0,0.000000,0.107558,0
1,0.000016,0.107735,0
2,0.000032,0.107676,0
3,0.000062,0.106853,0
4,0.000079,0.106677,0
...,...,...,...
55124,0.999899,0.042494,0
55125,0.999941,0.042494,0
55126,0.999959,0.042553,0
55127,0.999981,0.042612,0


In [123]:
test_df = test_df[test_df['pattern'] != 0]

In [124]:
test_df

Unnamed: 0,time,pressure,pattern
1583,0.025886,0.369931,2
1584,0.025901,0.397496,2
1585,0.025915,0.421947,2
1586,0.025928,0.446162,2
1587,0.025942,0.469790,2
...,...,...,...
16269,0.282689,0.022276,2
16270,0.282704,0.022276,2
26370,0.452188,0.150288,2
45215,0.798943,0.238039,2
