In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

def read_large_file_mod(file_path, chunksize=10**6):
    selected_rows = []
    
    for chunk in tqdm(pd.read_csv(file_path, chunksize=chunksize)):
        selected_rows.append(chunk)
    
    return pd.concat(selected_rows, ignore_index=True)

raw = read_large_file_mod("train.csv")
raw.head()

630it [01:16,  8.18it/s]


Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4691
1,6,1.4691
2,8,1.4691
3,5,1.4691
4,8,1.4691


In [14]:
import numpy as np
import pandas as pd
from tsfresh.feature_extraction import feature_calculators
import librosa
import pywt

np.random.seed(1337)
noise_signal = np.random.normal(0, 0.5, 150_000).astype(np.float64)

def denoise_signal(x, wavelet='db4', level=1):
    print("Denoising signal")
    coefficients = pywt.wavedec(x, wavelet, mode="per")
    threshold = 10
    coefficients[1:] = [pywt.threshold(coeff, value=threshold, mode='hard') for coeff in coefficients[1:]]
    return pywt.waverec(coefficients, wavelet, mode='per')


def generate_features(signal, sr=16000):
    features = pd.DataFrame(index=[0], dtype=np.float64)

    signal = signal.astype(np.float64)  # Ensure signal is float64
    signal += noise_signal
    signal -= np.median(signal)
    
    denoised_signal = denoise_signal(signal)
    mfcc_features = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=20)  # Provide signal and sampling rate
    mfcc_mean = mfcc_features.mean(axis=1)
    rolling_std_percentile = np.percentile(
        pd.Series(signal).rolling(50).std().dropna().values, 20
    )
    
    features['num_peaks_denoised_2'] = feature_calculators.number_peaks(denoised_signal, 2)
    features['percentile_roll50_std_20'] = rolling_std_percentile
    features['mfcc_mean_18'] = mfcc_mean[18]
    features['mfcc_mean_4'] = mfcc_mean[4]
    
    return features



In [16]:
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc

def parse_sample(sample, start):
    print(f"[parse_sample] Processing sample starting at index {start}")
    features = generate_features(sample['acoustic_data'].values)
    print(f"[parse_sample] Features generated for index {start}")
    features['start'] = start
    features['target'] = sample['time_to_failure'].values[-1]
    print(f"[parse_sample] Added 'start' and 'target' for index {start}")
    return features

def sample_train_gen(df, segment_size=150_000, indices_to_calculate=None):
    indices_to_calculate = indices_to_calculate if len(indices_to_calculate) > 0 else []
    print(f"[sample_train_gen] Indices to calculate: {indices_to_calculate}")

    with tqdm(total=len(indices_to_calculate), desc="[sample_train_gen] Processing Train Segments") as pbar:
        result = []
        print("[sample_train_gen] Starting parallel processing...")
        for res in Parallel(
            n_jobs=4, temp_folder="./tmp", max_nbytes=None, backend="loky"
        )(delayed(parse_sample)(df[int(i):int(i) + segment_size], int(i)) for i in indices_to_calculate):
            print(f"[sample_train_gen] Finished processing index {indices_to_calculate}")
            result.append(res)
            pbar.update(1)
    
    print("[sample_train_gen] Combining parsed samples into a DataFrame")
    data = np.vstack([r.values for r in result])
    print("[sample_train_gen] Data concatenated into array")
    X = pd.DataFrame(data, columns=result[0].columns)
    print("[sample_train_gen] DataFrame created and sorted by 'start'")
    return X.sort_values("start")

def parse_sample_test(seg_id):
    print(f"[parse_sample_test] Parsing test segment ID: {seg_id}")
    sample = pd.read_csv(f'./test/{seg_id}.csv', dtype={'acoustic_data': np.int32})
    print(f"[parse_sample_test] Test segment {seg_id} loaded")
    features = generate_features(sample['acoustic_data'].values)
    print(f"[parse_sample_test] Features generated for test segment {seg_id}")
    features['seg_id'] = seg_id
    print(f"[parse_sample_test] Added 'seg_id' for test segment {seg_id}")
    return features

def sample_test_gen():
    print("[sample_test_gen] Reading sample submission file")
    submission = pd.read_csv('sample_submission.csv', index_col='seg_id')
    print(f"[sample_test_gen] Found {len(submission)} test segments to process")
    
    print("[sample_test_gen] Starting parallel processing for test segments...")
    result = Parallel(
        n_jobs=4, temp_folder="./tmp", max_nbytes=None, backend="loky"
    )(delayed(parse_sample_test)(seg_id) for seg_id in tqdm(submission.index))
    
    print("[sample_test_gen] Combining parsed test segments into a DataFrame")
    data = np.vstack([r.values for r in result])
    print("[sample_test_gen] Test data concatenated into array")
    return pd.DataFrame(data, columns=result[0].columns)

print("[Main] Calculating training indices")
indices_to_calculate = raw.index.values[::150_000][:-1]
print(f"[Main] Training indices calculated: {indices_to_calculate}")

print("[Main] Generating training data")
train = sample_train_gen(raw, indices_to_calculate=indices_to_calculate)
print("[Main] Training data generation completed")

gc.collect()

print("[Main] Generating test data")
test = sample_test_gen()
print("[Main] Test data generation completed")


[Main] Calculating training indices
[Main] Training indices calculated: [        0    150000    300000 ... 628650000 628800000 628950000]
[Main] Generating training data
[sample_train_gen] Indices to calculate: [        0    150000    300000 ... 628650000 628800000 628950000]



[sample_train_gen] Processing Train Segments:   0%|                                           | 0/4194 [00:00<?, ?it/s][A

[sample_train_gen] Starting parallel processing...



[sample_train_gen] Processing Train Segments:   0%|                                | 1/4194 [00:51<60:15:59, 51.74s/it][A
[sample_train_gen] Processing Train Segments: 100%|████████████████████████████████| 4194/4194 [00:51<00:00, 80.82it/s][A

[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[sample_train_gen] Finished processing index [        0    150000    300000 ... 628650000 628800000 628950000]
[




[sample_train_gen] Data concatenated into array
[sample_train_gen] DataFrame created and sorted by 'start'
[Main] Training data generation completed
[Main] Generating test data
[sample_test_gen] Reading sample submission file
[sample_test_gen] Found 2624 test segments to process
[sample_test_gen] Starting parallel processing for test segments...



  0%|                                                                                         | 0/2624 [00:00<?, ?it/s][A
  1%|▍                                                                              | 16/2624 [00:00<00:22, 113.72it/s][A
  1%|▊                                                                              | 28/2624 [00:00<00:24, 106.10it/s][A
  2%|█▏                                                                             | 40/2624 [00:00<00:25, 102.65it/s][A
  2%|█▋                                                                              | 56/2624 [00:00<00:31, 81.69it/s][A
  3%|██▏                                                                             | 72/2624 [00:00<00:34, 74.59it/s][A
  3%|██▋                                                                             | 88/2624 [00:01<00:35, 72.01it/s][A
  4%|███▏                                                                           | 104/2624 [00:01<00:35, 70.97it/s][A
  5%|███▌      

[sample_test_gen] Combining parsed test segments into a DataFrame
[sample_test_gen] Test data concatenated into array
[Main] Test data generation completed


In [17]:
etq_meta = [
{"start":0,         "end":5656574},
{"start":5656574,   "end":50085878},
{"start":50085878,  "end":104677356},
{"start":104677356, "end":138772453},
{"start":138772453, "end":187641820},
{"start":187641820, "end":218652630},
{"start":218652630, "end":245829585},
{"start":245829585, "end":307838917},
{"start":307838917, "end":338276287},
{"start":338276287, "end":375377848},
{"start":375377848, "end":419368880},
{"start":419368880, "end":461811623},
{"start":461811623, "end":495800225},
{"start":495800225, "end":528777115},
{"start":528777115, "end":585568144},
{"start":585568144, "end":621985673},
{"start":621985673, "end":629145480},
]

for i, meta in enumerate(etq_meta):
    train.loc[
        (train['start'] + 150_000 >= meta["start"]) & 
        (train['start'] <= meta["end"] - 150_000), 
        "eq"
    ] = i

train_sample = train[train["eq"].isin([2, 7, 0, 4, 11, 13, 9, 1, 14, 10])]

    

In [18]:
print(f"Mean:   {train_sample['target'].mean():.4}")
print(f"Median: {train_sample['target'].median():.4}")

Mean:   6.258
Median: 6.031


In [22]:
train_sample

Unnamed: 0,num_peaks_denoised_2,percentile_roll50_std_20,mfcc_mean_18,mfcc_mean_4,start,target,eq
0,6210.0,2.697163,3.832010,-18.311167,0.0,1.430797,0.0
1,6116.0,2.701290,3.885398,-18.294075,150000.0,1.391499,0.0
2,6748.0,2.780605,3.937254,-20.345921,300000.0,1.353196,0.0
3,6259.0,2.718767,3.746066,-18.466328,450000.0,1.313798,0.0
4,6230.0,2.718618,3.747766,-17.991143,600000.0,1.274400,0.0
...,...,...,...,...,...,...,...
3898,5705.0,2.664090,3.674239,-15.818801,584700000.0,0.186797,14.0
3899,4202.0,2.566411,3.183607,-12.900424,584850000.0,0.147399,14.0
3900,3718.0,2.531012,3.186320,-12.114131,585000000.0,0.109096,14.0
3901,2477.0,2.482978,2.288461,-10.011615,585150000.0,0.069798,14.0


In [27]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd

random.seed(1234)

# Features and data
features = [
    'num_peaks_denoised_2',
    'percentile_roll50_std_20',
    'mfcc_mean_4',
    'mfcc_mean_18'
]
print("[Info] Using features:", features)

target = train_sample["target"].values
train_X = train_sample[features].values
test_X = test[features].values
print(f"[Info] Training data shape: {train_X.shape}, Test data shape: {test_X.shape}")

submission = pd.read_csv('sample_submission.csv', index_col='seg_id')
oof = np.zeros(len(train_X))
prediction = np.zeros(len(submission))
n_fold = 3
kf = KFold(n_splits=n_fold, shuffle=True, random_state=1337)
print(f"[Info] K-Fold initialized with {n_fold} splits")

# Train LightGBM with verbose progress
print("\n[Training] Starting K-Fold cross-validation...")
for fold_n, (train_index, valid_index) in enumerate(kf.split(train_sample)):
    print(f"\n[Training] Fold {fold_n + 1}/{n_fold}")
    print(f"[Fold {fold_n}] Training set size: {len(train_index)}, Validation set size: {len(valid_index)}")
    
    trn_data = lgb.Dataset(train_X[train_index], label=target[train_index])
    val_data = lgb.Dataset(train_X[valid_index], label=target[valid_index])
    print(f"[Fold {fold_n}] Datasets prepared")

    params = {
        'num_leaves': 4,
        'min_data_in_leaf': 5,
        'objective': 'fair',
        'max_depth': -1,
        'learning_rate': 0.02,
        "boosting": "gbdt",
        'boost_from_average': True,
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.5,
        "bagging_seed": 0,
        "metric": 'mae',
        "verbosity": -1,
        'max_bin': 500,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'seed': 0,
        'n_jobs': 1
    }
    print(f"[Fold {fold_n}] LightGBM parameters set")

    # Training with progress tracking
    with tqdm(total=1000000, desc=f"[Fold {fold_n}] Training LightGBM") as pbar:
        clf = lgb.train(
            params,
            trn_data,
            num_boost_round=1000000,
            valid_sets=[trn_data, val_data],
            callbacks=[lambda env: pbar.update(env.iteration - pbar.n)]
        )
    print(f"[Fold {fold_n}] Training completed. Best iteration: {clf.best_iteration}")

    # Predictions
    oof[valid_index] += clf.predict(train_X[valid_index], num_iteration=clf.best_iteration)
    print(f"[Fold {fold_n}] OOF predictions updated for validation set")
    prediction += clf.predict(test_X, num_iteration=clf.best_iteration)
    print(f"[Fold {fold_n}] Predictions updated for test set")

# Average predictions across folds
prediction /= n_fold
print("\n[Info] Final predictions averaged across folds")

# Calculate MAE
mae = mean_absolute_error(target, oof)
print(f"\n[Results] MAE: {mae}")


[Info] Using features: ['num_peaks_denoised_2', 'percentile_roll50_std_20', 'mfcc_mean_4', 'mfcc_mean_18']
[Info] Training data shape: (2857, 4), Test data shape: (2624, 4)
[Info] K-Fold initialized with 3 splits

[Training] Starting K-Fold cross-validation...

[Training] Fold 1/3
[Fold 0] Training set size: 1904, Validation set size: 953
[Fold 0] Datasets prepared
[Fold 0] LightGBM parameters set


[Fold 0] Training LightGBM: 100%|██████████████████████████████████████████▉| 999999/1000000 [02:42<00:00, 6143.75it/s]


[Fold 0] Training completed. Best iteration: 0
[Fold 0] OOF predictions updated for validation set
[Fold 0] Predictions updated for test set

[Training] Fold 2/3
[Fold 1] Training set size: 1905, Validation set size: 952
[Fold 1] Datasets prepared
[Fold 1] LightGBM parameters set


[Fold 1] Training LightGBM: 100%|██████████████████████████████████████████▉| 999999/1000000 [02:38<00:00, 6321.74it/s]


[Fold 1] Training completed. Best iteration: 0
[Fold 1] OOF predictions updated for validation set
[Fold 1] Predictions updated for test set

[Training] Fold 3/3
[Fold 2] Training set size: 1905, Validation set size: 952
[Fold 2] Datasets prepared
[Fold 2] LightGBM parameters set


[Fold 2] Training LightGBM: 100%|██████████████████████████████████████████▉| 999999/1000000 [02:41<00:00, 6174.95it/s]


[Fold 2] Training completed. Best iteration: 0
[Fold 2] OOF predictions updated for validation set
[Fold 2] Predictions updated for test set

[Info] Final predictions averaged across folds

[Results] MAE: 2.485138713774192


In [28]:
submission['time_to_failure'] = prediction 
print(submission.head())
submission.to_csv('submission.csv')

            time_to_failure
seg_id                     
seg_00030f         3.122526
seg_0012b5         3.076737
seg_00184e         6.765220
seg_003339        12.330254
seg_0042cc         5.617962
