In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
import os
import warnings
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import EfficientFCParameters

# Globally suppress RuntimeWarning and FutureWarning messages for the rest of the session.
# NOTE(review): presumably meant to quiet noisy library output during feature
# extraction and training - confirm that no meaningful warnings are being hidden.
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Root folder of the dataset plus the two per-object metadata logs.
DATA_PATH = '../data/'
train_log_df = pd.read_csv(os.path.join(DATA_PATH, 'train_log.csv'))
test_log_df = pd.read_csv(os.path.join(DATA_PATH, 'test_log.csv'))

# Load all lightcurve data (both train and test): every split folder named in
# the training log contributes one train CSV and one test CSV.
lightcurve_parts = []
for split_folder in tqdm(train_log_df['split'].unique(), desc="Loading Lightcurves"):
    split_dir = os.path.join(DATA_PATH, split_folder)
    for csv_name in ('train_full_lightcurves.csv', 'test_full_lightcurves.csv'):
        lightcurve_parts.append(pd.read_csv(os.path.join(split_dir, csv_name)))

# Stack everything into one long table and discard rows without a flux value.
full_lc_df = pd.concat(lightcurve_parts).dropna(subset=['Flux'])
print("All data loaded.")

Loading Lightcurves: 100%|██████████| 20/20 [00:00<00:00, 28.32it/s]


All data loaded.


In [4]:
# Feature Engineering with tsfresh (on ALL data)
# tsfresh is run once per value of the 'Filter' column; each resulting feature
# column is suffixed with that filter so the per-band tables can sit side by side.
TSFRESH_COLUMNS = {'object_id': 'id', 'Time (MJD)': 'time', 'Flux': 'value'}

band_features = []
for f in tqdm(full_lc_df['Filter'].unique(), desc="Extracting features per filter"):
    in_band = full_lc_df['Filter'] == f
    band_lc = full_lc_df.loc[in_band, list(TSFRESH_COLUMNS)].rename(columns=TSFRESH_COLUMNS)

    band_feats = extract_features(
        band_lc,
        column_id='id',
        column_sort='time',
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=True,
        n_jobs=4,
    )
    band_feats.columns = [f"{col}_{f}" for col in band_feats.columns]
    band_features.append(band_feats)

# Join the per-band tables column-wise on object id, then let tsfresh's
# impute() clean the combined table (called for its in-place effect).
extracted_features = pd.concat(band_features, axis=1)
impute(extracted_features)
print("tsfresh feature extraction complete.")

Extracting features per filter: 100%|██████████| 6/6 [11:51<00:00, 118.54s/it]


tsfresh feature extraction complete.


In [10]:
# --- 3. Prepare Train and Test Sets ---
def _attach_metadata(features, log_df):
    """Append the 'Z' and 'EBV' columns of ``log_df`` to ``features``.

    Rows are matched between the index of ``features`` and the 'object_id'
    column of ``log_df``; the result is indexed by 'object_id'.
    """
    merged = features.merge(
        log_df[['object_id', 'Z', 'EBV']], left_index=True, right_on='object_id'
    )
    return merged.set_index('object_id')


print("\n--- Preparing Train and Test Sets ---")
train_ids = train_log_df['object_id'].unique()
test_ids = test_log_df['object_id'].unique()
train_features = extracted_features[extracted_features.index.isin(train_ids)]
test_features = extracted_features[extracted_features.index.isin(test_ids)]

# The isin() filter silently drops any test object that has no row in
# extracted_features, which would later leave it out of the predictions.
# Make that visible instead of failing quietly.
missing_test_ids = pd.Index(test_ids).difference(test_features.index)
if len(missing_test_ids) > 0:
    warnings.warn(
        f"{len(missing_test_ids)} test object(s) have no extracted features and "
        "will be absent from the prediction set.",
        UserWarning,
    )

# Labels, looked up in the same row order as train_features.
y = train_log_df.set_index('object_id').loc[train_features.index, 'target']

# NOTE(review): feature selection uses every training label before the
# cross-validation split performed later in the notebook, so CV-derived
# numbers may be optimistic.
print("Performing feature selection...")
relevant_feature_cols = select_features(train_features, y, fdr_level=0.005).columns
train_features = train_features[relevant_feature_cols]
test_features = test_features[relevant_feature_cols]

train_full = _attach_metadata(train_features, train_log_df)
test_full = _attach_metadata(test_features, test_log_df)
# Give the test frame exactly the training columns (missing ones filled with 0).
train_full, test_full = train_full.align(test_full, join='left', axis=1, fill_value=0)

# Downstream code pairs rows of train_full with y purely by position, so fail
# loudly if the merge ever changes the number or order of rows.
if not np.array_equal(train_full.index.to_numpy(), y.index.to_numpy()):
    raise ValueError(
        "Row order of the training features no longer matches the labels in `y`."
    )



--- Preparing Train and Test Sets ---
Performing feature selection...


In [None]:
def _sanitize_feature_names(names):
    """Turn arbitrary column labels into unique names of alphanumerics and '_'.

    Every non-alphanumeric character is replaced by '_'. If two different
    labels collapse to the same sanitized name, the later one receives a
    numeric suffix ('_1', '_2', ...) so that the returned names stay unique.
    Labels that do not collide come out exactly as with plain replacement.
    """
    used = set()
    cleaned = []
    for name in names:
        base = "".join(ch if ch.isalnum() else "_" for ch in str(name))
        candidate = base
        suffix = 1
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        cleaned.append(candidate)
    return cleaned


# Replace special characters in the feature names. Duplicate names produced by
# the replacement are disambiguated, since duplicated feature names are
# expected to be rejected by the model later on.
train_full.columns = _sanitize_feature_names(train_full.columns)
test_full.columns = train_full.columns # Ensure test columns match exactly

# Standardise using statistics from the training rows only, then wrap the
# resulting arrays back into DataFrames with the original labels.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(train_full), columns=train_full.columns, index=train_full.index
)
X_test = pd.DataFrame(
    scaler.transform(test_full), columns=test_full.columns, index=test_full.index
)
print("Train and test sets are ready.")

Train and test sets are ready.


In [12]:
print("\n--- Finding Optimal Threshold with Cross-Validation ---")
# Hyperparameters handed to every LGBMClassifier built in this notebook.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# a bagging frequency (subsample_freq / bagging_freq) > 0 is also set; none is
# given here, so confirm whether 'subsample' is meant to be active.
# NOTE(review): 'device' is fixed to 'gpu' - presumably this needs a
# GPU-enabled LightGBM build; confirm for the target environment.
best_params = dict(
    learning_rate=0.0361,
    num_leaves=120,
    max_depth=11,
    min_child_samples=80,
    subsample=0.5577,
    colsample_bytree=0.5736,
    reg_alpha=0.3158,
    reg_lambda=0.3171,
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    n_estimators=2000,
    device='gpu',
    verbose=-1,
)



--- Finding Optimal Threshold with Cross-Validation ---


In [13]:
# Stratified K-fold search for the probability cut-off that maximises F1.
# In each fold a model is trained with early stopping, the validation
# probabilities are scored at every candidate threshold, and the best
# threshold is recorded; the final cut-off is the mean over folds.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# The candidate grid is the same for every fold, so build it once.
thresholds = np.linspace(0.01, 0.99, 100)
thresholds_list = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y)):
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X_train.iloc[val_idx], y.iloc[val_idx]

    # Class weight from this fold's training labels only. A per-fold copy of
    # the parameters keeps the shared best_params dict untouched.
    pos_count = y_train_fold.sum()
    neg_count = len(y_train_fold) - pos_count
    fold_params = {
        **best_params,
        'scale_pos_weight': (neg_count / pos_count) if pos_count > 0 else 1,
    }

    model = lgb.LGBMClassifier(**fold_params)
    # 'f1' is not one of LightGBM's built-in metric names, so early stopping
    # was effectively driven by the 'binary_logloss' metric from the
    # parameters; name that metric explicitly instead.
    model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)],
              eval_metric='binary_logloss', callbacks=[lgb.early_stopping(100, verbose=False)])

    val_preds_proba = model.predict_proba(X_val_fold)[:, 1]
    # zero_division=0 scores thresholds with no positive predictions as 0
    # without emitting an undefined-metric warning for each of them.
    f1_values = [
        f1_score(y_val_fold, (val_preds_proba > t).astype(int), zero_division=0)
        for t in thresholds
    ]
    best_threshold = thresholds[np.argmax(f1_values)]
    thresholds_list.append(best_threshold)

OPTIMAL_THRESHOLD = np.mean(thresholds_list)
print(f"Determined Optimal Threshold: {OPTIMAL_THRESHOLD:.4f}")

Determined Optimal Threshold: 0.2733


In [14]:
import re

print("\n--- Training Final Model and Generating Predictions ---")
# Class weight computed from the full training label vector.
pos_count = y.sum()
neg_count = len(y) - pos_count
# Build a local parameter set instead of writing into the shared best_params
# dict, so this cell neither depends on nor leaves behind a scale_pos_weight
# value from another cell.
final_params = {
    **best_params,
    'scale_pos_weight': (neg_count / pos_count) if pos_count > 0 else 1,
}

# NOTE(review): this fit has no eval_set or early stopping, so it trains for
# the full n_estimators rounds, whereas OPTIMAL_THRESHOLD was derived from
# early-stopped cross-validation models. Confirm that the threshold transfers
# to this model's probability scale.
final_model = lgb.LGBMClassifier(**final_params)
final_model.fit(X_train, y)
test_predictions_proba = final_model.predict_proba(X_test)[:, 1]
# Positive class when the predicted probability exceeds the tuned cut-off.
test_predictions = (test_predictions_proba > OPTIMAL_THRESHOLD).astype(int)
print("Predictions generated.")



--- Training Final Model and Generating Predictions ---
Predictions generated.


In [15]:
# --- 6. Create and Save Submission File ---
print("\n--- Creating Submission File ---")

# One row per object in X_test: its id (taken from the index) and the 0/1 prediction.
submission_columns = {
    'object_id': X_test.index,
    'prediction': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write without the positional index so the file holds only the two columns.
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")

# Quick visual sanity check of the first rows.
print("\n--- Final Submission Head ---")
submission_preview = submission_df.head()
print(submission_preview)
print("-" * 28)



--- Creating Submission File ---
Submission file 'submission.csv' created successfully!

--- Final Submission Head ---
                object_id  prediction
0    Dornhoth_adar_imrath           0
1    Dornhoth_celeb_achad           0
2    Dornhoth_firion_fern           0
3      Dornhoth_glae_aras           1
4  Dornhoth_lain_tinuviel           1
----------------------------
