# Mallorn Challenge: Astronomical Time-Series Classification
## Model: Feature Engineering + LightGBM

1. Imports & Setup

In [None]:
import os
import gc
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Display configuration: show up to 500 DataFrame columns and pick the plot style.
pd.set_option('display.max_columns', 500)
plt.style.use('seaborn-v0_8-darkgrid')

# Report the LightGBM version in use.
print(f"LightGBM version: {lgb.__version__}")

2. CONFIG & Helper Functions

In [None]:
class CFG:
    """Central configuration: data locations, CV settings and LightGBM params."""

    # Paths (the actual data files are discovered automatically below INPUT_ROOT)
    INPUT_ROOT = '/kaggle/input'
    WORKING_DIR = '/kaggle/working'

    # First run: leave as None.
    # Second run: point this at the dataset that holds the first run's output
    # (e.g. '/kaggle/input/mallorn-part1-output').
    RESUME_PATH = None
    # RESUME_PATH = '/kaggle/input/my-previous-output-dataset'

    # Model params
    n_folds = 5
    seed = 42
    target_col = 'target'
    id_col = 'object_id'

    # LightGBM hyperparameters
    lgb_params = {
        'objective': 'binary',  # or 'multiclass' when there are more than 2 classes
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',

        # Learn slowly to land on a better optimum
        'learning_rate': 0.03,

        # Complexity control (avoid overfitting)
        'num_leaves': 31,        # a simpler model generalises better
        'max_depth': -1,         # tree depth limit (currently left unrestricted)
        'min_data_in_leaf': 20,

        # Regularisation (penalise overly large weights)
        'lambda_l1': 0.05,       # L1 regularisation
        'lambda_l2': 0.05,       # L2 regularisation

        # Sampling (each iteration sees only part of the data/features,
        # which works against memorisation)
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'n_jobs': -1,
        'verbose': -1,
        # Reuse the single seed defined above so the two can never drift apart.
        'seed': seed,
        'is_unbalance': True,    # important: the dataset is imbalanced (0 >> 1)
    }

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this value
    # only takes effect in processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the configured seed once, before any data loading or training runs.
seed_everything(CFG.seed)

# Dataset discovery helper
def find_files(input_root=None):
    """Locate the log files and light-curve files below a root directory.

    Args:
        input_root: Directory to scan. Defaults to ``CFG.INPUT_ROOT`` when
            omitted, which is what the notebook itself uses.

    Returns:
        A tuple ``(train_log_path, test_log_path, sample_sub_path, lc_files)``.
        Each of the first three is a path, or ``None`` when the file was not
        found. ``lc_files`` is a sorted list of every
        ``*_full_lightcurves.csv`` path below the root.
    """
    search_root = CFG.INPUT_ROOT if input_root is None else input_root

    # Main log files. If a name occurs in several folders, the last folder
    # visited wins; sorting `dirs` in place fixes the visiting order so that
    # choice is reproducible between runs.
    found = {
        'train_log.csv': None,
        'test_log.csv': None,
        'sample_submission.csv': None,
    }
    for root, dirs, files in os.walk(search_root):
        dirs.sort()
        for name in found:
            if name in files:
                found[name] = os.path.join(root, name)

    train_log_path = found['train_log.csv']
    test_log_path = found['test_log.csv']
    sample_sub_path = found['sample_submission.csv']

    print(f"Train Log: {train_log_path}")
    print(f"Test Log: {test_log_path}")

    # Light-curve files live in the split folders. The same file name may
    # appear in several splits, so collect all of them. The list is sorted
    # because glob gives no ordering guarantee and the files are concatenated
    # in this order later on.
    lc_pattern = os.path.join(search_root, '**', '*_full_lightcurves.csv')
    lc_files = sorted(glob.glob(lc_pattern, recursive=True))
    print(f"Found {len(lc_files)} lightcurve files.")

    return train_log_path, test_log_path, sample_sub_path, lc_files

# Resolve the dataset paths once; the data-loading cell below reads these names.
train_log_path, test_log_path, sample_sub_path, lc_files = find_files()

3. Data Loading & Merging

In [None]:
print("=== Loading Metadata (Logs) ===")
# Metadata tables (labels, Z, EBV, ...). The training log is mandatory.
if not train_log_path:
    raise FileNotFoundError("Kh√¥ng t√¨m th·∫•y train_log.csv!")
train_meta = pd.read_csv(train_log_path)
print(f"Train Meta Shape: {train_meta.shape}")
display(train_meta.head(3))

# A missing test log only produces a warning here.
if not test_log_path:
    print("Warning: Test log not found.")
else:
    test_meta = pd.read_csv(test_log_path)
    print(f"Test Meta Shape: {test_meta.shape}")

print("\n=== Loading Lightcurves (This may take a while) ===")
# The files can be very large, so only the columns needed downstream are read.
# Each file is loaded separately and everything is concatenated at the end.
dfs = []
cols_to_use = ['object_id', 'Time (MJD)', 'Flux', 'Flux_err', 'Filter']

# Read the files one by one behind a progress bar; a file that fails to load
# is reported and skipped.
for lc_path in tqdm(lc_files, desc="Reading LC files"):
    try:
        chunk = pd.read_csv(lc_path, usecols=lambda c: c in cols_to_use)
    except Exception as e:
        print(f"Error reading {lc_path}: {e}")
    else:
        dfs.append(chunk)

if not dfs:
    raise ValueError("Kh√¥ng ƒë·ªçc ƒë∆∞·ª£c d·ªØ li·ªáu Lightcurve n√†o!")

# Splits may contain overlapping rows, so exact duplicates are dropped after
# stacking everything into one table.
full_lc = pd.concat(dfs, ignore_index=True).drop_duplicates()
print(f"Total Lightcurve Points: {len(full_lc)}")
display(full_lc.head())

4. Feature Engineering

In [None]:
from scipy.stats import linregress

feature_file_name = 'extracted_features.csv'
loaded_features = False

# Candidate cache locations, in priority order:
#   1. the output of a previous session (only when CFG.RESUME_PATH is set)
#   2. the working directory of the current session
resume_feature_path = (
    os.path.join(CFG.RESUME_PATH, feature_file_name) if CFG.RESUME_PATH else None
)
working_feature_path = os.path.join(CFG.WORKING_DIR, feature_file_name)

if resume_feature_path is not None and os.path.exists(resume_feature_path):
    print(f"üîÑ Found saved features in Input: {CFG.RESUME_PATH}")
    lc_features = pd.read_csv(resume_feature_path)
    loaded_features = True
elif os.path.exists(working_feature_path):
    print(f"üîÑ Found saved features in Working Directory")
    lc_features = pd.read_csv(working_feature_path)
    loaded_features = True

# 3. Compute the features from scratch when no cache was found
if not loaded_features:
    print("‚ö° No cached features found. Starting extraction (This takes time)...")

    def extract_features(lc_df):
        """Collapse light-curve rows into one feature row per ``object_id``.

        Args:
            lc_df: DataFrame with the columns ``object_id``, ``Flux``,
                ``Flux_err``, ``Time (MJD)`` and ``Filter``.

        Returns:
            DataFrame holding the global flux/time statistics, the mean-flux
            differences between adjacent bands, the per-filter flux
            statistics and two derived ratios, joined on ``object_id``.
        """
        print("Extracting features (Phase 2: Advanced Statistics)...")

        # 1. Global statistics across all filters
        aggs = {
            'Flux': [
                'min', 'max', 'mean', 'std', 'skew',
                lambda x: np.percentile(x, 25),
                lambda x: np.percentile(x, 50),
                lambda x: np.percentile(x, 75),
                lambda x: np.percentile(x, 95) - np.percentile(x, 5)
            ],
            'Flux_err': ['mean', 'max'],
            'Time (MJD)': [lambda x: x.max() - x.min(), 'count']
        }

        features = lc_df.groupby('object_id').agg(aggs)
        features.columns = ['_'.join(col).strip() for col in features.columns.values]

        # The anonymous aggregations come back labelled '<lambda_N>' (numbered
        # per source column); map them to readable names.
        rename_dict = {
            'Flux_<lambda_0>': 'flux_q25',
            'Flux_<lambda_1>': 'flux_median',
            'Flux_<lambda_2>': 'flux_q75',
            'Flux_<lambda_3>': 'flux_range90',
            'Time (MJD)_<lambda_0>': 'duration',
            'Time (MJD)_count': 'n_obs'
        }
        features.rename(columns=rename_dict, inplace=True)

        # 2. Per-filter flux statistics
        pivot_stats = pd.pivot_table(lc_df, index='object_id', columns='Filter',
                                     values=['Flux'],  # passed as a list
                                     aggfunc=['mean', 'std', 'max', 'min'])

        # With a list of aggfuncs each column tuple is (aggfunc, value, filter),
        # so col[2]=filter, col[1]=value, col[0]=aggfunc and the flattened
        # names look like 'u_Flux_mean', 'g_Flux_std', ...
        pivot_stats.columns = [f"{col[2]}_{col[1]}_{col[0]}" for col in pivot_stats.columns.values]

        # 3. Colour indices: difference of mean flux between adjacent bands.
        # A pair is skipped when either band has no mean column.
        bands = ['u', 'g', 'r', 'i', 'z', 'y']
        for b1, b2 in zip(bands, bands[1:]):
            # Flattened names start with the filter name, so a prefix match on
            # the band letter finds that band's mean column.
            c1 = [c for c in pivot_stats.columns if c.startswith(b1) and 'mean' in c]
            c2 = [c for c in pivot_stats.columns if c.startswith(b2) and 'mean' in c]

            if c1 and c2:
                features[f'color_{b1}-{b2}'] = pivot_stats[c1[0]] - pivot_stats[c2[0]]

        # Join the global and per-filter tables
        final_features = features.merge(pivot_stats, on='object_id', how='left')

        # 4. Ratios (the small epsilon guards against division by zero)
        final_features['flux_std_over_mean'] = final_features['Flux_std'] / (final_features['Flux_mean'].abs() + 1e-6)
        final_features['amplitude'] = final_features['Flux_max'] - final_features['Flux_min']

        return final_features

    # The raw light-curve table must already exist in this session.
    if 'full_lc' not in globals():
        raise NameError("Bi·∫øn 'full_lc' ch∆∞a ƒë∆∞·ª£c t·∫°o. H√£y ch·∫°y l·∫°i Cell 4!")

    # Run the extraction
    lc_features = extract_features(full_lc)

    # Expose object_id as an ordinary column so a freshly computed table has
    # the same layout as one reloaded from the CSV cache (where read_csv
    # returns object_id as a column).
    if 'object_id' not in lc_features.columns:
        lc_features = lc_features.rename_axis('object_id').reset_index()

    # Save; object_id is already a column, so no separate index column is written.
    save_path = os.path.join(CFG.WORKING_DIR, feature_file_name)
    lc_features.to_csv(save_path, index=False)
    print(f"üíæ Features saved to: {save_path}")

    # Release the raw tables. pop() tolerates a name that is already gone, so
    # a missing `dfs` cannot fail the cell after the features were saved.
    for _raw_name in ('full_lc', 'dfs'):
        globals().pop(_raw_name, None)
    gc.collect()

print(f"Features shape: {lc_features.shape}")
display(lc_features.head())

5. Prepare Train/Test Datasets

In [None]:
print("=== Preparing Final Datasets ===")

# Attach the per-object light-curve features to the metadata tables with a
# left join, so every metadata row is kept.
train_df = train_meta.merge(lc_features, how='left', on='object_id')
test_df = test_meta.merge(lc_features, how='left', on='object_id')

# Columns that must not be fed to the model (text fields, split marker,
# the target itself and the identifier).
drop_cols = ['SpecType', 'English Translation', 'split', 'target', 'object_id']

# Every remaining training column is a feature; column order is preserved.
_excluded = set(drop_cols)
feature_cols = []
for _column in train_df.columns:
    if _column not in _excluded:
        feature_cols.append(_column)

print(f"Features used for training ({len(feature_cols)}):")
print(feature_cols[:10], "...")

# Class balance of the target
print("Target Distribution:")
print(train_df['target'].value_counts())


6. Training Model (LightGBM)

In [None]:
from sklearn.metrics import f1_score

print("=== Starting Training with Resume Logic ===")


def _find_best_threshold(y_true, probs, thresholds):
    """Pick the probability cut-off that maximises macro-F1.

    Args:
        y_true: Ground-truth binary labels.
        probs: Predicted probabilities aligned with ``y_true``.
        thresholds: Iterable of candidate cut-offs, tried in order.

    Returns:
        ``(best_threshold, best_f1)``. Ties keep the earliest candidate, and
        ``(0.5, 0)`` is returned when no candidate scores above zero.
    """
    best_score, best_cutoff = 0, 0.5
    for cutoff in thresholds:
        score = f1_score(y_true, (probs > cutoff).astype(int), average='macro')
        if score > best_score:
            best_score, best_cutoff = score, cutoff
    return best_cutoff, best_score


X = train_df[feature_cols]
y = train_df['target']
X_test = test_df[feature_cols]

skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n--- Fold {fold+1}/{CFG.n_folds} ---")

    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Model file name for this fold
    model_filename = f"lgb_model_fold_{fold}.txt"

    # 1. Look for a saved model in RESUME_PATH (a previous session takes priority).
    # NOTE(review): a resumed model is assumed to have been trained on this same
    # fold split; otherwise the out-of-fold scores below are optimistic - confirm.
    resume_model_path = os.path.join(CFG.RESUME_PATH, model_filename) if CFG.RESUME_PATH else None

    model = None

    if resume_model_path and os.path.exists(resume_model_path):
        print(f"üîÑ Resuming: Loading existing model from {resume_model_path}")
        model = lgb.Booster(model_file=resume_model_path)
    else:
        # 2. Nothing to resume from: train a new model
        print(f"‚ö° Training New Model for Fold {fold+1}...")
        # Build the train/validation datasets once per fold.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        callbacks = [
            # Stop once the validation metric has not improved for 100 rounds
            lgb.early_stopping(stopping_rounds=100),
            # Log only every 500 rounds to keep the output readable
            lgb.log_evaluation(period=500)
        ]

        # Train
        model = lgb.train(
            CFG.lgb_params,
            dtrain,
            valid_sets=[dtrain, dval],
            # High round cap to go with the low learning rate
            num_boost_round=5000,
            callbacks=callbacks
        )

        # Save the model as soon as the fold finishes training
        save_path = os.path.join(CFG.WORKING_DIR, model_filename)
        model.save_model(save_path)
        print(f"üíæ Model saved to {save_path}")

    models.append(model)

    # Predict
    val_probs = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds[val_idx] = val_probs

    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / CFG.n_folds

    # === Best threshold for this fold ===
    best_thr, best_f1 = _find_best_threshold(y_val, val_probs, np.arange(0.1, 0.9, 0.05))

    print(f"Fold {fold+1} Best Threshold: {best_thr:.2f} | F1-Macro: {best_f1:.4f}")

print("\n=== Training Finished ===")

# === Best overall threshold, searched on all out-of-fold predictions ===
best_global_thr, best_global_f1 = _find_best_threshold(y, oof_preds, np.arange(0.1, 0.9, 0.01))

print(f"\nüèÜ OVERALL Best Threshold: {best_global_thr:.2f}")
print(f"üèÜ OVERALL CV F1-Macro: {best_global_f1:.4f}")

# Label the test predictions with the tuned threshold
final_preds_labels = (test_preds > best_global_thr).astype(int)

7. Evaluation & Feature Importance

In [None]:
# Overall evaluation of the out-of-fold predictions at a fixed 0.5 cut-off
oof_labels = (oof_preds > 0.5).astype(int)
overall_f1 = f1_score(y, oof_labels, average='macro')
overall_acc = accuracy_score(y, oof_labels)

for _metric_line in (
    f"Overall CV F1-Macro: {overall_f1:.4f}",
    f"Overall CV Accuracy: {overall_acc:.4f}",
):
    print(_metric_line)
print("\nConfusion Matrix:")
print(confusion_matrix(y, oof_labels))
print("\nClassification Report:")
print(classification_report(y, oof_labels))

# Feature importance: add up the per-model importances, then keep the 20
# largest totals for plotting.
_total_importance = sum([booster.feature_importance() for booster in models])
feature_importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": _total_importance,
})
feature_importance = feature_importance.sort_values(by="importance", ascending=False).head(20)

plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance, x="importance", y="feature")
plt.title("Top 20 Important Features")
plt.tight_layout()
plt.show()

8. Submission

In [None]:
print("=== Generating Submission ===")

# Turn probabilities into 0/1 labels with the threshold tuned on the
# out-of-fold predictions by the training cell. A hard-coded 0.5 here would
# overwrite the labels built from that tuned threshold. 0.5 remains only as a
# fallback when no tuned threshold exists in the session.
decision_threshold = globals().get('best_global_thr', 0.5)
print(f"Decision threshold: {decision_threshold:.2f}")
final_preds_labels = (test_preds > decision_threshold).astype(int)

submission = pd.DataFrame({
    'object_id': test_df['object_id'],
    'prediction': final_preds_labels
})

# Align with sample_submission when it is available
if sample_sub_path:
    sample_sub = pd.read_csv(sample_sub_path)
    # Put the rows in the same object_id order as the sample file
    submission = submission.set_index('object_id').reindex(sample_sub['object_id']).reset_index()
    # IDs present in the sample but absent from the test log get label 0
    submission['prediction'] = submission['prediction'].fillna(0).astype(int)

output_path = os.path.join(CFG.WORKING_DIR, 'submission.csv')
submission.to_csv(output_path, index=False)

print(f"Submission saved to: {output_path}")
print(submission.head())
print(submission.head())