# In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from tqdm import tqdm
# Load the training and test tables from CSV files in the working directory.
train = pd.read_csv('hacktrain.csv')
test = pd.read_csv('hacktest.csv')

# Collect the NDVI time-series columns: every training column whose name
# contains the substring '_N'. The list is built from the training frame only
# and is then applied to the test frame as well.
# NOTE(review): this is a case-sensitive substring match, so any unrelated
# column containing '_N' would be picked up too, and the CSV column order is
# presumably chronological — confirm both against the actual data.
ndvi_cols = [col for col in train.columns if '_N' in col]

# Time-series preprocessing function
def preprocess_ndvi(df, window=3, cols=None):
    """Fill gaps in each row's NDVI series and smooth it with a rolling mean.

    Every row is treated as one series laid out across ``cols`` in column
    order. Missing values are interpolated along the row in both directions
    (so leading and trailing gaps are filled as well), and each value is then
    replaced by the mean of itself and up to ``window - 1`` preceding values.

    Args:
        df: Frame containing the series columns; it is not modified.
        window: Size of the trailing rolling window, in columns.
        cols: Names of the series columns, in series order. Defaults to the
            module-level ``ndvi_cols`` list.

    Returns:
        A copy of ``df`` whose ``cols`` hold the interpolated, smoothed values.
    """
    if cols is None:
        cols = ndvi_cols
    cols = list(cols)

    df = df.copy()

    # Interpolation along each row (axis=1), filling edge gaps too.
    filled = df[cols].interpolate(axis=1, limit_direction='both')

    # rolling() walks down the index, so transpose to put the series axis
    # there, smooth, and transpose back. min_periods=1 keeps the first
    # columns defined even though their window is not yet full.
    smoothed = filled.T.rolling(window=window, min_periods=1).mean().T

    # Assign positionally so the result lands in the same cells regardless of
    # index/column alignment.
    df[cols] = smoothed.to_numpy()
    return df
# Run the gap-filling/smoothing step on both data sets (train first, then test)
train, test = (preprocess_ndvi(frame) for frame in (train, test))
# Advanced feature engineering
def create_features(df, cols=None):
    """Add per-row summary features computed from the NDVI series columns.

    Each row is treated as one series laid out across ``cols`` in column
    order. The following columns are added:

    * ``ndvi_mean``, ``ndvi_std``, ``ndvi_max``, ``ndvi_min``, ``ndvi_median``,
      ``ndvi_skew``, ``ndvi_kurtosis``: row-wise pandas statistics.
    * ``ndvi_amplitude``: ``ndvi_max - ndvi_min``.
    * ``ndvi_slope``: slope of a degree-1 least-squares fit of the series
      against the column positions ``0 .. len(cols) - 1``.
    * ``ndvi_peak_count``: number of columns whose value is greater than the
      value in the preceding column. Despite the name this counts increases,
      not local maxima; the name is kept so downstream feature names do not
      change.

    Args:
        df: Frame containing the series columns; it is not modified.
        cols: Names of the series columns, in series order. Defaults to the
            module-level ``ndvi_cols`` list.

    Returns:
        A copy of ``df`` with the feature columns appended.
    """
    if cols is None:
        cols = ndvi_cols
    cols = list(cols)

    # Work on a copy so the caller's frame is left untouched, matching
    # preprocess_ndvi.
    df = df.copy()
    series = df[cols]

    # Basic stats
    df['ndvi_mean'] = series.mean(axis=1)
    df['ndvi_std'] = series.std(axis=1)
    df['ndvi_max'] = series.max(axis=1)
    df['ndvi_min'] = series.min(axis=1)
    df['ndvi_amplitude'] = df['ndvi_max'] - df['ndvi_min']

    # Distribution-shape features
    df['ndvi_median'] = series.median(axis=1)
    df['ndvi_skew'] = series.skew(axis=1)
    df['ndvi_kurtosis'] = series.kurtosis(axis=1)

    # Slope (linear trend). np.polyfit accepts a 2-D y with one data set per
    # column, so all rows are fitted in a single call instead of one Python
    # iteration per row; row 0 of the result holds the degree-1 coefficients.
    x = np.arange(len(cols))
    values = series.to_numpy(dtype=float)
    df['ndvi_slope'] = np.polyfit(x, values.T, 1)[0]

    # Count of step-to-step increases along the series (the first column has
    # no predecessor, so its diff is NaN and never counts).
    df['ndvi_peak_count'] = (series.diff(axis=1) > 0).sum(axis=1)
    return df
train = create_features(train)
test = create_features(test)

# Prepare data
# Features are every training column except the identifier, the target and a
# stray index column; errors='ignore' tolerates any of them being absent.
X = train.drop(columns=['ID', 'class', 'Unnamed: 0'], errors='ignore')
y = train['class']

# Select the test features by name, in training column order. This drops any
# extra test-only column and raises a KeyError naming the column if the test
# set lacks a training feature, instead of handing the model positionally
# misaligned data.
X_test = test.drop(columns=['ID', 'Unnamed: 0'], errors='ignore')
X_test = X_test[X.columns]

# Encode labels as integers 0..n_classes-1; `le` maps predictions back later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# LightGBM with Stratified K-Fold
params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_child_samples': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    # LightGBM only performs bagging when bagging_freq is non-zero; without
    # it the bagging_fraction above is silently ignored. Re-sample every
    # iteration.
    'bagging_freq': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbose': -1,
}
n_folds = 5
# Fixed random_state keeps the fold assignment reproducible between runs.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
# Out-of-fold class probabilities for the training rows (one row per sample).
oof_preds = np.zeros((len(X), len(le.classes_)))
# Running average of the per-fold class probabilities for the test rows.
test_preds = np.zeros((len(X_test), len(le.classes_)))
# Import early stopping callback
from lightgbm import early_stopping

# Train one model per stratified fold, keep its predictions for the held-out
# rows, and accumulate the average of the per-fold test predictions.
for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y_encoded), total=n_folds)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Handle class imbalance with "balanced" weights:
    #   n_samples / (n_present_classes * class_count).
    # np.unique returns the present classes together with their own counts,
    # so the pairs stay aligned even if a class is absent from this fold
    # (np.bincount would insert a zero count and shift the pairing).
    present_classes, label_pos, class_counts = np.unique(
        y_train, return_inverse=True, return_counts=True
    )
    balanced = len(y_train) / (len(present_classes) * class_counts)
    class_weights = dict(zip(present_classes, balanced))
    # label_pos maps every sample to its class's position in present_classes.
    sample_weights = balanced[label_pos]

    # Train model; n_estimators is an upper bound, early stopping on the
    # validation fold's multi_logloss decides the actual number of rounds.
    model = lgb.LGBMClassifier(**params, n_estimators=2000)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
        sample_weight=sample_weights,
    )

    # Store predictions: out-of-fold rows are written once each, test
    # probabilities are averaged over the folds.
    oof_preds[val_idx] = model.predict_proba(X_val)
    test_preds += model.predict_proba(X_test) / n_folds

# Generate submission
# Take the most probable class index per test row, map it back to the original
# label, and write the ID/label pairs to CSV without the frame index.
best_class_idx = np.argmax(test_preds, axis=1)
test_pred_labels = le.inverse_transform(best_class_idx)
submission = pd.DataFrame({'ID': test['ID'], 'class': test_pred_labels})
submission.to_csv('improved_submission.csv', index=False)

# Output: 100%|██████████| 5/5 [00:41<00:00,  8.36s/it]
