# NDVI-based Land Cover Classification

### Section 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')

### Section 2: Custom Transformer for NDVI Feature Engineering

In [2]:
class NDVIFeatureEngineer(BaseEstimator, TransformerMixin):
    """Impute an NDVI series and append per-row summary features.

    ``fit`` learns a per-column median for the NDVI columns. ``transform``
    fills missing readings with those medians and adds the following
    ``ndvi_*`` columns to a copy of the input frame: mean, median, standard
    deviation, skewness, kurtosis, mean of the last three columns, mean of
    each of four consecutive column groups, count and ratio of originally
    missing readings, a "stability" ratio and a "seasonality" spread.

    Parameters
    ----------
    ndvi_columns : list of str
        Names of the NDVI columns, in the order the series should be read.
        "Last three" and the four column groups follow this order. At least
        four columns are required so that every group is non-empty.
    """

    def __init__(self, ndvi_columns):
        self.ndvi_columns = ndvi_columns
        self.imputer = SimpleImputer(strategy='median')

    def _quartile_columns(self):
        """Split ``ndvi_columns`` into four consecutive groups.

        The first three groups hold ``n // 4`` columns each; the last group
        takes whatever remains.

        Raises
        ------
        ValueError
            If fewer than four columns are configured. The leading groups
            would then be empty and their means would be all-NaN.
        """
        columns = list(self.ndvi_columns)
        size = len(columns) // 4
        if size == 0:
            raise ValueError(
                "NDVIFeatureEngineer needs at least 4 NDVI columns to build "
                f"the quartile features, got {len(columns)}."
            )
        groups = [columns[i * size:(i + 1) * size] for i in range(3)]
        groups.append(columns[3 * size:])
        return groups

    def fit(self, X, y=None):
        """Learn the per-column medians used to fill missing NDVI values.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column named in ``ndvi_columns``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If fewer than four NDVI columns are configured.
        """
        # Fail early with a clear message rather than emitting NaN features
        # that only blow up later inside the downstream model.
        self._quartile_columns()
        self.imputer.fit(X[self.ndvi_columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with imputed NDVI values and new features.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column named in ``ndvi_columns``. It is
            not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` whose NDVI columns are imputed and which carries
            the additional ``ndvi_*`` feature columns.

        Raises
        ------
        ValueError
            If fewer than four NDVI columns are configured.
        """
        quartile_groups = self._quartile_columns()
        n = len(self.ndvi_columns)

        df = X.copy()
        # Impute missing NDVI values
        ndvi_df = pd.DataFrame(
            self.imputer.transform(df[self.ndvi_columns]),
            columns=self.ndvi_columns,
            index=df.index
        )
        df[self.ndvi_columns] = ndvi_df

        # Basic statistical features (computed on the imputed series)
        values = ndvi_df.values
        df['ndvi_mean'] = ndvi_df.mean(axis=1)
        df['ndvi_median'] = ndvi_df.median(axis=1)
        df['ndvi_std'] = ndvi_df.std(axis=1)
        df['ndvi_skew'] = skew(values, axis=1)
        df['ndvi_kurtosis'] = kurtosis(values, axis=1)

        # Recent behaviour: mean of the last three columns in series order
        df['ndvi_last3_mean'] = ndvi_df.iloc[:, -3:].mean(axis=1)

        # Quartile means
        for i, cols in enumerate(quartile_groups, start=1):
            df[f'ndvi_q{i}_mean'] = ndvi_df[cols].mean(axis=1)

        # Missing-count features: counted on the caller's frame, because
        # ``df`` no longer has gaps after imputation.
        df['ndvi_missing_count'] = (X[self.ndvi_columns].isnull()).sum(axis=1)
        df['ndvi_missing_ratio'] = df['ndvi_missing_count'] / n

        # Derived features. The 1e-6 offset avoids dividing by an exact zero
        # row mean.
        df['ndvi_stability'] = df['ndvi_last3_mean'] / (df['ndvi_mean'] + 1e-6)
        df['ndvi_seasonality'] = df[[f'ndvi_q{i+1}_mean' for i in range(4)]].std(axis=1)

        return df

### Section 3: Load Data

In [3]:
# Read both CSV files, discarding the stray index column when it is present.
train_df = pd.read_csv("hacktrain.csv")
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = pd.read_csv("hacktest.csv")
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

# NDVI readings are the columns whose names end in "_N"; they are ordered by
# name before being handed to the feature engineer.
ndvi_columns = sorted(name for name in train_df.columns if name.endswith('_N'))

# The imputation medians are learned from the training frame only and then
# reused unchanged on the test frame.
fe = NDVIFeatureEngineer(ndvi_columns)
train_df = fe.fit_transform(train_df)
test_df = fe.transform(test_df)

# Only the engineered "ndvi_*" columns are fed to the model.
engineered_features = list(
    filter(lambda name: name.startswith('ndvi_'), train_df.columns)
)
X_train, X_test = train_df[engineered_features], test_df[engineered_features]
y_train = train_df['class']

### Section 4: Build Pipeline

In [4]:
# Three-stage model: robust scaling -> L1-based feature selection ->
# probability-calibrated logistic regression.
pipeline = Pipeline([
    ('scaler', RobustScaler()),
    # An L1-penalised logistic regression picks the features to keep.
    ('selector', SelectFromModel(
        LogisticRegression(
            penalty='l1', solver='liblinear',
            C=0.6, class_weight='balanced', random_state=42,
            max_iter=1000
        )
    )),
    ('classifier', CalibratedClassifierCV(
        # `multi_class` is deliberately not passed: the argument is
        # deprecated in scikit-learn 1.5+, and with solver='lbfgs' the
        # default already uses the multinomial formulation for targets with
        # three or more classes.
        # NOTE(review): for a two-class target the default is a plain binary
        # logistic model instead -- confirm `class` has at least 3 labels.
        estimator=LogisticRegression(
            penalty='l2', solver='lbfgs',
            C=0.2, max_iter=2000,
            random_state=42
        ),
        cv=5, ensemble=False
    ))
])

### Section 5: Evaluation and Submission

In [5]:
# Shuffled, stratified 10-fold split with a fixed seed so runs are repeatable.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv,
)
print(f"10-Fold CV Accuracy: {scores.mean():.5f} ± {scores.std():.5f}")

# Refit on the full training set, then label the test rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# The submission holds the test IDs alongside the predicted class.
submission = test_df[['ID']].copy()
submission['class'] = y_pred
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

10-Fold CV Accuracy: 0.84400 ± 0.00483
Submission file saved as submission.csv
