In [4]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.decomposition import PCA

# 2. Load data
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")

# 3. NDVI columns
ndvi_columns = [col for col in train_df.columns if col.endswith('_N')]

# 4. Combine data
train_df['is_train'] = 1
test_df['is_train'] = 0
test_df['class'] = np.nan
full_df = pd.concat([train_df, test_df], ignore_index=True)

# 5. Ensure NDVI columns are numeric
for col in ndvi_columns:
    full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

# 6. Interpolation for missing values
full_df[ndvi_columns] = full_df[ndvi_columns].interpolate(axis=1, limit_direction='both')

# 7. Rolling mean smoothing
full_df[ndvi_columns] = full_df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).mean()

# 8. NDVI Feature Engineering
def extract_ndvi_features(row):
    values = pd.to_numeric(row[ndvi_columns], errors='coerce').values
    return pd.Series({
        'ndvi_mean': np.mean(values),
        'ndvi_std': np.std(values),
        'ndvi_min': np.min(values),
        'ndvi_max': np.max(values),
        'ndvi_range': np.max(values) - np.min(values),
        'ndvi_q25': np.percentile(values, 25),
        'ndvi_q75': np.percentile(values, 75),
        'ndvi_median': np.median(values),
        'ndvi_trend': values[-1] - values[0] if not np.isnan(values[0]) and not np.isnan(values[-1]) else 0,
        'ndvi_skew': pd.Series(values).skew(),
        'ndvi_kurtosis': pd.Series(values).kurt(),
        'ndvi_slope': np.polyfit(np.arange(len(values))[~np.isnan(values)], values[~np.isnan(values)], 1)[0]
                     if np.sum(~np.isnan(values)) > 1 else 0
    })

ndvi_features = full_df.apply(extract_ndvi_features, axis=1)
full_df = pd.concat([full_df[['ID', 'class', 'is_train']], ndvi_features], axis=1)

# 9. Split into train and test
train_processed = full_df[full_df['is_train'] == 1].copy()
test_processed = full_df[full_df['is_train'] == 0].copy()

# 10. Encode target
label_encoder = LabelEncoder()
train_processed['class'] = label_encoder.fit_transform(train_processed['class'])

X = train_processed.drop(columns=['ID', 'class', 'is_train'])
y = train_processed['class']
X_test = test_processed.drop(columns=['ID', 'class', 'is_train'])

# 11. Handle missing values
X = X.fillna(X.mean())
X_test = X_test.fillna(X_test.mean())

# 12. Standard Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# 13. PCA for dimensionality reduction
pca = PCA(n_components=0.99, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 14. Model: Random Forest
model = RandomForestClassifier(n_estimators=200, max_depth=25, random_state=42)

# 15. Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_pca, y, cv=cv, scoring='accuracy')
print(f"CV Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.4f}")

# 16. Train on full training data
model.fit(X_pca, y)
y_pred = model.predict(X_test_pca)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# 17. Submission file
submission_df = pd.DataFrame({
    'ID': test_processed['ID'].astype(int),
    'class': y_pred_labels
})
submission_df.to_csv("submission_boosted.csv", index=False)
print("Submission saved as submission_boosted.csv")


  full_df[ndvi_columns] = full_df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).mean()


CV Accuracy: 0.86 ± 0.0038
Submission saved as submission_boosted.csv
