In [43]:
import numpy as np
import pandas as pd

# Read the training table from the CSV in the working directory.
df_train = pd.read_csv("hacktrain.csv")

# The label is the 'class' column; the feature matrix is every column whose
# name contains the substring '_N' (copied so later edits never touch df_train).
y = df_train.loc[:, 'class']
ndvi_cols = [column_name for column_name in df_train.columns if '_N' in column_name]
X = df_train.loc[:, ndvi_cols].copy()

In [44]:
# --- Denoising using a Rolling Mean ---
# A rolling mean smooths out short-term fluctuations (noise) and highlights longer-term trends.
# `window` is the number of consecutive observations averaged; 3 is a modest starting point.
# With min_periods=1 a value is emitted as soon as the window holds one non-NaN observation,
# so the first columns of each row and windows containing gaps still receive an average.
print("Applying a rolling mean to denoise the data...")
# The window slides across each row's columns in the order they are stored in X.
# `DataFrame.rolling(axis=1)` is deprecated since pandas 2.1, so transpose, roll down the
# rows, and transpose back; the result is the same as the former axis=1 call.
X_denoised = X.T.rolling(window=3, min_periods=1).mean().T

print("Denoising complete.")
X_denoised.head()

Applying a rolling mean to denoise the data...
Denoising complete.


Unnamed: 0,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,20150210_N,20150125_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,637.595,648.1315,-195.255667,-1049.240667,-936.162,-888.815333,-37.333,-554.9515,-347.7765,-1134.862,...,573.8855,-193.695,-1492.825,-906.170667,-837.676,267.138,211.328,-995.846,-1057.294,-983.101333
1,634.24,613.9725,-132.615,-901.468333,-794.637333,-483.502667,309.812667,-551.783333,-790.534667,-1356.171667,...,530.328,-133.6655,-779.6595,-479.753333,-46.822667,320.629667,354.236,-517.383333,-1129.894,-1028.828333
2,58.0174,-770.5713,-770.5713,-1325.895,-1052.63,-1308.63,-1564.63,-417.42,-1216.175,-1216.175,...,-1050.32,-328.629,-328.629,-429.589,-461.5895,-461.5895,336.9175,-916.453,-916.453,-1772.835
3,72.518,72.518,226.477,-438.247,-120.229667,-718.101667,-566.772333,-510.956,-983.029333,-1137.895333,...,-416.39,-325.255667,-829.163,-384.2245,-245.9615,-69.696667,337.767667,-538.735667,-949.52,-1512.0435
4,1136.44,1136.44,1136.44,1647.83,1791.815,1791.815,2047.39,2158.98,1700.925,-701.59,...,1347.625,1747.215,1071.704333,431.732667,-259.212667,174.133,855.988,854.856333,325.266333,-680.663667


In [45]:
# --- Imputation on Denoised Data ---
# Step 1: linear interpolation across each row's columns for gaps left after smoothing.
X_imputed = X_denoised.interpolate(method='linear', axis=1)

# Step 2: anything still missing falls back to the mean of its column.
column_means = X_imputed.mean()
X_imputed = X_imputed.fillna(column_means)

# Step 3: count leftover NaNs once and report the outcome.
remaining_nans = X_imputed.isnull().sum().sum()
if remaining_nans != 0:
    print(f"Warning: {remaining_nans} NaNs remain.")
else:
    print("Successfully imputed all missing values from the denoised data.")

Successfully imputed all missing values from the denoised data.


In [46]:
# --- Feature Engineering on Cleaned Data ---
print("Engineering features from denoised data...")

# Per-row summary statistics are computed over the NDVI columns only.
ndvi_block = X_imputed[ndvi_cols]
row_max = ndvi_block.max(axis=1)
row_min = ndvi_block.min(axis=1)

# `assign` returns a new frame, so X_imputed itself is left untouched; the
# keyword order fixes the order in which the new columns are appended.
X_featured = X_imputed.assign(
    mean_ndvi=ndvi_block.mean(axis=1),
    std_ndvi=ndvi_block.std(axis=1),
    max_ndvi=row_max,
    min_ndvi=row_min,
    range_ndvi=row_max - row_min,
    median_ndvi=ndvi_block.median(axis=1),
)

print("Feature engineering complete.")
X_featured.head()

Engineering features from denoised data...
Feature engineering complete.


Unnamed: 0,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,20150210_N,20150125_N,...,20140218_N,20140202_N,20140117_N,20140101_N,mean_ndvi,std_ndvi,max_ndvi,min_ndvi,range_ndvi,median_ndvi
0,637.595,648.1315,-195.255667,-1049.240667,-936.162,-888.815333,-37.333,-554.9515,-347.7765,-1134.862,...,211.328,-995.846,-1057.294,-983.101333,-289.963833,685.41396,648.1315,-1492.825,2140.9565,-291.258667
1,634.24,613.9725,-132.615,-901.468333,-794.637333,-483.502667,309.812667,-551.783333,-790.534667,-1356.171667,...,354.236,-517.383333,-1129.894,-1028.828333,-220.899212,589.823212,634.24,-1356.171667,1990.411667,-133.6655
2,58.0174,-770.5713,-770.5713,-1325.895,-1052.63,-1308.63,-1564.63,-417.42,-1216.175,-1216.175,...,336.9175,-916.453,-916.453,-1772.835,-568.494217,687.845737,679.491,-1772.835,2452.326,-461.5895
3,72.518,72.518,226.477,-438.247,-120.229667,-718.101667,-566.772333,-510.956,-983.029333,-1137.895333,...,337.767667,-538.735667,-949.52,-1512.0435,-285.067438,596.458149,733.578,-1512.0435,2245.6215,-325.255667
4,1136.44,1136.44,1136.44,1647.83,1791.815,1791.815,2047.39,2158.98,1700.925,-701.59,...,855.988,854.856333,325.266333,-680.663667,803.983568,893.322059,2158.98,-880.4,3039.38,855.988


In [54]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Encode the class labels as integers; label_encoder.classes_ keeps the original names
# so predictions can be mapped back later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for validation, stratified on the label so every class
# keeps the same proportion in both splits; random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_featured, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Initialize and train the Logistic Regression model with regularization.
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a multinomial
# model when there are three or more classes.
# C=1.0 is the default. Lower values (e.g., 0.1) increase regularization.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,  # Regularization parameter
    max_iter=1000, # Keep max_iter high to ensure convergence
)
model.fit(X_train, y_train)

# --- Evaluation ---
# Score the held-out split and print overall accuracy plus per-class metrics.
y_pred_val = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f"\nValidation Accuracy: {accuracy:.4f}\n")
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_pred_val, target_names=label_encoder.classes_))


Validation Accuracy: 0.8363

Classification Report on Validation Set:
              precision    recall  f1-score   support

        farm       0.60      0.29      0.39       168
      forest       0.86      0.97      0.91      1232
       grass       0.71      0.26      0.38        39
  impervious       0.78      0.53      0.63       134
     orchard       0.50      0.17      0.25         6
       water       0.80      0.57      0.67        21

    accuracy                           0.84      1600
   macro avg       0.71      0.46      0.54      1600
weighted avg       0.82      0.84      0.81      1600



In [None]:
# Load test data
df_test = pd.read_csv("hacktest.csv")
ID = df_test['ID']
X_test = df_test[ndvi_cols].copy()

# --- Apply FULL Preprocessing Pipeline to Test Data ---
# Every step mirrors the training preprocessing so the model sees the same feature layout.

# 1. Denoise: 3-wide rolling mean across each row's columns.
#    `DataFrame.rolling(axis=1)` is deprecated since pandas 2.1, so transpose, roll down
#    the rows, and transpose back; the result is the same as the former axis=1 call.
X_test_denoised = X_test.T.rolling(window=3, min_periods=1).mean().T

# 2. Impute: interpolate within each row, then fall back to per-column means taken from
#    the TRAINING features. Only the NDVI columns exist at this point, so the means are
#    restricted to them rather than relying on index alignment to drop the extras.
X_test_imputed = X_test_denoised.interpolate(method='linear', axis=1)
X_test_imputed = X_test_imputed.fillna(X_featured[ndvi_cols].mean())

# 3. Feature Engineering: per-row summary statistics over the NDVI columns,
#    appended in the same order as for the training frame.
X_test_featured = X_test_imputed.copy()
X_test_featured['mean_ndvi'] = X_test_featured[ndvi_cols].mean(axis=1)
X_test_featured['std_ndvi'] = X_test_featured[ndvi_cols].std(axis=1)
X_test_featured['max_ndvi'] = X_test_featured[ndvi_cols].max(axis=1)
X_test_featured['min_ndvi'] = X_test_featured[ndvi_cols].min(axis=1)
X_test_featured['range_ndvi'] = X_test_featured['max_ndvi'] - X_test_featured['min_ndvi']
X_test_featured['median_ndvi'] = X_test_featured[ndvi_cols].median(axis=1)


# --- Make Predictions ---
# Predict encoded labels, then map them back to the original class names.
test_predictions = model.predict(X_test_featured)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

# --- Create Submission File ---
# Two columns, ID and predicted class, written without the DataFrame index.
submission_df = pd.DataFrame({'ID': ID, 'class': test_predictions_decoded})
submission_df.to_csv("submission.csv", index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print(submission_df.head())


Submission file 'submission.csv' created successfully!
   ID    class
0   1   forest
1   2   forest
2   3  orchard
3   4     farm
4   5   forest
