In [None]:
# Install required Python libraries (Google Colab usually has these pre-installed)
!pip install pandas numpy scikit-learn matplotlib seaborn --quiet


In [None]:
# Colab-only step: bring the two competition CSVs into the working directory.
# The loading cell later reads them by the bare file names "hacktrain.csv" and
# "hacktest.csv", so the uploaded files must carry exactly those names.
from google.colab import files
uploaded = files.upload()  # Upload hacktrain.csv and hacktest.csv


Saving hacktest.csv to hacktest.csv
Saving hacktrain.csv to hacktrain.csv


In [None]:
import pandas as pd
import numpy as np

# Read both competition CSVs from the working directory (train first, then test)
train, test = (pd.read_csv(csv_name) for csv_name in ("hacktrain.csv", "hacktest.csv"))

# Quick sanity check: frame dimensions and the distinct target labels
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target classes: {train['class'].unique()}")
train.head()


Train shape: (8000, 30)
Test shape: (2845, 29)
Target classes: ['water' 'forest' 'impervious' 'farm' 'grass' 'orchard']


Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [None]:
# Select the NDVI feature columns by their '_N' suffix. Matching on the suffix
# (instead of a substring test) keeps any column that merely contains '_N'
# somewhere inside its name out of the feature list.
ndvi_cols = [col for col in train.columns if col.endswith('_N')]
print(f"NDVI columns: {len(ndvi_cols)} found.")


NDVI columns: 27 found.


In [None]:
# Impute gaps in the NDVI columns with per-column medians (robust to outliers).
# The medians are taken from the training frame and applied to both frames.
ndvi_medians = train[ndvi_cols].median()
train[ndvi_cols] = train[ndvi_cols].fillna(ndvi_medians)
test[ndvi_cols] = test[ndvi_cols].fillna(ndvi_medians)


In [None]:
# Row-wise summary statistics over the NDVI columns, added to both frames.
# Maps each new column name to the DataFrame reduction that produces it.
ndvi_summaries = {
    'ndvi_mean': 'mean',
    'ndvi_std': 'std',
    'ndvi_max': 'max',
    'ndvi_min': 'min',
}
for df in (train, test):
    # Select the NDVI columns once; the summary columns added below are not part of it
    ndvi_block = df[ndvi_cols]
    for new_col, reducer in ndvi_summaries.items():
        df[new_col] = getattr(ndvi_block, reducer)(axis=1)

# Model inputs: the NDVI columns followed by the four summary columns
feature_cols = ndvi_cols + list(ndvi_summaries)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the string class labels as integer codes; `le` is kept so that
# predictions can be mapped back to label names later.
le = LabelEncoder()
train['target'] = le.fit_transform(train['class'])

X = train[feature_cols]
y = train['target']

# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions the same in both parts, so the validation score is not skewed by
# a class being under- or over-represented in the hold-out set by chance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features using statistics learned from the training split only.
#
# The raw rows are re-selected from `X` through the index of the split labels
# rather than read from `X_train` / `X_val`, because those two names are
# overwritten below with scaled arrays. Reading them directly would make a
# re-run of this cell fit the scaler on already-scaled data, which in turn
# would leave `X_test` effectively unscaled.
# NOTE(review): relies on `train` keeping a unique row index (the default
# produced by pd.read_csv) so that `.loc` returns exactly the split rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X.loc[y_train.index])
X_val = scaler.transform(X.loc[y_val.index])
X_test = scaler.transform(test[feature_cols])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Multinomial logistic regression as the baseline classifier.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver the multinomial
# formulation is already what gets fitted for a multi-class target.
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

# Score the fitted model on the hold-out split
val_preds = model.predict(X_val)
print("✅ Validation Accuracy:", accuracy_score(y_val, val_preds))


✅ Validation Accuracy: 0.915


In [None]:
# Predict encoded classes for the test rows, then map them back to label names
test_preds = model.predict(X_test)
test_labels = le.inverse_transform(test_preds)

# Two-column submission frame: the test IDs alongside the predicted labels
submission = test[['ID']].assign(**{'class': test_labels})

# Write the file to the working directory, leaving the row index out
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created.")


✅ submission.csv created.


In [None]:
# Colab-only step: hand "submission.csv" (the file name the submission cell
# writes) to Colab's download helper.
# The import is repeated here so this cell can be run on its own.
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>