In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
import pandas as pd

# Read the training and test tables from the competition input directory.
train = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")
test = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# Collect the names of the NDVI columns, i.e. every training column whose
# name ends with '_N', preserving the column order of the training table.
is_ndvi_column = train.columns.str.endswith('_N')
data_columns = train.columns[is_ndvi_column].tolist()
#data_columns

In [3]:
import numpy as np

def create_ndvi_features(df, ndvi_cols):
    """Build per-row summary features from a set of NDVI columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain an ``'ID'`` column and every column listed
        in ``ndvi_cols``.
    ndvi_cols : list
        Labels of the NDVI columns. Their order defines what "first"/"last"
        mean and is used as the x-axis of the trend fit.
        NOTE(review): presumably chronological -- confirm against the data.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the columns ``ID``, ``ndvi_mean``,
        ``ndvi_std``, ``ndvi_min``, ``ndvi_max``, ``ndvi_range``,
        ``ndvi_median``, ``ndvi_q1``, ``ndvi_q3``, ``ndvi_cv``,
        ``ndvi_first``, ``ndvi_last``, ``ndvi_diff``, ``ndvi_slope`` and
        ``ndvi_nan_ratio``.
    """
    # Slice once and reuse instead of re-selecting the columns per feature.
    ndvi = df[ndvi_cols]

    feats = pd.DataFrame()
    feats['ID'] = df['ID']

    # Basic statistics
    feats['ndvi_mean'] = ndvi.mean(axis=1)
    feats['ndvi_std'] = ndvi.std(axis=1)
    feats['ndvi_min'] = ndvi.min(axis=1)
    feats['ndvi_max'] = ndvi.max(axis=1)
    feats['ndvi_range'] = feats['ndvi_max'] - feats['ndvi_min']

    # Quartiles and median
    feats['ndvi_median'] = ndvi.median(axis=1)
    feats['ndvi_q1'] = ndvi.quantile(0.25, axis=1)
    feats['ndvi_q3'] = ndvi.quantile(0.75, axis=1)

    # Coefficient of variation; the small epsilon avoids dividing by an
    # exactly-zero mean.
    feats['ndvi_cv'] = feats['ndvi_std'] / (feats['ndvi_mean'] + 1e-6)

    # First and last valid NDVI.
    # The label returned by first/last_valid_index() must be compared with
    # None explicitly: a perfectly valid label can be falsy (e.g. 0 or ''),
    # and a truthiness test would then wrongly produce NaN.
    def first_valid(row):
        label = row.first_valid_index()
        return row.loc[label] if label is not None else np.nan

    def last_valid(row):
        label = row.last_valid_index()
        return row.loc[label] if label is not None else np.nan

    feats['ndvi_first'] = ndvi.apply(first_valid, axis=1)
    feats['ndvi_last'] = ndvi.apply(last_valid, axis=1)
    feats['ndvi_diff'] = feats['ndvi_last'] - feats['ndvi_first']

    # Slope across time (trend): least-squares line through the non-missing
    # values, using the column position as x. The x-axis is identical for
    # every row, so it is built once outside the per-row function.
    x = np.arange(len(ndvi_cols))

    def slope(row):
        y = row.to_numpy(dtype=float)
        mask = ~np.isnan(y)
        if mask.sum() > 1:
            return np.polyfit(x[mask], y[mask], 1)[0]
        # Fewer than two valid points: no trend can be fitted.
        return 0.0

    feats['ndvi_slope'] = ndvi.apply(slope, axis=1)

    # Missing value ratio
    feats['ndvi_nan_ratio'] = ndvi.isna().sum(axis=1) / len(ndvi_cols)

    return feats


In [4]:
# Build the engineered NDVI feature tables for both data sets.
train_feats = create_ndvi_features(train, data_columns)
test_feats = create_ndvi_features(test, data_columns)

# Only the last expression of a cell is rendered automatically, so a bare
# `train_feats` in the middle of the cell shows nothing. Display a preview of
# the training features explicitly (display() is provided by the notebook
# runtime), then leave `test_feats` as the final expression so it is rendered
# as the cell output.
display(train_feats.head())
test_feats

Unnamed: 0,ID,ndvi_mean,ndvi_std,ndvi_min,ndvi_max,ndvi_range,ndvi_median,ndvi_q1,ndvi_q3,ndvi_cv,ndvi_first,ndvi_last,ndvi_diff,ndvi_slope,ndvi_nan_ratio
0,1,3343.529778,2477.036970,340.949,7466.420,7125.471,3097.110,840.498,5567.45500,0.740845,7466.4200,6639.760,-826.6600,-58.403572,0.0
1,2,3750.664704,2514.860621,535.296,7425.840,6890.544,3298.110,1152.290,6061.24500,0.670511,7235.2600,842.101,-6393.1590,-68.965761,0.0
2,3,3790.384667,2807.079716,522.798,7644.430,7121.632,2206.100,1107.250,6949.45500,0.740579,7425.0800,831.441,-6593.6390,-73.940891,0.0
3,4,3242.310704,2627.535640,465.979,7128.420,6662.441,1731.620,967.842,6094.60000,0.810390,7119.1200,6883.610,-235.5100,-49.364616,0.0
4,5,4468.600704,2980.248232,550.508,8130.260,7579.752,4345.240,1448.030,7685.10000,0.666931,7519.5500,1336.920,-6182.6300,-78.656631,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,2841,-1997.699207,1786.044716,-5611.400,491.678,6103.078,-2257.890,-2684.515,-95.75030,-0.894051,-1673.7400,356.233,2029.9730,63.221354,0.0
2841,2842,-2318.400781,1984.884216,-5648.660,556.093,6204.753,-2168.190,-4031.820,-254.77515,-0.856144,-96.8233,-4316.580,-4219.7567,8.944875,0.0
2842,2843,-1640.061593,1724.990648,-5010.320,987.461,5997.781,-1713.400,-2995.260,49.47800,-1.051784,-2364.6000,-1170.750,1193.8500,41.066144,0.0
2843,2844,-1170.447741,1782.525273,-5574.510,973.953,6548.463,-317.092,-2931.085,363.84150,-1.522943,-3004.6300,460.419,3465.0490,19.546456,0.0


In [5]:
# Count the missing values in each engineered training feature column.
train_feats.isna().sum()

ID                0
ndvi_mean         0
ndvi_std          0
ndvi_min          0
ndvi_max          0
ndvi_range        0
ndvi_median       0
ndvi_q1           0
ndvi_q3           0
ndvi_cv           0
ndvi_first        0
ndvi_last         0
ndvi_diff         0
ndvi_slope        0
ndvi_nan_ratio    0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler

# Feature matrices: every engineered column except the row identifier.
X_train_raw = train_feats.drop(labels='ID', axis='columns')
X_test_raw = test_feats.drop(labels='ID', axis='columns')

# Standardise the features. The scaler learns its statistics from the
# training features only and the same transformation is applied to the test
# features.
# NOTE(review): the scaler is fitted on all training rows, including the rows
# that the later train/validation split holds out -- consider fitting it after
# the split so the validation score is not influenced by held-out rows.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Map the string class labels in train['class'] to integer codes; `le` is kept
# so predictions can be mapped back to the original labels later.
le = LabelEncoder().fit(train['class'])
y = le.transform(train['class'])

# Hold out 20% of the rows for validation, stratified on the encoded target
# and seeded for reproducibility.
split_parts = train_test_split(
    X_train_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_final, X_val, y_train_final, y_val = split_parts


In [8]:
from sklearn.metrics import accuracy_score



In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize logistic regression model.
# `multi_class='multinomial'` is intentionally no longer passed: the argument
# is deprecated in recent scikit-learn releases (FutureWarning, scheduled for
# removal), and with the 'lbfgs' solver a multinomial model is already what is
# fitted for targets with three or more classes.
# NOTE(review): if the target ever had only two classes this would fit a plain
# binary logistic regression instead -- confirm the class count.
model = LogisticRegression(
    solver='lbfgs',
    C=0.1,
    max_iter=1000,
    random_state=42
)

# Train the model on the training part of the split
model.fit(X_train_final, y_train_final)

# Accuracy on the rows the model was fitted on
y_train_pred = model.predict(X_train_final)
train_accuracy = accuracy_score(y_train_final, y_train_pred)

print("Training Accuracy:", train_accuracy)

# Accuracy on the held-out validation rows
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)


Training Accuracy: 0.87171875
Validation Accuracy: 0.86625


In [10]:
# Predict encoded class indices for the scaled test features, then map them
# back to the original label strings with the fitted label encoder.
y_test_pred = model.predict(X_test_scaled)
y_test_labels = le.inverse_transform(y_test_pred)

# Take the identifiers from test['ID'] when that column exists; otherwise fall
# back to the row index.
if 'ID' in test.columns:
    ID_col = test['ID']
else:
    ID_col = test.index

# Assemble the two-column submission table and write it without the index.
submission = pd.DataFrame({'ID': ID_col, 'class': y_test_labels})
submission.to_csv('submission.csv', index=False)

# Optional: view the last few rows as a quick sanity check
print(submission.tail())


        ID  class
2840  2841  water
2841  2842  water
2842  2843  water
2843  2844  water
2844  2845  water
