In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load datasets
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")

# Remove unnamed column if exists
train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]

# Get NDVI columns
ndvi_columns = [col for col in train_df.columns if col.endswith('_N')]

# Feature Engineering
def add_features(df):
    df['ndvi_mean'] = df[ndvi_columns].mean(axis=1)
    df['ndvi_std'] = df[ndvi_columns].std(axis=1)
    df['ndvi_min'] = df[ndvi_columns].min(axis=1)
    df['ndvi_max'] = df[ndvi_columns].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    df['ndvi_median'] = df[ndvi_columns].median(axis=1)
    df['ndvi_skew'] = df[ndvi_columns].skew(axis=1)
    df['ndvi_trend'] = df[ndvi_columns].iloc[:, -1] - df[ndvi_columns].iloc[:, 0]
    
    # Seasonal averages
    df['spring_mean'] = df[[col for col in ndvi_columns if '03' in col or '04' in col]].mean(axis=1)
    df['summer_mean'] = df[[col for col in ndvi_columns if '06' in col or '07' in col]].mean(axis=1)
    df['winter_mean'] = df[[col for col in ndvi_columns if '11' in col or '12' in col]].mean(axis=1)
    
    return df

# Add features to train/test
train_df = add_features(train_df)
test_df = add_features(test_df)

# Combine features
feature_cols = ndvi_columns + [
    'ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max',
    'ndvi_range', 'ndvi_median', 'ndvi_skew', 'ndvi_trend',
    'spring_mean', 'summer_mean', 'winter_mean'
]
X = train_df[feature_cols]
X_test = test_df[feature_cols]
y = train_df['class']

# Remove low-variance columns
low_var_cols = X.columns[X.std() < 0.01]
X.drop(columns=low_var_cols, inplace=True)
X_test.drop(columns=low_var_cols, inplace=True)

# Impute missing values
imputer = KNNImputer(n_neighbors=3)
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# Normalize (important for PCA & logistic regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# PCA for noise reduction
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split for validation check
X_train, X_val, y_train, y_val = train_test_split(X_pca, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Train Logistic Regression
model = LogisticRegression(multi_class='multinomial', solver='saga', C=10, max_iter=2000, random_state=42)
model.fit(X_train, y_train)

# Validation Accuracy
y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("ðŸ§ª Validation Accuracy:", round(val_acc * 100, 2), "%")

# Final prediction on test set
y_test_pred = model.predict(X_test_pca)
y_test_labels = le.inverse_transform(y_test_pred)

# Save submission
submission = pd.DataFrame({'ID': test_df['ID'], 'class': y_test_labels})
submission.to_csv("ndvi_submission_new.csv", index=False)
print("âœ… Submission file saved as: ndvi_submission_new.csv")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=low_var_cols, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=low_var_cols, inplace=True)


ðŸ§ª Validation Accuracy: 89.94 %
âœ… Submission file saved as: ndvi_submission_new.csv
