In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight


In [2]:
try:
    train_df = pd.read_csv("/kaggle/input/hackathon/hacktrain.csv")
    test_df = pd.read_csv("/kaggle/input/hackathon/hacktest.csv")
    test_ids = test_df['ID']
except FileNotFoundError:
    print("Ensure 'hacktrain.csv' and 'hacktest.csv' are in the same directory.")
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    test_ids = pd.Series()

In [3]:
def create_features(df):
 ndvi_cols = [col for col in df.columns if '_N' in col]
 ndvi_cols.sort()
 df[ndvi_cols] = df[ndvi_cols].ffill(axis=1).bfill(axis=1)
 time_indices = np.arange(len(ndvi_cols))
 df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
 df['ndvi_std'] = df[ndvi_cols].std(axis=1)
 df['ndvi_max'] = df[ndvi_cols].max(axis=1)
 df['ndvi_min'] = df[ndvi_cols].min(axis=1)
 df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
 df['ndvi_median'] = df[ndvi_cols].median(axis=1)
 df['ndvi_q25'] = df[ndvi_cols].quantile(0.25, axis=1)
 df['ndvi_q75'] = df[ndvi_cols].quantile(0.75, axis=1)
 df['ndvi_slope'] = df[ndvi_cols].apply(
        lambda row: np.polyfit(time_indices, row.values, 1)[0], axis=1
    )
 return df

In [4]:
if not train_df.empty and not test_df.empty:
    train_df_featured = create_features(train_df)
    test_df_featured = create_features(test_df)

In [5]:
label_encoder = LabelEncoder()
if 'class' in train_df_featured.columns:
        train_df_featured['class'] = label_encoder.fit_transform(train_df_featured['class'])

        X = train_df_featured.drop(columns=['class', 'ID'])
        y = train_df_featured['class']

        X_final_test = test_df_featured[X.columns]

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        X_final_test_scaled = scaler.transform(X_final_test)

        X_train, X_val, y_train, y_val = train_test_split(
            X_scaled, y, test_size=0.2, stratify=y, random_state=42
        )

        class_weights = dict(zip(
            np.unique(y_train),
            compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
        ))

        model = LogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            max_iter=2000,
            class_weight=class_weights,
            random_state=42
        )
        model.fit(X_train, y_train)

        print("--- Model Validation Report ---")
        y_val_pred = model.predict(X_val)
        print(classification_report(
            y_val, y_val_pred,
            target_names=label_encoder.classes_
        ))

        print("\n--- Generating Submission File ---")
        final_predictions_encoded = model.predict(X_final_test_scaled)
        final_predictions_decoded = label_encoder.inverse_transform(final_predictions_encoded)

        submission_df = pd.DataFrame({'ID': test_ids, 'class': final_predictions_decoded})
        submission_df.to_csv('submission.csv', index=False)

        print("\n'submission.csv' created successfully!")
        print("Final class distribution in submission file:")
        print(submission_df['class'].value_counts())
    

--- Model Validation Report ---
              precision    recall  f1-score   support

        farm       0.87      0.79      0.83       168
      forest       1.00      0.97      0.98      1232
       grass       0.60      0.82      0.70        39
  impervious       0.89      0.89      0.89       134
     orchard       0.12      0.50      0.19         6
       water       0.49      0.90      0.63        21

    accuracy                           0.94      1600
   macro avg       0.66      0.81      0.70      1600
weighted avg       0.96      0.94      0.94      1600


--- Generating Submission File ---

'submission.csv' created successfully!
Final class distribution in submission file:
class
forest    2084
water      761
Name: count, dtype: int64
