In [1]:
from typing import Tuple

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [2]:
# Read the Covertype table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="covtype.csv")

In [3]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [4]:
# Quick structural overview of the loaded frame: size, column names, dtypes,
# a sample of rows, per-column cardinality and the pandas summary report.
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
print("\nFirst few rows:")
print(df.head())
print("\nUnique values per column:")
print(df.nunique())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

Dataset shape: (581012, 55)

Column names:
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40', 'Cover_Type']

Data types:
Elevation                      

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# The Covertype dataset has:
# - 54 feature columns: 10 continuous measurements followed by binary
#   Wilderness_Area* / Soil_Type* indicator columns
# - 1 target column (Cover_Type) with values 1-7

# Separate features and target
X = df.iloc[:, :-1]  # All columns except last
y = df.iloc[:, -1]   # Last column is Cover_Type (target)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target classes: {sorted(y.unique())}")
print(f"Target value counts:\n{y.value_counts().sort_index()}")

# Split row positions BEFORE fitting the scaler. Fitting on the full table
# would let validation-set means/variances leak into the training features.
train_idx, val_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)

# Standardize features using statistics estimated from the training rows only,
# then apply that same transform to every row.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Materialize the train/validation arrays from the shared index split
X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
y_train, y_val = y.values[train_idx], y.values[val_idx]

print(f"\nTrain set: {X_train.shape}")
print(f"Val set: {X_val.shape}")

Features shape: (581012, 54)
Target shape: (581012,)
Target classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]
Target value counts:
Cover_Type
1    211840
2    283301
3     35754
4      2747
5      9493
6     17367
7     20510
Name: count, dtype: int64

Train set: (464809, 54)
Val set: (116203, 54)


In [6]:
class CovtypeDataset(Dataset):
    """In-memory dataset pairing feature rows with zero-based class labels.

    Args:
        x: Array-like of feature rows; stored as a ``float32`` tensor.
        y: Array of integer labels, one per row of ``x``. It must support
            ``y - 1`` (e.g. a NumPy array). Labels are shifted down by one
            before being stored as a ``long`` tensor.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        # Shift labels down by one (1-based -> 0-based class indices)
        self.y = torch.tensor(y - 1, dtype=torch.long)
        # Fail early on misaligned inputs instead of a late indexing error
        if len(self.x) != len(self.y):
            raise ValueError(
                f"x and y must have the same number of rows, "
                f"got {len(self.x)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]


In [7]:
# Wrap each split in a dataset, then batch it. Only the training batches are
# shuffled; validation batches keep their original order.
train_ds, val_ds = CovtypeDataset(X_train, y_train), CovtypeDataset(X_val, y_val)

train_loader = DataLoader(dataset=train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=256, shuffle=False)

In [8]:
def build_model(
    input_dim: int,
    num_classes: int = 7,
    hidden_dims: Tuple[int, ...] = (128, 64),
    dropout: float = 0.2,
) -> nn.Module:
    """Build a fully connected classifier as an ``nn.Sequential``.

    Each hidden width contributes a ``Linear -> ReLU -> Dropout`` group; a
    final ``Linear`` layer maps to ``num_classes`` raw scores (no softmax).
    With the defaults the result is the original 128 -> 64 -> num_classes
    network with the same layer order.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output scores.
        hidden_dims: Width of each hidden layer, in order. May be empty, in
            which case the model is a single linear layer.
        dropout: Dropout probability applied after every hidden activation.

    Returns:
        The assembled ``nn.Sequential`` model.
    """
    layers = []
    in_features = input_dim
    for width in hidden_dims:
        layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
        in_features = width
    # Output layer produces unnormalised class scores
    layers.append(nn.Linear(in_features, num_classes))
    return nn.Sequential(*layers)

In [10]:
# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation, dropout masks and the default (generator-less) DataLoader
# shuffling are repeatable when this cell and the training loop are re-run.
torch.manual_seed(42)

# Network width is taken from the data; 7 outputs = one score per cover type
input_dim = X_train.shape[1]
model = build_model(input_dim, num_classes=7)
# CrossEntropyLoss consumes raw scores together with integer class indices
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [11]:
def train_one_epoch(model, train_loader, criterion, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The unweighted mean of the per-batch loss values, as a float.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    loss_sum = 0.0
    num_batches = 0

    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        num_batches += 1

    # Give a clear error instead of an opaque ZeroDivisionError
    if num_batches == 0:
        raise ValueError("train_loader yielded no batches; cannot compute a mean loss")
    return loss_sum / num_batches

In [12]:
from sklearn.metrics import accuracy_score

def evaluate(model, val_loader):
    """Compute classification accuracy of ``model`` over ``val_loader``.

    Correct predictions are counted with tensor operations, so no per-sample
    Python lists are built and batches need not live on the CPU.

    Args:
        model: Module producing one row of class scores per sample; it is
            switched to evaluation mode.
        val_loader: Iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` holds integer class indices.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()
    num_correct = 0
    num_seen = 0
    # No gradients are needed for scoring
    with torch.no_grad():
        for inputs, labels in val_loader:
            preds = torch.argmax(model(inputs), dim=1)
            num_correct += (preds == labels).sum().item()
            num_seen += labels.numel()
    if num_seen == 0:
        raise ValueError("val_loader yielded no samples; accuracy is undefined")
    return num_correct / num_seen

In [13]:
epochs = 20
# Per-epoch histories of the mean training loss and the validation accuracy
train_losses, val_accuracies = [], []

for epoch in range(epochs):
    # One optimisation pass, then a validation check on the updated model
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)

    train_losses.append(train_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs} | Train loss: {train_loss:.4f} | Val acc: {val_acc:.4f}")

Epoch 1/20 | Train loss: 0.6173 | Val acc: 0.7753
Epoch 2/20 | Train loss: 0.5273 | Val acc: 0.7969
Epoch 3/20 | Train loss: 0.4961 | Val acc: 0.8123
Epoch 4/20 | Train loss: 0.4773 | Val acc: 0.8184
Epoch 5/20 | Train loss: 0.4638 | Val acc: 0.8280
Epoch 6/20 | Train loss: 0.4528 | Val acc: 0.8324
Epoch 7/20 | Train loss: 0.4449 | Val acc: 0.8407
Epoch 8/20 | Train loss: 0.4381 | Val acc: 0.8428
Epoch 9/20 | Train loss: 0.4324 | Val acc: 0.8442
Epoch 10/20 | Train loss: 0.4282 | Val acc: 0.8467
Epoch 11/20 | Train loss: 0.4227 | Val acc: 0.8502
Epoch 12/20 | Train loss: 0.4188 | Val acc: 0.8535
Epoch 13/20 | Train loss: 0.4161 | Val acc: 0.8516
Epoch 14/20 | Train loss: 0.4129 | Val acc: 0.8528
Epoch 15/20 | Train loss: 0.4097 | Val acc: 0.8567
Epoch 16/20 | Train loss: 0.4073 | Val acc: 0.8582
Epoch 17/20 | Train loss: 0.4055 | Val acc: 0.8563
Epoch 18/20 | Train loss: 0.4041 | Val acc: 0.8615
Epoch 19/20 | Train loss: 0.4022 | Val acc: 0.8611
Epoch 20/20 | Train loss: 0.3998 | Val a