## Тренировка модели TabularNN
Реализовано:
- подготовка данных,
- проектирование структуры модели с использованием Pytorch,
- обучение на протяжении 120 эпох,
- проверка точности.
##### acc составила 0.8106
Приблизилась к бустингу, но есть еще куда расти и экспериментировать.

In [141]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim

from pickle import dump

In [142]:
train_path = 'train.csv'
test_path = 'test.csv'
model_path = 'tabularnn_model.pth'
scaler_path = 'scaler.pkl'
encoder_path = 'encoder.pkl'


In [143]:
df = pd.read_csv(train_path)
df.head(2)

Unnamed: 0.1,Unnamed: 0,tree_id,tree_dbh,curb_loc,health,spc_common,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,nta,census tract
0,582012,90515,4,OnCurb,Good,Japanese zelkova,NoDamage,No,No,No,No,No,No,No,No,No,MN13,91.0
1,660019,175666,5,OnCurb,Good,Japanese zelkova,NoDamage,No,No,No,No,No,No,No,No,No,BK76,573.0


In [144]:
col = ['root_stone', 'root_grate', 'root_other',
       'trunk_wire', 'trnk_light', 'trnk_other',
       'brch_light', 'brch_shoe', 'brch_other',
       'curb_loc', 'sidewalk', 'spc_common', 'nta']

In [146]:
label_encoders = {}
for c in col:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c])
    label_encoders[c] = le

scaler = StandardScaler()
df[['tree_dbh']] = scaler.fit_transform(df[['tree_dbh']])

dump(scaler, open(scaler_path, 'wb'))
dump(label_encoders, open(encoder_path, 'wb'))

In [147]:
col_ = col.copy()
col_.append('tree_dbh')
x =df[col_]

enc = LabelEncoder()
y = enc.fit_transform(df.health)

In [148]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

x_train = {col: torch.tensor(x_train[col].values, dtype=torch.float32) for col in x_train.columns}
x_test = {col: torch.tensor(x_test[col].values, dtype=torch.float32) for col in x_test.columns}
x = {col: torch.tensor(x[col].values, dtype=torch.float32) for col in x.columns}
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

In [149]:
class TabularNN(nn.Module):
    def __init__(self, num_embeddings, embedding_dims, num_numerical_features, hidden_size, output_size):
        super(TabularNN, self).__init__()

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_embeddings[col], embedding_dims[col])
            for col in num_embeddings
        })

        # Calculate the total size of embeddings + numerical features
        total_embedding_size = sum(embedding_dims.values())
        self.total_input_size = total_embedding_size + num_numerical_features

        # Fully connected layers
        self.fc1 = nn.Linear(self.total_input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, numerical_features, categorical_features):
        # Embed categorical features
        embedded_features = [
            self.embeddings[col](categorical_features[col].long())
            for col in self.embeddings
        ]
        embedded_features = torch.cat(embedded_features, dim=1)

        # Concatenate numerical and embedded categorical features
        combined_features = torch.cat([embedded_features, numerical_features[..., None]], dim=1)

        # Pass through fully connected layers
        x = self.relu(self.fc1(combined_features))
        x = self.relu(self.fc2(x))
        x = self.output(x)
        return self.softmax(x)

# Define model parameters
num_embeddings = {       # Number of unique categories for each feature
    'root_stone': 2, 'root_grate': 2, 'root_other': 2,
    'trunk_wire': 2, 'trnk_light': 2, 'trnk_other': 2,
    'brch_light': 2, 'brch_shoe': 2, 'brch_other': 2,
    'curb_loc': 2, 'sidewalk': 2, 'spc_common': 133, 'nta': 188
}
embedding_dims = {       # Embedding dimensions for each feature
    'root_stone': 2, 'root_grate': 2, 'root_other': 2,
    'trunk_wire': 2, 'trnk_light': 2, 'trnk_other': 2,
    'brch_light': 2, 'brch_shoe': 2, 'brch_other': 2,
    'curb_loc': 2, 'sidewalk': 2, 'spc_common': 16, 'nta': 16
}
num_numerical_features = 1  # Number of numerical features
hidden_size = 32  # Hidden layer size
output_size = 3  # Binary classification

# Initialize the model
model = TabularNN(num_embeddings, embedding_dims, num_numerical_features, hidden_size, output_size)

In [150]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()

    # Forward pass
    outputs = model(
        numerical_features=x_train['tree_dbh'][:],
        categorical_features=x_train
    )
    loss = criterion(outputs.squeeze(), y_train)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
        model.eval()
        with torch.no_grad():
            test_outputs = model(
                numerical_features=x_test['tree_dbh'],
                categorical_features=x_test
            )
            _, predicted = torch.max(test_outputs, 1)
            accuracy = (predicted == y_test).float().mean()

            print(f"Test Accuracy: {accuracy.item():.4f}")

  return self._call_impl(*args, **kwargs)


Epoch [10/100], Loss: 1.0233
Test Accuracy: 0.8111
Epoch [20/100], Loss: 0.9416
Test Accuracy: 0.8111
Epoch [30/100], Loss: 0.8431
Test Accuracy: 0.8111
Epoch [40/100], Loss: 0.7724
Test Accuracy: 0.8111
Epoch [50/100], Loss: 0.7488
Test Accuracy: 0.8111
Epoch [60/100], Loss: 0.7433
Test Accuracy: 0.8111
Epoch [70/100], Loss: 0.7419
Test Accuracy: 0.8111
Epoch [80/100], Loss: 0.7414
Test Accuracy: 0.8111
Epoch [90/100], Loss: 0.7411
Test Accuracy: 0.8111
Epoch [100/100], Loss: 0.7410
Test Accuracy: 0.8111


In [151]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()

    # Forward pass
    outputs = model(
        numerical_features=x['tree_dbh'][:],
        categorical_features=x
    )
    loss = criterion(outputs.squeeze(), y)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [10/20], Loss: 0.7409
Epoch [20/20], Loss: 0.7408


In [152]:
df = pd.read_csv(test_path)
df.head(2)

Unnamed: 0.1,Unnamed: 0,tree_id,tree_dbh,curb_loc,health,spc_common,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,nta,census tract
0,81641,293444,15,OnCurb,Good,honeylocust,Damage,Yes,No,No,No,No,No,Yes,No,No,BK26,314.0
1,184940,379735,30,OnCurb,Fair,Norway maple,Damage,Yes,No,No,No,No,No,No,No,No,QN54,156.0


In [153]:
y_test = enc.transform(df.health)
for c in col:
    le = label_encoders[c]
    df[c] = le.transform(df[c])
df[['tree_dbh']] = scaler.transform(df[['tree_dbh']])

test = df[col_]
test = {col: torch.tensor(test[col].values, dtype=torch.float32) for col in test.columns}
y_test = torch.tensor(y_test, dtype=torch.long)

In [154]:
model.eval()
with torch.no_grad():
    test_outputs = model(
        numerical_features=test['tree_dbh'],
        categorical_features=test
    )
    _, predicted = torch.max(test_outputs, 1)
    accuracy = (predicted == y_test).float().mean()

    print(f"Test Accuracy: {accuracy.item():.4f}")

Test Accuracy: 0.8106


  return self._call_impl(*args, **kwargs)


In [155]:
torch.save(model.state_dict(), model_path)