In [5]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torchmetrics import Recall
import joblib
from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE

In [7]:
# Location of the training table; only the first 10 000 rows are read.
TRAIN_CSV_PATH = "C:/Users/vovad/Desktop/All_projects/Конкурс_Новосиб/dataset_novosib/train.csv"
df_train = pd.read_csv(TRAIN_CSV_PATH, nrows=10000)

In [None]:
# Show the loaded training frame as the cell output.
df_train

Так как классы представлены в данных очень неравномерно, добавляем синтетические примеры для малочисленных классов с помощью SMOTE.

In [None]:
# Oversample the minority classes with SMOTE up to a fixed number of rows per class.
# NOTE(review): oversampling happens before the train/validation split performed later
# in this notebook, so synthetic points interpolated from rows that end up in the
# validation set can land in the training set; validation scores may be optimistic.
x, y = df_train[['Easting','Northing','Height','Reflectance']], df_train['Class']

# Target row count per class label after resampling (labels are still the raw ones, incl. 64).
sampling_strategy = {4: 300000, 1: 300000, 5: 300000, 64: 300000}

# Fixed seed so the synthetic rows are identical on every Restart & Run All
# (same seed value as the train/validation split uses).
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_smote, y_smote = smote.fit_resample(x, y)

# Re-assemble features and labels into one frame, side by side.
df_train = pd.concat((x_smote, y_smote), ignore_index=False, axis=1)

Логарифмируем данные.

In [10]:
# Base-10 log of every raw feature, one column at a time.
df_train['Easting_log'] = np.log10(df_train['Easting'])
df_train['Northing_log'] = np.log10(df_train['Northing'])
df_train['Height_log'] = np.log10(df_train['Height'])
# Reflectance is shifted by 45 before the log — presumably to keep the argument
# positive; TODO confirm against the value range of the data.
df_train['Reflectance_log'] = np.log10(df_train['Reflectance'] + 45)

# Keep only the log features and move the label to the last column.
df_train = df_train.drop(columns=['Easting', 'Northing', 'Height', 'Reflectance'])
feature_columns = [col for col in df_train.columns if col != 'Class']
df_train = df_train[feature_columns + ['Class']]

Вычислим веса классов для функции потерь, чтобы нейросеть учитывала все классы равномерно. (Класс 64 переименуем в класс 2: CrossEntropyLoss ожидает метки в виде последовательных индексов 0…5.)

In [11]:
# Relabel class 64 as 2 so that the labels form the contiguous range 0..5.
df_train.loc[df_train["Class"] == 64, "Class"] = 2

# Number of rows per class label 0..5 (float so the divisions below are true divisions).
class_counts = np.array(
    [(df_train["Class"] == class_id).sum() for class_id in range(6)],
    dtype=np.float64,
)
total_nums = class_counts.sum()

# Inverse-frequency weight per class: 1 / (share of the class) / 2.
# A class with no rows gets weight 0 instead of raising ZeroDivisionError; it never
# appears as a target, so its weight does not influence the loss.
total_weight = np.zeros(len(class_counts), dtype=np.float64)
present = class_counts > 0
total_weight[present] = 1 / (class_counts[present] / total_nums) / 2

# CrossEntropyLoss expects the weights as a float32 tensor.
total_weight = torch.from_numpy(total_weight).float()

Разбиваем датасет на тренировочный и валидационный 

In [12]:
# Every column except the last is a feature; the last column holds the class label.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Синтезируем дополнительные признаки с помощью функции PolynomialFeatures() и нормализуем данные 

In [13]:
# Degree-3 polynomial expansion followed by min-max scaling to [-1, 1].
polier = PolynomialFeatures(3)
scaler = MinMaxScaler(feature_range=(-1, 1))

# Both transformers are fitted on the training split only and then applied to validation.
X_train = scaler.fit_transform(polier.fit_transform(X_train))
X_val = scaler.transform(polier.transform(X_val))

# Plain numpy arrays for the tensor conversion that follows.
X_train, X_val = np.array(X_train), np.array(X_val)
y_train, y_val = np.array(y_train), np.array(y_val)

Сохраняем экземпляры функций синтезатора и нормализатора (не знаю как их по-русски назвать :)), так как они понадобятся для тестового набора.

In [8]:
# Persist the fitted transformers so the test set can be processed identically.
POLIER_PATH = '/notebooks/Scalers/polier.gz'
SCALER_PATH = '/notebooks/Scalers/scaler.gz'

joblib.dump(polier, POLIER_PATH)
joblib.dump(scaler, SCALER_PATH)

['/notebooks/Scalers/scaler_120.gz']

Создаем класс преобразования датасетов в тензоры.

In [14]:
class ClassifierDataset(Dataset):
    """Pairs a container of feature rows with a container of labels.

    Both containers are stored as given and indexed with the same index;
    the dataset length is the length of the feature container.
    """

    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

# Features become float32 tensors, labels int64 tensors.
train_features = torch.from_numpy(X_train).float()
train_labels = torch.from_numpy(y_train).long()
val_features = torch.from_numpy(X_val).float()
val_labels = torch.from_numpy(y_val).long()

train_dataset = ClassifierDataset(train_features, train_labels)
val_dataset = ClassifierDataset(val_features, val_labels)

Создаем переменные для гиперпараметров нейросети.

In [15]:
# Number of passes over the training loader.
EPOCHS = 15
# Batch size used by the train and validation DataLoaders.
BATCH_SIZE = 2048
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 0.001
# Model input width: PolynomialFeatures(3) applied to the 4 log features
# yields 35 columns (bias term included).
NUM_FEATURES = 35
# Number of output logits; labels are 0..5 after class 64 is relabelled as 2.
NUM_CLASSES = 6

In [16]:
# Training batches are reshuffled every epoch so the optimizer does not see the
# same batch composition and order each time; validation order is irrelevant
# for the averaged loss, so it stays sequential.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

Создаем модель нашей сети. Возможно модель покажется избыточной, но я не пробовал уменьшать емкость. 

In [17]:
class LidarModel(pl.LightningModule):
    """Dense-to-1D-CNN classifier for flat feature vectors.

    A weight-normalised linear layer expands each input vector to
    ``cha_input * sign_size`` values, which are reshaped into a signal of shape
    ``(cha_input, sign_size)``. The signal goes through four 1D convolutions
    (the output of the second one is added back after the fourth as a skip
    connection), is pooled, flattened and mapped to ``output_dim`` logits by a
    second weight-normalised linear layer.

    Args:
        input_dim: number of features per sample.
        output_dim: number of logits produced per sample.
        sign_size: length of the signal created by the first dense layer.
            The final dense layer is sized for ``sign_size // 4`` positions, so
            ``sign_size`` is presumably expected to be a multiple of 4.
        cha_input: number of channels of that signal.
        cha_hidden: number of channels in the hidden convolutions.
        K: channel multiplier of the first (grouped) convolution.
        dropout_input: dropout probability before the first dense layer.
        dropout_hidden: dropout probability before the 2nd and 3rd convolutions.
        dropout_output: dropout probability before the final dense layer.
    """

    def __init__(self, input_dim, output_dim, sign_size=32, cha_input=16, cha_hidden=32,
                 K=2, dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2):
        super().__init__()

        # Configuration and derived sizes, kept on the instance.
        self.hidden_size = sign_size * cha_input
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.sign_size1 = sign_size
        self.sign_size2 = sign_size // 2
        self.output_size = (sign_size // 4) * cha_hidden
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output

        # Input block: batch norm -> dropout -> weight-normalised dense expansion.
        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        self.dense1 = nn.utils.weight_norm(
            nn.Linear(input_dim, self.hidden_size, bias=False)
        )

        # Conv 1: grouped (one group per input channel), multiplies channels by K.
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        self.conv1 = nn.utils.weight_norm(
            nn.Conv1d(
                in_channels=cha_input,
                out_channels=cha_input * K,
                kernel_size=5,
                stride=1,
                padding=2,
                groups=cha_input,
                bias=False,
            ),
            dim=None,
        )

        # Halves the signal length.
        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=self.sign_size2)

        # Conv 2: brings the channel count to cha_hidden.
        self.batch_norm_c2 = nn.BatchNorm1d(cha_input * K)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        self.conv2 = nn.utils.weight_norm(
            nn.Conv1d(
                in_channels=cha_input * K,
                out_channels=cha_hidden,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=False,
            ),
            dim=None,
        )

        # Conv 3: channel-preserving.
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_hidden)
        self.conv3 = nn.utils.weight_norm(
            nn.Conv1d(
                in_channels=cha_hidden,
                out_channels=cha_hidden,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=False,
            ),
            dim=None,
        )

        # Conv 4: depthwise (one group per channel).
        self.batch_norm_c4 = nn.BatchNorm1d(cha_hidden)
        self.conv4 = nn.utils.weight_norm(
            nn.Conv1d(
                in_channels=cha_hidden,
                out_channels=cha_hidden,
                kernel_size=5,
                stride=1,
                padding=2,
                groups=cha_hidden,
                bias=False,
            ),
            dim=None,
        )

        self.avg_po_c4 = nn.AvgPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # Output block: batch norm -> dropout -> weight-normalised dense to logits.
        self.batch_norm2 = nn.BatchNorm1d(self.output_size)
        self.dropout2 = nn.Dropout(dropout_output)
        self.dense2 = nn.utils.weight_norm(
            nn.Linear(self.output_size, output_dim, bias=True)
        )

    def forward(self, x):
        """Map a batch of feature vectors to a batch of raw class logits."""
        act = nn.functional

        # Dense expansion, then fold the flat vector into (batch, channels, length).
        x = act.celu(self.dense1(self.dropout1(self.batch_norm1(x))))
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1)

        x = act.relu(self.conv1(self.batch_norm_c1(x)))
        x = self.ave_po_c1(x)

        x = act.relu(self.conv2(self.dropout_c2(self.batch_norm_c2(x))))
        skip = x  # re-added after conv4

        x = act.relu(self.conv3(self.dropout_c3(self.batch_norm_c3(x))))

        x = self.conv4(self.batch_norm_c4(x))
        x = act.relu(x + skip)

        x = self.flt(self.avg_po_c4(x))

        return self.dense2(self.dropout2(self.batch_norm2(x)))

Проверка cuda или cpu

In [18]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


Создаем экземпляр модели, функцию потерь и оптимизатор.

In [19]:
# Architecture settings for the network.
model_config = dict(
    input_dim=NUM_FEATURES,
    output_dim=NUM_CLASSES,
    sign_size=16,
    cha_input=64,
    cha_hidden=64,
    K=2,
    dropout_input=0.3,
    dropout_hidden=0.3,
    dropout_output=0.4,
)
model = LidarModel(**model_config)
model.to(device)

# Class-weighted cross-entropy and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss(weight=total_weight).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

LidarModel(
  (batch_norm1): BatchNorm1d(35, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dense1): Linear(in_features=35, out_features=1024, bias=False)
  (batch_norm_c1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,), groups=64, bias=False)
  (ave_po_c1): AdaptiveAvgPool1d(output_size=8)
  (batch_norm_c2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_c2): Dropout(p=0.3, inplace=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
  (batch_norm_c3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_c3): Dropout(p=0.3, inplace=False)
  (conv3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
  (batch_norm_c4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_ru

Создаем функцию метрики

In [20]:
def multi_recall(y_pred, y_test):
    """Return the macro-averaged recall over 6 classes as a rounded percentage.

    Args:
        y_pred: raw model outputs; the class dimension is dim 1.
        y_test: target class indices.

    Returns:
        A tensor holding the recall multiplied by 100 and rounded.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    # Index of the highest-scoring class for every sample.
    predicted_classes = torch.max(log_probs, dim=1)[1]

    metric = Recall(average='macro', num_classes=6).to(device)
    score = metric(predicted_classes, y_test)

    return torch.round(score * 100)

In [21]:
# Per-epoch history of recall and loss, one list per data split.
recall_stats = {split: [] for split in ('train', 'val')}
loss_stats = {split: [] for split in ('train', 'val')}

Задаем путь сохранения модели, на случай сбоев или потери обучения.

In [22]:
# File the training loop writes the model state_dict to.
# NOTE(review): this is a different location from MODEL_LOAD_PATH
# ('/notebooks/Model/model.pth') defined further down — confirm that the file
# saved here is the one meant to be loaded there.
MODEL_SAVE_PATH = 'C:/Users/vovad/Desktop/All_projects/Конкурс_Новосиб/model.pth'

Тренируем модель.

In [None]:
print("Begin training.")

# Lowest mean validation loss seen so far; a checkpoint is written only when it improves.
best_val_loss = float("inf")

for epoch in range(1, EPOCHS+1):
    # The validation pass at the end of every epoch switches the model to eval mode,
    # so training mode (active dropout, batch-norm on batch statistics) has to be
    # restored at the start of each epoch, not just once before the loop.
    model.train()
    train_epoch_loss = 0
    train_epoch_rec = 0

    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch}")
        for X_train_batch, y_train_batch in tepoch:
            X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
            optimizer.zero_grad()

            y_train_pred = model(X_train_batch)

            train_loss = criterion(y_train_pred, y_train_batch)
            # Recall tracking is disabled, so the recall statistics below stay at 0.
            # train_rec = multi_recall(y_train_pred, y_train_batch)

            train_loss.backward()
            optimizer.step()

            train_epoch_loss += train_loss.item()
            # train_epoch_rec += train_rec.item()

    # Validation pass: no gradients, dropout off, batch-norm on running statistics.
    val_epoch_loss = 0
    val_epoch_rec = 0
    model.eval()
    with torch.inference_mode():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch)
            # val_rec = multi_recall(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            # val_epoch_rec += val_rec.item()

    # Per-batch means for this epoch.
    mean_train_loss = train_epoch_loss/len(train_loader)
    mean_val_loss = val_epoch_loss/len(val_loader)
    mean_train_rec = train_epoch_rec/len(train_loader)
    mean_val_rec = val_epoch_rec/len(val_loader)

    # Checkpoint only when the mean validation loss improves on the best epoch so far
    # (comparing the last batch loss with the epoch sum would save almost every epoch).
    if mean_val_loss < best_val_loss:
        best_val_loss = mean_val_loss
        print(f"Saving model to: {MODEL_SAVE_PATH}")
        torch.save(obj=model.state_dict(), f=MODEL_SAVE_PATH)

    loss_stats['train'].append(mean_train_loss)
    loss_stats['val'].append(mean_val_loss)
    recall_stats['train'].append(mean_train_rec)
    recall_stats['val'].append(mean_val_rec)

    print(f'Epoch {epoch+0:03}: | Train Loss: {mean_train_loss:.5f} | Val Loss: {mean_val_loss:.5f} | Train Recall:{mean_train_rec:.3f}| Val Recall:{mean_val_rec:.3f}')


In [19]:
# File the saved model state_dict is read from when restoring a trained model.
MODEL_LOAD_PATH = '/notebooks/Model/model.pth'

Если мы загружаем модель для получения предсказаний на новом датасете, 
необходимо установить гиперпараметры и создать модель нейросети путем повторного запуска соответствующих ячеек выше.

В случае сбоя или других причин создаем новый экземпляр модели и загружаем в него сохраненную модель.

In [20]:
# Fresh model instance with the same architecture settings as the trained one;
# the saved state_dict can only be loaded into an identically shaped network.
model_loaded = LidarModel(
    input_dim=NUM_FEATURES, 
    output_dim=NUM_CLASSES, 
    sign_size=16, 
    cha_input=64, 
    cha_hidden=64, 
    K=2, 
    dropout_input=0.3, 
    dropout_hidden=0.3,
    dropout_output=0.4
)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be restored on a CPU-only one.
model_loaded.load_state_dict(torch.load(f=MODEL_LOAD_PATH, map_location=device))
model_loaded.to(device)

LidarModel(
  (batch_norm1): BatchNorm1d(35, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dense1): Linear(in_features=35, out_features=1024, bias=False)
  (batch_norm_c1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,), groups=64, bias=False)
  (ave_po_c1): AdaptiveAvgPool1d(output_size=8)
  (batch_norm_c2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_c2): Dropout(p=0.3, inplace=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
  (batch_norm_c3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_c3): Dropout(p=0.3, inplace=False)
  (conv3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
  (batch_norm_c4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_ru

Начинаем оценку модели на тестовых данных. Загрузим тестовый датасет и преобразуем его как мы это делали с тренировочным датасетом.

In [24]:
# Load the test table and apply the same log10 transform as for the training data.
df_test_raw = pd.read_csv("/notebooks/Dataset/test_dataset_test.csv")

df_test_raw['Easting_log'] = np.log10(df_test_raw['Easting'])
df_test_raw['Northing_log'] = np.log10(df_test_raw['Northing'])
df_test_raw['Height_log'] = np.log10(df_test_raw['Height'])
# Same +45 shift of Reflectance as in the training transform.
df_test_raw['Reflectance_log'] = np.log10(df_test_raw['Reflectance'] + 45)

# Only the log features are fed to the model; the id and raw columns are dropped.
df_test_raw = df_test_raw.drop(columns=['id', 'Easting', 'Northing', 'Height', 'Reflectance'])

Подгрузим синтезатор и нормализатор, если это необходимо.

In [22]:
# Restore the transformers that were fitted on the training split.
POLIER_PATH = '/notebooks/Scalers/polier.gz'
SCALER_PATH = '/notebooks/Scalers/scaler.gz'

polier = joblib.load(POLIER_PATH)
scaler = joblib.load(SCALER_PATH)

In [25]:
# Polynomial expansion followed by scaling, using the already-fitted transformers.
df_test = scaler.transform(polier.transform(df_test_raw))

class ClassifierDatasetTest(Dataset):
    """Label-free dataset: wraps a container of feature rows only."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row

# Test features as a float32 tensor, served in sequential batches of 64.
test_features = torch.from_numpy(df_test).float()
test_dataset = ClassifierDatasetTest(test_features)
test_loader = DataLoader(dataset=test_dataset, batch_size=64)

Получаем предсказания с помощью обученной модели. Если используется ранее сохранённая модель, замените в ячейке ниже model на model_loaded.

In [26]:
# Collect one numpy array of predicted class indices per test batch.
y_pred_list = []
model.eval()
with torch.inference_mode():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        # Position of the largest logit along the class dimension.
        y_pred_tags = torch.max(y_test_pred, dim=1)[1]
        batch_predictions = y_pred_tags.squeeze().cpu().numpy()
        y_pred_list.append(batch_predictions)

Преобразуем предсказания в необходимый формат для загрузки на сайт. (Не забываем вернуть класс 2 обратно в класс 64)

In [27]:
import itertools

# Every entry of y_pred_list holds the predictions of one batch. A batch that
# contains a single sample is a 0-d array once squeezed, and .tolist() on it gives
# a bare int that itertools.chain cannot iterate; np.atleast_1d restores the batch
# axis first so that every entry becomes a list.
y_pred_list1 = [np.atleast_1d(a).tolist() for a in y_pred_list]
y_pred_list2 = list(itertools.chain.from_iterable(y_pred_list1))

# Re-read only the id column and attach the predictions in file order.
cols = ['id']
df_test1 = pd.read_csv("/notebooks/Dataset/test_dataset_test.csv", usecols=cols)
df_test1['Class'] = y_pred_list2

# Undo the training-time relabelling: class 2 goes back to class 64.
df_test1.loc[df_test1["Class"] == 2, "Class"] = 64

Сохраняем файл в формате csv

In [28]:
# Write the id/Class table without the index column.
SUBMISSION_CSV_PATH = '/notebooks/Dataset/test_check_final.csv'
df_test1.to_csv(SUBMISSION_CSV_PATH, index=False)