# 1. Импорт модулей и библиотек

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# 2. Сплит данных и нормализация

In [10]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X = train_data.drop(columns=['smoking']).values
y = train_data['smoking'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_data.values)

# 3. Определение класса SmokerDataset и создание даталодеров

In [11]:
class SmokerDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return (
                torch.tensor(self.features[idx], dtype=torch.float32),
                torch.tensor(self.labels[idx], dtype=torch.float32)
            )
        else:
            return torch.tensor(self.features[idx], dtype=torch.float32)

train_dataset = SmokerDataset(X_train, y_train)
val_dataset = SmokerDataset(X_val, y_val)
test_dataset = SmokerDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 4. Класс классификатора

In [12]:
class SmokerClassifier(nn.Module):
    def __init__(self, input_size):
        super(SmokerClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x)) 
        return x

model = SmokerClassifier(input_size=X_train.shape[1])

# 5. Выбор функции лосса и оптимизатора

In [13]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 6. Обучение модели

In [14]:
num_epochs = 10
best_auc = 0 

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.squeeze(), target)
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    all_targets = []
    all_probabilities = []
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            val_loss += criterion(output.squeeze(), target).item()
            all_targets.extend(target.numpy())
            all_probabilities.extend(output.numpy().flatten())

    val_loss /= len(val_loader.dataset)
    val_auc = roc_auc_score(all_targets, all_probabilities)
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Validation ROC AUC: {val_auc:.4f}')

    if val_auc > best_auc:
        best_auc = val_auc
        torch.save(model.state_dict(), 'best_model.pth')

Epoch 1, Validation Loss: 0.0127, Validation ROC AUC: 0.8811
Epoch 2, Validation Loss: 0.0125, Validation ROC AUC: 0.8842
Epoch 3, Validation Loss: 0.0127, Validation ROC AUC: 0.8833
Epoch 4, Validation Loss: 0.0127, Validation ROC AUC: 0.8846
Epoch 5, Validation Loss: 0.0125, Validation ROC AUC: 0.8861
Epoch 6, Validation Loss: 0.0126, Validation ROC AUC: 0.8859
Epoch 7, Validation Loss: 0.0125, Validation ROC AUC: 0.8867
Epoch 8, Validation Loss: 0.0126, Validation ROC AUC: 0.8837
Epoch 9, Validation Loss: 0.0126, Validation ROC AUC: 0.8852
Epoch 10, Validation Loss: 0.0125, Validation ROC AUC: 0.8853


In [15]:
# Используем лучшую получившуюся модель
model.load_state_dict(torch.load('best_model.pth'))

  model.load_state_dict(torch.load('best_model.pth'))


<All keys matched successfully>

# 7. Выгрузка ответов

In [19]:
model.eval()
probabilities = []
# Проверяем на тестовой части и записываем в выходной файлик
with torch.no_grad():
    for data in test_loader:
        output = model(data)
        probabilities.extend(output.numpy().flatten())

submission = pd.DataFrame({'id': test_data['id'], 'smoking': probabilities})
submission.to_csv('submission.csv', index=False)