### Lab 3 Task 1

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

df = pd.read_csv('../data/cyber_salaries.csv')

target_col = 'experience_level'

cols_to_drop = ['salary', 'salary_currency']
df = df.drop(columns=cols_to_drop)

label_encoders = {}
categorical_cols = ['employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size', 'experience_level']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=[target_col]).values
y = df[target_col].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

class SalariesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = SalariesDataset(X_train, y_train)
test_dataset = SalariesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

class SalaryClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SalaryClassifier, self).__init__()
        
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(64, 32),
            nn.ReLU(),
            
            nn.Linear(32, num_classes)
        )
        
    def forward(self, x):
        return self.network(x)

# Ініціалізація моделі
input_dim = X.shape[1]
num_classes = len(np.unique(y))
model = SalaryClassifier(input_dim, num_classes)

device = torch.device("cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 100
print(f"Навчання мережі")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

predicted_labels = label_encoders[target_col].inverse_transform(all_preds)
actual_labels = label_encoders[target_col].inverse_transform(all_labels)

print(classification_report(actual_labels, predicted_labels))

Навчання мережі
Epoch [10/100], Loss: 0.8918
Epoch [20/100], Loss: 0.8434
Epoch [30/100], Loss: 0.8111
Epoch [40/100], Loss: 0.7848
Epoch [50/100], Loss: 0.7481
Epoch [60/100], Loss: 0.7474
Epoch [70/100], Loss: 0.7092
Epoch [80/100], Loss: 0.7157
Epoch [90/100], Loss: 0.7094
Epoch [100/100], Loss: 0.6860
              precision    recall  f1-score   support

          EN       0.48      0.53      0.50        55
          EX       0.64      0.26      0.37        27
          MI       0.56      0.43      0.49       140
          SE       0.63      0.78      0.69       183

    accuracy                           0.59       405
   macro avg       0.58      0.50      0.51       405
weighted avg       0.58      0.59      0.57       405



Повнозв'язна нейронна мережа показала трохи гірші результати в порівнянні з алгоритмом SVM з Лабораторної роботи №1, а саме 59% проти 61% відповідно. Також варто зазначити, що f1-score для менш представлених класів, зокрема EX та EN у SVM був суттєво вищим - 0.51 та 0.63 проти 0.37 та 0.50 у повнозвязної нейромережі.