In [2]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from neuron import MLP
import torch
from sklearn.preprocessing import StandardScaler

# Root folder holding the CSV inputs; being a relative path, it is resolved
# against the working directory the notebook kernel was started in.
DATA_DIR = Path("../../data")

In [3]:
# Fraction of the shuffled rows held out for validation.
VAL_SPLIT = 0.2

# Read each class file and tag its rows with the binary target.
trn_data_pos = pd.read_csv(DATA_DIR/'michal'/'pos_trn.csv').assign(is_positive=1)
trn_data_neg = pd.read_csv(DATA_DIR/'michal'/'neg_trn.csv').assign(is_positive=0)

# Stack both classes and replace missing values with the -1 sentinel.
trn_data = pd.concat([trn_data_pos, trn_data_neg], axis=0).fillna(-1)

# Deterministic shuffle, then carve the leading VAL_SPLIT share off as validation;
# everything after the cut is the training split.
shuffled_data = trn_data.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(VAL_SPLIT * len(shuffled_data))
val_data, trn_data = shuffled_data.iloc[:split_idx], shuffled_data.iloc[split_idx:]



In [4]:
# Separate the feature columns from the 'is_positive' label for each split.
X, y = trn_data.drop(columns=['is_positive']), trn_data['is_positive']
X_val, y_val = val_data.drop(columns=['is_positive']), val_data['is_positive']


In [5]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the validation features.
scaler = StandardScaler()
X = torch.tensor(scaler.fit_transform(X), dtype=torch.float32)
X_val = torch.tensor(scaler.transform(X_val), dtype=torch.float32)

# Labels become float32 column vectors of shape (n, 1).
y = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)
y_val = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)


In [6]:
# Wrap the tensors so they can be iterated in mini-batches.
train_dataset = TensorDataset(X, y)
val_dataset = TensorDataset(X_val, y_val)
batch_size = 32

# Only the training loader reshuffles every epoch. Validation metrics are sums
# over all batches, so shuffling there adds nothing and would only consume
# random-number state; keep its order fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [7]:
def train_mlp(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` for ``epochs`` epochs and report losses after each one.

    Every epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. The reported loss for each loader
    is the mean of the per-batch ``criterion`` values (sum of batch losses
    divided by the number of batches).

    Args:
        model: Module called as ``model(inputs)``; switched between
            ``train()`` and ``eval()`` modes by this function.
        train_loader: Sized iterable yielding ``(inputs, targets)`` batches
            used for optimisation.
        val_loader: Sized iterable yielding ``(inputs, targets)`` batches used
            for evaluation only.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        dict with keys ``'train_loss'`` and ``'val_loss'``, each a list holding
        one float per epoch. A loader that yields no batches contributes
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase: no gradient tracking, no parameter updates.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        # Average over batches; an empty loader has no defined mean.
        n_train_batches = len(train_loader)
        n_val_batches = len(val_loader)
        avg_train = train_loss / n_train_batches if n_train_batches else float('nan')
        avg_val = val_loss / n_val_batches if n_val_batches else float('nan')

        history['train_loss'].append(avg_train)
        history['val_loss'].append(avg_val)

        print(f'Epoch {epoch+1}, Train Loss: {avg_train:.4f}, '
              f'Val Loss: {avg_val:.4f}')

    return history

In [8]:
# Seed torch's global generator so weight initialisation and the training
# loader's shuffling repeat when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

# NOTE(review): MLP comes from the local `neuron` module; it is presumably
# returning raw logits, since the loss below applies the sigmoid itself -- confirm.
model = MLP(input_size=X.shape[1], hidden_sizes=[64,64], output_size=1)

# Weight positives by the negative/positive ratio of the training labels.
num_pos = y.sum().item()
num_neg = y.size(0) - num_pos
if num_pos == 0:
    raise ValueError("Training labels contain no positive samples; cannot compute pos_weight.")
pos_weight = num_neg / num_pos  # Higher values favor recall over precision

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))
# Unweighted alternative:
# criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Training the model
print("Starting training...")
train_mlp(model, train_loader, val_loader, criterion, optimizer, epochs=20)

# Evaluate model on validation set
model.eval()
correct = 0
total = 0
FP = 0
FN = 0
TP = 0
TN = 0
with torch.no_grad():
    for inputs, targets in val_loader:
        outputs = model(inputs)
        # Threshold the sigmoid probability at 0.5 to get hard 0/1 predictions.
        predicted = (torch.sigmoid(outputs) > 0.5).float()
        TP += ((predicted == 1) & (targets == 1)).sum().item()
        TN += ((predicted == 0) & (targets == 0)).sum().item()
        FP += ((predicted == 1) & (targets == 0)).sum().item()
        FN += ((predicted == 0) & (targets == 1)).sum().item()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

# Each ratio is undefined when its denominator is zero (e.g. the model never
# predicts the positive class); report NaN rather than raising ZeroDivisionError.
accuracy = 100 * correct / total if total else float('nan')
precision = TP / (TP + FP) if (TP + FP) else float('nan')
recall = TP / (TP + FN) if (TP + FN) else float('nan')

print(f'Validation Accuracy: {accuracy:.2f}%')
print('Confusion matrix:')
print(f'TP: {TP}, FP: {FP}')
print(f'FN: {FN}, TN: {TN}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

Starting training...
Epoch 1, Train Loss: 1.2435, Val Loss: 1.2228
Epoch 2, Train Loss: 1.2012, Val Loss: 1.1993
Epoch 3, Train Loss: 1.1834, Val Loss: 1.1843
Epoch 4, Train Loss: 1.1735, Val Loss: 1.1760
Epoch 5, Train Loss: 1.1708, Val Loss: 1.1819
Epoch 6, Train Loss: 1.1700, Val Loss: 1.1746
Epoch 7, Train Loss: 1.1703, Val Loss: 1.1734
Epoch 8, Train Loss: 1.1688, Val Loss: 1.1672
Epoch 9, Train Loss: 1.1625, Val Loss: 1.1656
Epoch 10, Train Loss: 1.1631, Val Loss: 1.1635
Epoch 11, Train Loss: 1.1603, Val Loss: 1.1624
Epoch 12, Train Loss: 1.1592, Val Loss: 1.1600
Epoch 13, Train Loss: 1.1586, Val Loss: 1.1562
Epoch 14, Train Loss: 1.1554, Val Loss: 1.1635
Epoch 15, Train Loss: 1.1541, Val Loss: 1.1581
Epoch 16, Train Loss: 1.1522, Val Loss: 1.1517
Epoch 17, Train Loss: 1.1538, Val Loss: 1.1548
Epoch 18, Train Loss: 1.1472, Val Loss: 1.1529
Epoch 19, Train Loss: 1.1505, Val Loss: 1.1538
Epoch 20, Train Loss: 1.1467, Val Loss: 1.1497
Validation Accuracy: 50.60%
Confusion matrix:
TP

In [9]:
# Load the held-out test files, tag each class with its binary label, stack
# them, and replace missing values with the -1 sentinel.
pos_tst = pd.read_csv(DATA_DIR/'michal'/'pos_tst.csv')
pos_tst['is_positive'] = 1
neg_tst = pd.read_csv(DATA_DIR/'michal'/'neg_tst.csv')
neg_tst['is_positive'] = 0
tst_data = pd.concat([pos_tst, neg_tst], axis=0).fillna(-1)
tst_data = tst_data.sample(frac=1, random_state=42).reset_index(drop=True)

X_tst = tst_data.drop(columns=['is_positive'])
y_tst = tst_data['is_positive']

# Apply the already-fitted scaler; transform() does not refit on the test data.
X_tst = scaler.transform(X_tst)
X_tst = torch.tensor(X_tst, dtype=torch.float32)
y_tst = torch.tensor(y_tst.values, dtype=torch.float32).reshape(-1, 1)
tst_dataset = TensorDataset(X_tst, y_tst)
# Accuracy is a sum over all batches, so batch order is irrelevant; keep the
# evaluation loader unshuffled so it does not consume random-number state.
tst_loader = DataLoader(tst_dataset, batch_size=batch_size, shuffle=False)

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in tst_loader:
        outputs = model(inputs)
        # Threshold the sigmoid probability at 0.5 to get hard 0/1 predictions.
        predicted = (torch.sigmoid(outputs) > 0.5).float()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

# An empty test set has no defined accuracy; report NaN instead of dividing by zero.
test_accuracy = 100 * correct / total if total else float('nan')
print(f'Test Accuracy: {test_accuracy:.2f}%')



Test Accuracy: 48.65%
