In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from L_score import L_score
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

### Wersja tworzenia datasetu dla danych po feature engineeringu

In [None]:
class HackatonDataset(Dataset):
    def __init__(self, X, y_df):
        """
        Konstruktor przyjmuje macierz cech (X) oraz DataFrame y_df,
        w którym kolumna 'label' to etykieta, a 'clicked' to informacja aposteriori.
        """
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y_df["label"].values, dtype=torch.long)
        self.clicked = torch.tensor(y_df["clicked"].values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.clicked[idx]

### Wersja tworzenia datasetu dla danych bez feature engineeringu

In [None]:
class HackatonDataset2(Dataset):
    def __init__(self, X, y_df):
        """
        Konstruktor przyjmuje macierz cech (X) oraz DataFrame y_df,
        w którym kolumna 'label' to etykieta, a 'clicked' to informacja aposteriori.
        """
        X_numeric = X.apply(pd.to_numeric, errors='coerce')
        self.X = torch.tensor(X_numeric.values, dtype=torch.float32)
        self.y = torch.tensor(y_df["label"].values, dtype=torch.long)
        self.clicked = torch.tensor(y_df["clicked"].values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.clicked[idx]

In [None]:
X_train_scaled = pd.read_csv("data/feature_engineering/processed_x_train.csv")
X_val_scaled = pd.read_csv("data/feature_engineering/processed_x_valid.csv")
y_train = pd.read_csv("data/y_train.csv")
y_val = pd.read_csv("data/y_valid.csv")

train_dataset = HackatonDataset(X_train_scaled, y_train)
val_dataset   = HackatonDataset(X_val_scaled, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)


class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, num_classes)
        
    def forward(self, x):
        logits = self.linear(x)
        return logits

input_dim = X_train_scaled.shape[1]
num_classes = 10
model = LogisticRegressionModel(input_dim, num_classes)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

def custom_loss_fn(logits, target, clicked, lambda_val):
    """
    Oblicza standardową funkcję cross-entropy (per próbka) i modyfikuje ją według reguł:
    
    - Jeżeli clicked == 0:
        * Jeżeli predykcja poprawna: loss += lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
    - Jeżeli clicked == 1:
        * Jeżeli predykcja poprawna: loss -= lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
        
    params:
      logits    : logity wyjściowe z modelu (tensor)
      target    : prawdziwe etykiety (tensor, dtype long)
      clicked   : informacje o kliknięciu (tensor, dtype float, wartości 0 lub 1)
      lambda_val: parametr lambda (float)
      
    return:
      średnia zmodyfikowana strata (loss)
    """
    ce_loss = F.cross_entropy(logits, target, reduction="none")
    
    preds = torch.argmax(logits, dim=1)
    correct = preds.eq(target)
 
    adjustment = torch.zeros_like(ce_loss)
    
    # Dla próbek, gdzie clicked == 0:
    mask0 = (clicked == 0)
    # Jeśli predykcja poprawna -> +lambda, jeśli błędna -> +(lambda/10)
    adjustment[mask0] = torch.where(correct[mask0], lambda_val, lambda_val/10)
    
    # Dla próbek, gdzie clicked == 1:
    mask1 = (clicked == 1)
    # Jeśli predykcja poprawna -> -lambda, jeśli błędna -> +(lambda/10)
    adjustment[mask1] = torch.where(correct[mask1], -lambda_val, lambda_val/10)
    
    total_loss = ce_loss + adjustment
    return total_loss.mean()


lambda_val = 0.1
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for batch in train_loader:
        X_batch, y_batch, clicked_batch = batch
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")


model.eval()
val_losses = []
all_preds = []
all_targets = []
all_clicked = []

with torch.no_grad():
    for batch in val_loader:
        X_batch, y_batch, clicked_batch = batch
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())
        
avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")

val_df = y_val.copy()
val_df['y_pred'] = all_preds

l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

Epoch  1/20, Training Loss: 2.3315
Epoch  2/20, Training Loss: 2.3206
Epoch  3/20, Training Loss: 2.3197
Epoch  4/20, Training Loss: 2.3193
Epoch  5/20, Training Loss: 2.3193
Epoch  6/20, Training Loss: 2.3191
Epoch  7/20, Training Loss: 2.3190
Epoch  8/20, Training Loss: 2.3187
Epoch  9/20, Training Loss: 2.3187
Epoch 10/20, Training Loss: 2.3187
Epoch 11/20, Training Loss: 2.3185
Epoch 12/20, Training Loss: 2.3187
Epoch 13/20, Training Loss: 2.3187
Epoch 14/20, Training Loss: 2.3186
Epoch 15/20, Training Loss: 2.3187
Epoch 16/20, Training Loss: 2.3186
Epoch 17/20, Training Loss: 2.3188
Epoch 18/20, Training Loss: 2.3184
Epoch 19/20, Training Loss: 2.3186
Epoch 20/20, Training Loss: 2.3186

Validation Loss: 2.3058
L_score na zbiorze walidacyjnym: 0.1055


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [None]:
X_train_scaled = pd.read_csv("data/feature_engineering/processed_x_train_2.csv")
X_val_scaled = pd.read_csv("data/feature_engineering/processed_x_valid_2.csv")
y_train = pd.read_csv("data/y_train.csv")
y_val = pd.read_csv("data/y_valid.csv")

train_dataset = HackatonDataset(X_train_scaled, y_train)
val_dataset   = HackatonDataset(X_val_scaled, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

# ---  Definicja nowego modelu sieci neuronowej ---

class BetterMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(BetterMLP, self).__init__()
        # Pierwsza warstwa: 64 neurony + BatchNorm + Dropout
        self.fc1 = nn.Linear(input_dim, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.5)
        
        # Druga warstwa: 32 neurony + BatchNorm + Dropout
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.5)
        
        # Warstwa wyjściowa
        self.fc3 = nn.Linear(32, num_classes)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        logits = self.fc3(x)
        return logits

input_dim = X_train_scaled.shape[1]
num_classes = 10 
model = BetterMLP(input_dim, num_classes)

optimizer = optim.Adamax(model.parameters(), lr=1e-3)

# ---  Definicja customowego lossu ---

def custom_loss_fn(logits, target, clicked, lambda_val):
    """
    Oblicza standardową funkcję cross-entropy (per próbka) i modyfikuje ją według reguł:
    
    - Jeżeli clicked == 0:
        * Jeżeli predykcja poprawna: loss += lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
    - Jeżeli clicked == 1:
        * Jeżeli predykcja poprawna: loss -= lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
        
    params:
      logits    : logity wyjściowe z modelu (tensor)
      target    : prawdziwe etykiety (tensor, dtype long)
      clicked   : informacje o kliknięciu (tensor, dtype float, wartości 0 lub 1)
      lambda_val: parametr lambda (float)
      
    return:
      średnia zmodyfikowana strata (loss)
    """
    ce_loss = F.cross_entropy(logits, target, reduction="none")
    preds = torch.argmax(logits, dim=1)
    correct = preds.eq(target)
    adjustment = torch.zeros_like(ce_loss)
    
    mask0 = (clicked == 0)
    adjustment[mask0] = torch.where(correct[mask0], lambda_val, lambda_val/10)
    
    mask1 = (clicked == 1)
    adjustment[mask1] = torch.where(correct[mask1], -lambda_val, lambda_val/10)
    
    total_loss = ce_loss + adjustment
    return total_loss.mean()

lambda_val = 2 
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for batch in train_loader:
        X_batch, y_batch, clicked_batch = batch
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

model.eval()
val_losses = []
all_preds = []
all_targets = []
all_clicked = []

with torch.no_grad():
    for batch in val_loader:
        X_batch, y_batch, clicked_batch = batch
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())
        
avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")

val_df = y_val.copy()
val_df['y_pred'] = all_preds

l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

Epoch  1/20, Training Loss: 2.6086
Epoch  2/20, Training Loss: 2.5723
Epoch  3/20, Training Loss: 2.5664
Epoch  4/20, Training Loss: 2.5639
Epoch  5/20, Training Loss: 2.5627
Epoch  6/20, Training Loss: 2.5637
Epoch  7/20, Training Loss: 2.5606
Epoch  8/20, Training Loss: 2.5612
Epoch  9/20, Training Loss: 2.5632
Epoch 10/20, Training Loss: 2.5625
Epoch 11/20, Training Loss: 2.5618
Epoch 12/20, Training Loss: 2.5642
Epoch 13/20, Training Loss: 2.5606
Epoch 14/20, Training Loss: 2.5607
Epoch 15/20, Training Loss: 2.5628
Epoch 16/20, Training Loss: 2.5615
Epoch 17/20, Training Loss: 2.5594
Epoch 18/20, Training Loss: 2.5589
Epoch 19/20, Training Loss: 2.5593
Epoch 20/20, Training Loss: 2.5634

Validation Loss: 2.2739
L_score na zbiorze walidacyjnym: 0.1034


In [None]:
X_train_scaled = pd.read_csv("data/feature_engineering/processed_x_train.csv")
X_val_scaled   = pd.read_csv("data/feature_engineering/processed_x_valid.csv")
y_train        = pd.read_csv("data/y_train.csv")
y_val          = pd.read_csv("data/y_valid.csv")

train_dataset = HackatonDataset(X_train_scaled, y_train)
val_dataset   = HackatonDataset(X_val_scaled, y_val)
train_loader  = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader    = DataLoader(val_dataset, batch_size=64, shuffle=False)

input_dim   = X_train_scaled.shape[1]
num_classes = 10


# 1. Prostszy model – tylko jedna warstwa ukryta
class SimplerMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SimplerMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.dropout = nn.Dropout(0.5)
        self.fc_out = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc_out(x)
        return logits

# 2. Domyślny model – BetterMLP
class BetterMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(BetterMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.5)
        
        self.fc3 = nn.Linear(32, num_classes)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        logits = self.fc3(x)
        return logits

# 3. Głębszy model – dodatkowa warstwa ukryta
class DeeperMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(DeeperMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.5)
        
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.5)
        
        self.fc_out = nn.Linear(32, num_classes)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        logits = self.fc_out(x)
        return logits

# ================================================
# Definicja custom lossu
# ================================================

def custom_loss_fn(logits, target, clicked, lambda_val):
    """
    Oblicza standardową funkcję cross-entropy (per próbka) i modyfikuje ją według reguł:
    
    - Dla clicked == 0:
        * Jeśli predykcja poprawna: loss += lambda_val
        * Jeśli predykcja błędna: loss += (lambda_val / 10)
    - Dla clicked == 1:
        * Jeśli predykcja poprawna: loss -= lambda_val
        * Jeśli predykcja błędna: loss += (lambda_val / 10)
    """
    ce_loss = F.cross_entropy(logits, target, reduction="none")
    preds = torch.argmax(logits, dim=1)
    correct = preds.eq(target)
    adjustment = torch.zeros_like(ce_loss)
    
    mask0 = (clicked == 0)
    adjustment[mask0] = torch.where(correct[mask0], lambda_val, lambda_val/10)
    
    mask1 = (clicked == 1)
    adjustment[mask1] = torch.where(correct[mask1], -lambda_val, lambda_val/10)
    
    total_loss = ce_loss + adjustment
    return total_loss.mean()

# ================================================
# Funkcja treningu i walidacji
# ================================================

def train_and_validate(model_class, lambda_val, num_epochs=20):
    """
    Funkcja inicjalizuje model danej architektury, trenuje przez num_epochs epok
    i zwraca średni validation loss oraz metrykę L_score.
    """
    model = model_class(input_dim, num_classes)
    optimizer = optim.Adamax(model.parameters(), lr=1e-3)
    
    for epoch in range(num_epochs):
        model.train()
        epoch_losses = []
        for batch in train_loader:
            X_batch, y_batch, clicked_batch = batch
            optimizer.zero_grad()
            logits = model(X_batch)
            loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

    model.eval()
    val_losses = []
    all_preds = []
    with torch.no_grad():
        for batch in val_loader:
            X_batch, y_batch, clicked_batch = batch
            logits = model(X_batch)
            loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
            val_losses.append(loss.item())
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
    
    avg_val_loss = np.mean(val_losses)
    val_df = y_val.copy()
    val_df['y_pred'] = all_preds
    l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
    
    return avg_val_loss, l_score_val

# ================================================
# Grid search: różne architektury i lambda
# ================================================

architectures = {
    "SimplerMLP": SimplerMLP,
    "BetterMLP": BetterMLP,
    "DeeperMLP": DeeperMLP
}

lambda_candidates = [0.1, 0.5, 1, 2, 5, 10]

results = {}

for arch_name, model_class in architectures.items():
    results[arch_name] = {}
    print(f"\nTestowanie architektury: {arch_name}")
    for lam in lambda_candidates:
        print(f"  Lambda = {lam} ...", end=" ")
        val_loss, l_score = train_and_validate(model_class, lam, num_epochs=20)
        results[arch_name][lam] = {"val_loss": val_loss, "L_score": l_score}
        print(f"Val Loss: {val_loss:.4f} | L_score: {l_score:.4f}")

# ================================================
# Podsumowanie wyników
# ================================================

print("\nPodsumowanie wyników:")
for arch_name, lam_results in results.items():
    print(f"\nArchitektura: {arch_name}")
    for lam, metrics in lam_results.items():
        print(f"  Lambda = {lam:<4} | Val Loss: {metrics['val_loss']:.4f} | L_score: {metrics['L_score']:.4f}")

best_arch = None
best_lambda = None
best_loss = float("inf")
for arch_name, lam_results in results.items():
    for lam, metrics in lam_results.items():
        if metrics['val_loss'] < best_loss:
            best_loss = metrics['val_loss']
            best_arch = arch_name
            best_lambda = lam

print(f"\nNajlepsza kombinacja: Architektura = {best_arch}, Lambda = {best_lambda}, Val Loss = {best_loss:.4f}")



Testowanie architektury: SimplerMLP
  Lambda = 0.1 ... Val Loss: 2.3037 | L_score: 0.1028
  Lambda = 0.5 ... Val Loss: 2.2999 | L_score: 0.1009
  Lambda = 1 ... Val Loss: 2.2955 | L_score: 0.0997
  Lambda = 2 ... Val Loss: 2.2847 | L_score: 0.1003
  Lambda = 5 ... Val Loss: 2.2711 | L_score: 0.0975
  Lambda = 10 ... Val Loss: 2.2122 | L_score: 0.0995

Testowanie architektury: BetterMLP
  Lambda = 0.1 ... Val Loss: 2.3019 | L_score: 0.0993
  Lambda = 0.5 ... Val Loss: 2.2984 | L_score: 0.0994
  Lambda = 1 ... Val Loss: 2.2912 | L_score: 0.1017
  Lambda = 2 ... Val Loss: 2.2752 | L_score: 0.1035
  Lambda = 5 ... Val Loss: 2.2537 | L_score: 0.1000
  Lambda = 10 ... Val Loss: 2.2116 | L_score: 0.0990

Testowanie architektury: DeeperMLP
  Lambda = 0.1 ... Val Loss: 2.3023 | L_score: 0.0966
  Lambda = 0.5 ... Val Loss: 2.2975 | L_score: 0.1004
  Lambda = 1 ... Val Loss: 2.2926 | L_score: 0.1001
  Lambda = 2 ... Val Loss: 2.2860 | L_score: 0.0983
  Lambda = 5 ... Val Loss: 2.2500 | L_score: 

### Model dla danych bez feature engineeringu

In [None]:
X_train_scaled = pd.read_csv("data/x_train.csv")
X_val_scaled = pd.read_csv("data/x_valid.csv")
y_train = pd.read_csv("data/y_train.csv")   
y_val = pd.read_csv("data/y_valid.csv")

# Zostawiam tylko kolumny numeryczne
X_train_filtered = X_train_scaled.select_dtypes(include=[np.number])
X_val_filtered = X_val_scaled.select_dtypes(include=[np.number])

# Filtrowanie danych treningowych - wybieram tylko te wiersze, gdzie clicked == 1
train_filter = y_train['clicked'] == 1
X_train_filtered = X_train_scaled.loc[train_filter].reset_index(drop=True)
y_train_filtered = y_train.loc[train_filter].reset_index(drop=True)

train_dataset = HackatonDataset2(X_train_filtered, y_train_filtered)
val_dataset   = HackatonDataset2(X_val_filtered, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

# --- Definicja modelu (logistyczna regresja) ---

class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, num_classes)
        
    def forward(self, x):
        logits = self.linear(x)
        return logits

input_dim = X_train_filtered.shape[1]
num_classes = 10  
model = LogisticRegressionModel(input_dim, num_classes)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --- Definicja customowego lossu ---

def custom_loss_fn(logits, target, clicked, lambda_val):
    """
    Oblicza standardową funkcję cross-entropy (per próbka) i modyfikuje ją według reguł:
    
    - Jeżeli clicked == 0:
        * Jeżeli predykcja poprawna: loss += lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
    - Jeżeli clicked == 1:
        * Jeżeli predykcja poprawna: loss -= lambda_val
        * Jeżeli predykcja błędna: loss += (lambda_val / 10)
    """
    ce_loss = F.cross_entropy(logits, target, reduction="none")
    preds = torch.argmax(logits, dim=1)
    correct = preds.eq(target)
    adjustment = torch.zeros_like(ce_loss)
    
    mask0 = (clicked == 0)
    adjustment[mask0] = torch.where(correct[mask0], lambda_val, lambda_val/10)
    
    mask1 = (clicked == 1)
    adjustment[mask1] = torch.where(correct[mask1], -lambda_val, lambda_val/10)
    
    total_loss = ce_loss + adjustment
    return total_loss.mean()

# ---  Pętla treningowa wykorzystująca custom loss ---

lambda_val = 0.1 
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for batch in train_loader:
        X_batch, y_batch, clicked_batch = batch
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

# --- Walidacja modelu ---

model.eval()
val_losses = []
all_preds = []
all_targets = []
all_clicked = []

with torch.no_grad():
    for batch in val_loader:
        X_batch, y_batch, clicked_batch = batch
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())
        
avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")

# ---  Ocena modelu za pomocą metryki L_score ---

val_df = y_val.copy()
val_df['y_pred'] = all_preds

l_score_val = L_score(val_df[['label', 'clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

## PCA

In [None]:
df = pd.read_csv("data/x_train.csv")
df.drop(columns=['id', 'booking_id', 'flight_coupon_id', 'flight_leg_id', 
                     'email', 'pnr', 'request_id', 'request_dttm'], inplace=True)
categorical_columns = df.select_dtypes(include=["object"]).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

# Standaryzacja danych 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# Wykonanie PCA, redukcja do 10 komponentów
pca = PCA(n_components=10)
pca_result_train = pca.fit_transform(X_scaled)

print("Wariancja wyjaśniona przez komponenty:", pca.explained_variance_ratio_)

# Wizualizacja wyników PCA
plt.figure(figsize=(8, 6))
plt.scatter(pca_result_train[:, 0], pca_result_train[:, 1], alpha=0.6)
plt.xlabel("Pierwsza główna składowa")
plt.ylabel("Druga główna składowa")
plt.title("Analiza PCA danych z processed_x_train_2.csv")
plt.grid(True)
plt.show()

In [None]:
df = pd.read_csv("data/feature_engineering/processed_x_valid_2.csv")
print("Rozmiar danych:", df.shape)

# Standaryzacja danych
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)


pca_result_valid = pca.transform(X_scaled)
print("Wariancja wyjaśniona przez komponenty:", pca.explained_variance_ratio_)

# Wizualizacja wyników PCA
plt.figure(figsize=(8, 6))
plt.scatter(pca_result_valid[:, 0], pca_result_valid[:, 1], alpha=0.6)
plt.xlabel("Pierwsza główna składowa")
plt.ylabel("Druga główna składowa")
plt.title("Analiza PCA danych z processed_x_train_2.csv")
plt.grid(True)
plt.show()

In [49]:
pca_result_train

array([[-0.70202858,  2.74424858],
       [ 1.1263193 ,  0.53026595],
       [-0.36580727, -4.39942132],
       ...,
       [ 4.40368759,  0.21563085],
       [ 1.6501326 ,  0.75065808],
       [ 0.00695971,  1.68506765]])

In [None]:
y_train = pd.read_csv("data/y_train.csv")
y_valid = pd.read_csv("data/y_valid.csv")

# ---- Definicja Datasetu dla danych PCA ----
class HackatonPCADataset(Dataset):
    def __init__(self, X, y_df):
        """
        X: NumPy array - wynik PCA, gdzie każda próbka to wiersz (np. kształt [n_samples, 2])
        y_df: DataFrame z kolumnami "label" (etykiety, int) oraz "clicked" (0 lub 1)
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y_df["label"].values, dtype=torch.long)
        self.clicked = torch.tensor(y_df["clicked"].values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.clicked[idx]

train_dataset = HackatonPCADataset(pca_result_train, y_train)
val_dataset   = HackatonPCADataset(pca_result_valid, y_valid)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# ---- Definicja modelu sieci neuronowej ----
class PCANet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(PCANet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits

input_dim =   10       
num_classes = 10       
model = PCANet(input_dim, num_classes)

optimizer = optim.Adamax(model.parameters(), lr=1e-3)

# ---- Definicja customowej funkcji loss ----
def custom_loss_fn(logits, target, clicked, lambda_val):
    """
    Oblicza standardową cross-entropy (per próbka) i modyfikuje ją:
      - Dla próbek, gdzie clicked == 0:
          * jeśli predykcja poprawna: loss += lambda_val
          * w przeciwnym wypadku: loss += (lambda_val / 10)
      - Dla próbek, gdzie clicked == 1:
          * jeśli predykcja poprawna: loss -= lambda_val
          * w przeciwnym wypadku: loss += (lambda_val / 10)
    """
    ce_loss = F.cross_entropy(logits, target, reduction="none")
    preds = torch.argmax(logits, dim=1)
    correct = preds.eq(target)
    adjustment = torch.zeros_like(ce_loss)
    
    mask0 = (clicked == 0)
    adjustment[mask0] = torch.where(correct[mask0], lambda_val, lambda_val / 10)
    
    mask1 = (clicked == 1)
    adjustment[mask1] = torch.where(correct[mask1], -lambda_val, lambda_val / 10)
    
    total_loss = ce_loss + adjustment
    return total_loss.mean()

# ---- Pętla treningowa ----
lambda_val = 2  
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for X_batch, y_batch, clicked_batch in train_loader:
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    print(f"Epoch {epoch+1:02d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

# ---- Walidacja modelu ----
model.eval()
val_losses = []
all_preds = []
with torch.no_grad():
    for X_batch, y_batch, clicked_batch in val_loader:
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        
avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")


val_df = y_valid.copy()
val_df['y_pred'] = all_preds
l_score_val = L_score(val_df[['label', 'clicked']], val_df['y_pred'])
print(f"L_score on validation set: {l_score_val:.4f}")

Train dataset size: 82163
Validation dataset size: 2786
Epoch 01/20, Training Loss: 2.5633
Epoch 02/20, Training Loss: 2.5608
Epoch 03/20, Training Loss: 2.5633
Epoch 04/20, Training Loss: 2.5628
Epoch 05/20, Training Loss: 2.5629
Epoch 06/20, Training Loss: 2.5641
Epoch 07/20, Training Loss: 2.5630
Epoch 08/20, Training Loss: 2.5614
Epoch 09/20, Training Loss: 2.5624
Epoch 10/20, Training Loss: 2.5606
Epoch 11/20, Training Loss: 2.5621
Epoch 12/20, Training Loss: 2.5637
Epoch 13/20, Training Loss: 2.5639
Epoch 14/20, Training Loss: 2.5655
Epoch 15/20, Training Loss: 2.5663
Epoch 16/20, Training Loss: 2.5622
Epoch 17/20, Training Loss: 2.5656
Epoch 18/20, Training Loss: 2.5655
Epoch 19/20, Training Loss: 2.5641
Epoch 20/20, Training Loss: 2.5628

Validation Loss: 2.3054
L_score on validation set: 0.0901
