In [5]:
# pandas is imported here as well so that this cell runs on a fresh kernel;
# the notebook's main import cell only appears further down.
import pandas as pd
from ydata_profiling import ProfileReport

# Raw training table that the profiling report below is built from.
train = pd.read_csv("hackaton_students_train.csv")

In [6]:
# Build an exploratory profiling report for the training table and export it
# as a standalone HTML file next to the notebook.
profile = ProfileReport(
    train,
    title="Raport danych: Hackaton Students",
    explorative=True,
)
profile.to_file("profiling_report.html")

100%|██████████| 54/54 [00:05<00:00,  9.44it/s]5<00:00, 10.75it/s, Describe variable: top_3_section]                  
Summarize dataset: 100%|██████████| 505/505 [00:58<00:00,  8.64it/s, Completed]                                                             
Generate report structure: 100%|██████████| 1/1 [00:13<00:00, 13.15s/it]
Render HTML: 100%|██████████| 1/1 [00:10<00:00, 10.90s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  9.04it/s]


In [2]:
conda install pytorch torchvision torchaudio cpuonly -c pytorch

Channels:
 - pytorch
 - conda-forge
 - bioconda
 - defaults
 - anaconda
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: c:\Users\Mikolaj\anaconda3\envs\djangovenv

  added / updated specs:
    - cpuonly
    - pytorch
    - torchaudio
    - torchvision


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    brotli-python-1.0.9        |  py312h5da7b33_9         347 KB
    cffi-1.17.1                |  py312h827c3e9_1         311 KB
    charset-normalizer-3.4.1   |     pyhd8ed1ab_0          46 KB  conda-forge
    cpuonly-2.0                |                0           2 KB  pytorch
    filelock-3.18.0            |     pyhd8ed1ab_0          17 KB  conda-forge
    freetype-2.10.4            |       h546665d_1         489 KB  conda-forge
    h2-4.2.0                   |     pyhd8ed1a



    current version: 24.11.3
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c conda-forge conda




In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from L_score import L_score

In [4]:
# Preview the first five target rows (5 is the default for DataFrame.head).
y_train.head(5)

Unnamed: 0,label,clicked
15364,1,0
13371,8,0
52339,7,0
40119,7,0
39677,4,0


In [6]:
class HackatonDataset(Dataset):
    """Map-style dataset yielding ``(features, label, clicked)`` tensor triples.

    For a given index the item consists of the float32 feature row, the int64
    class label and the float32 ``clicked`` flag stored at that row position.
    """

    def __init__(self, X, y_df):
        """Convert the feature matrix and the target frame into tensors.

        Args:
            X: Feature matrix with one row per sample. Either a pandas
                DataFrame or any array-like accepted by ``torch.tensor``,
                e.g. the NumPy array returned by a scikit-learn scaler.
            y_df: DataFrame whose 'label' column holds the class label and
                whose 'clicked' column holds the a-posteriori click flag.

        Raises:
            ValueError: If ``X`` and ``y_df`` have a different number of rows.
        """
        # pandas objects expose to_numpy(); plain arrays and lists are passed
        # through unchanged, so both DataFrame and ndarray inputs work.
        features = X.to_numpy() if hasattr(X, "to_numpy") else X
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(y_df["label"].values, dtype=torch.long)
        self.clicked = torch.tensor(y_df["clicked"].values, dtype=torch.float32)

        # __len__ is driven by the labels, so a silent mismatch would either
        # drop trailing feature rows or fail late inside the DataLoader.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} rows but y_df has {len(self.y)} rows"
            )

    def __len__(self):
        """Return the number of samples (rows of ``y_df``)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label, clicked)`` tensors stored at ``idx``."""
        return self.X[idx], self.y[idx], self.clicked[idx]

In [None]:
# --- 1. Load and prepare the data ---

# Read the train / validation features and targets.
X_train = pd.read_csv("data/x_train.csv")
y_train = pd.read_csv("data/y_train.csv")  # expected to contain 'label' and 'clicked'
X_val = pd.read_csv("data/x_valid.csv")
y_val = pd.read_csv("data/y_valid.csv")  # expected to contain 'label' and 'clicked'

# Keep numeric features only, and force the validation frame onto exactly the
# training columns (same set, same order) so both splits share one feature layout.
# NOTE(review): if the CSVs were written together with their index (an
# 'Unnamed: 0' column), that column is numeric and is kept as a feature - confirm.
X_train = X_train.select_dtypes(include=[np.number])
X_val = X_val[X_train.columns]

# Standardise the features: statistics are fitted on the training split only.
# The scaler returns plain NumPy arrays by default, so the results are wrapped
# back into DataFrames for downstream code that expects DataFrame inputs
# (e.g. code reading `.values`).
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)

# Build the datasets and the DataLoaders (only the training data is shuffled).
train_dataset = HackatonDataset(X_train_scaled, y_train)
val_dataset   = HackatonDataset(X_val_scaled, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

# --- 3. Definicja modelu (logistyczna regresja) ---

class LogisticRegressionModel(nn.Module):
    """Multinomial logistic regression: one linear layer producing class logits."""

    def __init__(self, input_dim, num_classes):
        """Create the single ``input_dim -> num_classes`` linear layer."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) logits for the batch ``x``."""
        return self.linear(x)

# Seed PyTorch's global RNG so that weight initialisation (and anything else
# drawing from the global generator, such as the shuffling DataLoader) is
# repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

input_dim = X_train_scaled.shape[1]
# Assumes 10 unique layout_type values; F.cross_entropy additionally requires
# the labels to be class indices in the range [0, num_classes).
num_classes = 10
model = LogisticRegressionModel(input_dim, num_classes)

# Optimiser used for training.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --- 4. Definicja customowego lossu ---

def custom_loss_fn(logits, target, clicked, lambda_val):
    """Per-sample cross-entropy shifted by a click-dependent constant, then averaged.

    Shift added to each sample's cross-entropy:

    - clicked == 0: +lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - clicked == 1: -lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - any other ``clicked`` value: no shift.

    The shift is built from ``argmax`` and constants, so it carries no gradient;
    only the cross-entropy term is differentiable with respect to ``logits``.

    Args:
        logits: Model output logits, one row per sample.
        target: True class indices (long tensor).
        clicked: Click flags (float tensor with values 0 or 1).
        lambda_val: The lambda parameter (float).

    Returns:
        Scalar tensor with the mean shifted loss over the batch.
    """
    per_sample_ce = F.cross_entropy(logits, target, reduction="none")
    # True where the highest-scoring class equals the true label.
    is_hit = torch.argmax(logits, dim=1).eq(target)

    miss_penalty = lambda_val / 10
    # Casting to the loss dtype mirrors writing the values into a
    # zeros_like(per_sample_ce) buffer.
    shift_no_click = torch.where(is_hit, lambda_val, miss_penalty).to(per_sample_ce.dtype)
    shift_click = torch.where(is_hit, -lambda_val, miss_penalty).to(per_sample_ce.dtype)

    # Pick the shift matching each sample's click flag; anything that is
    # neither 0 nor 1 keeps a zero shift.
    shift = torch.where(
        clicked == 0,
        shift_no_click,
        torch.where(clicked == 1, shift_click, torch.zeros_like(per_sample_ce)),
    )
    return (per_sample_ce + shift).mean()

# --- 5. Training loop driven by the custom loss ---

num_epochs = 20
lambda_val = 0.1  # fixed value of the lambda parameter - open to experimentation

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for X_batch, y_batch, clicked_batch in train_loader:
        # Standard optimisation step: reset grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    # Reported value is the plain mean of the per-batch losses of this epoch.
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

# --- 6. Model validation ---

model.eval()
val_losses, all_preds, all_targets, all_clicked = [], [], [], []

# No gradients are needed while scoring the validation split.
with torch.no_grad():
    for X_batch, y_batch, clicked_batch in val_loader:
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        # Predicted class = index of the largest logit.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())

avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")

# --- 7. Model evaluation with the L_score metric ---

# Copy of the validation targets extended with the model's predictions.
# val_loader is not shuffled, so all_preds follows the row order of y_val.
val_df = y_val.assign(y_pred=all_preds)

l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

Epoch  1/20, Training Loss: 2.3511
Epoch  2/20, Training Loss: 2.3211
Epoch  3/20, Training Loss: 2.3202
Epoch  4/20, Training Loss: 2.3202
Epoch  5/20, Training Loss: 2.3203
Epoch  6/20, Training Loss: 2.3201
Epoch  7/20, Training Loss: 2.3200
Epoch  8/20, Training Loss: 2.3202
Epoch  9/20, Training Loss: 2.3200
Epoch 10/20, Training Loss: 2.3203
Epoch 11/20, Training Loss: 2.3202
Epoch 12/20, Training Loss: 2.3200
Epoch 13/20, Training Loss: 2.3200
Epoch 14/20, Training Loss: 2.3201
Epoch 15/20, Training Loss: 2.3199
Epoch 16/20, Training Loss: 2.3199
Epoch 17/20, Training Loss: 2.3199
Epoch 18/20, Training Loss: 2.3199
Epoch 19/20, Training Loss: 2.3200
Epoch 20/20, Training Loss: 2.3200

Validation Loss: 2.3217
L_score na zbiorze walidacyjnym: 0.1061


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# --- 1. Load and prepare the data ---

X_train = pd.read_csv("data/x_train.csv")
y_train = pd.read_csv("data/y_train.csv")  # expected to contain 'label' and 'clicked'
X_val = pd.read_csv("data/x_valid.csv")
y_val = pd.read_csv("data/y_valid.csv")  # expected to contain 'label' and 'clicked'

# Keep numeric features only, and force the validation frame onto exactly the
# training columns (same set, same order) so both splits share one feature layout.
# NOTE(review): if the CSVs were written together with their index (an
# 'Unnamed: 0' column), that column is numeric and is kept as a feature - confirm.
X_train = X_train.select_dtypes(include=[np.number])
X_val = X_val[X_train.columns]

# Standardise the features: statistics are fitted on the training split only.
# The scaler returns plain NumPy arrays by default, so the results are wrapped
# back into DataFrames for downstream code that expects DataFrame inputs
# (e.g. code reading `.values`).
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)

# Build the datasets and the DataLoaders (only the training data is shuffled).
train_dataset = HackatonDataset(X_train_scaled, y_train)
val_dataset   = HackatonDataset(X_val_scaled, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

# --- 3. Definicja nowego modelu sieci neuronowej ---

class BetterMLP(nn.Module):
    """Two-hidden-layer MLP (64 -> 32 units) with batch norm and dropout."""

    def __init__(self, input_dim, num_classes):
        """Create the layers; each hidden stage is Linear + BatchNorm1d + Dropout(0.5)."""
        super().__init__()
        # Hidden stage 1: input_dim -> 64
        self.fc1 = nn.Linear(input_dim, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.5)

        # Hidden stage 2: 64 -> 32
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.5)

        # Output layer: 32 -> num_classes
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return class logits; each hidden stage applies Linear, BatchNorm, ReLU, Dropout."""
        hidden = self.fc1(x)
        hidden = self.dropout1(F.relu(self.bn1(hidden)))
        hidden = self.fc2(hidden)
        hidden = self.dropout2(F.relu(self.bn2(hidden)))
        return self.fc3(hidden)

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# anything else drawing from the global generator (such as the shuffling
# DataLoader) is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

input_dim = X_train_scaled.shape[1]
# Assumes 10 unique classes; F.cross_entropy additionally requires the labels
# to be class indices in the range [0, num_classes).
num_classes = 10
model = BetterMLP(input_dim, num_classes)

# Optimiser used for training.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --- 4. Definicja customowego lossu ---

def custom_loss_fn(logits, target, clicked, lambda_val):
    """Per-sample cross-entropy shifted by a click-dependent constant, then averaged.

    Shift added to each sample's cross-entropy:

    - clicked == 0: +lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - clicked == 1: -lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - any other ``clicked`` value: no shift.

    The shift is built from ``argmax`` and constants, so it carries no gradient;
    only the cross-entropy term is differentiable with respect to ``logits``.

    Args:
        logits: Model output logits, one row per sample.
        target: True class indices (long tensor).
        clicked: Click flags (float tensor with values 0 or 1).
        lambda_val: The lambda parameter (float).

    Returns:
        Scalar tensor with the mean shifted loss over the batch.
    """
    per_sample_ce = F.cross_entropy(logits, target, reduction="none")
    # True where the highest-scoring class equals the true label.
    is_hit = torch.argmax(logits, dim=1).eq(target)

    miss_penalty = lambda_val / 10
    # Casting to the loss dtype mirrors writing the values into a
    # zeros_like(per_sample_ce) buffer.
    shift_no_click = torch.where(is_hit, lambda_val, miss_penalty).to(per_sample_ce.dtype)
    shift_click = torch.where(is_hit, -lambda_val, miss_penalty).to(per_sample_ce.dtype)

    # Pick the shift matching each sample's click flag; anything that is
    # neither 0 nor 1 keeps a zero shift.
    shift = torch.where(
        clicked == 0,
        shift_no_click,
        torch.where(clicked == 1, shift_click, torch.zeros_like(per_sample_ce)),
    )
    return (per_sample_ce + shift).mean()

# --- 5. Training loop driven by the custom loss ---

num_epochs = 20
lambda_val = 0.1  # fixed value of the lambda parameter - open to experimentation

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for X_batch, y_batch, clicked_batch in train_loader:
        # Standard optimisation step: reset grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    # Reported value is the plain mean of the per-batch losses of this epoch.
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

# --- 6. Model validation ---

model.eval()
val_losses, all_preds, all_targets, all_clicked = [], [], [], []

# No gradients are needed while scoring the validation split.
with torch.no_grad():
    for X_batch, y_batch, clicked_batch in val_loader:
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        # Predicted class = index of the largest logit.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())

avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")


# Copy of the validation targets extended with the model's predictions, used
# to compute L_score. val_loader is not shuffled, so all_preds follows the row
# order of y_val.
val_df = y_val.assign(y_pred=all_preds)

l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

Epoch  1/20, Training Loss: 2.3503
Epoch  2/20, Training Loss: 2.3219
Epoch  3/20, Training Loss: 2.3200
Epoch  4/20, Training Loss: 2.3196
Epoch  5/20, Training Loss: 2.3193
Epoch  6/20, Training Loss: 2.3194
Epoch  7/20, Training Loss: 2.3189
Epoch  8/20, Training Loss: 2.3184
Epoch  9/20, Training Loss: 2.3186
Epoch 10/20, Training Loss: 2.3186
Epoch 11/20, Training Loss: 2.3181
Epoch 12/20, Training Loss: 2.3183
Epoch 13/20, Training Loss: 2.3181
Epoch 14/20, Training Loss: 2.3184
Epoch 15/20, Training Loss: 2.3183
Epoch 16/20, Training Loss: 2.3184
Epoch 17/20, Training Loss: 2.3183
Epoch 18/20, Training Loss: 2.3181
Epoch 19/20, Training Loss: 2.3183
Epoch 20/20, Training Loss: 2.3178

Validation Loss: 2.3186
L_score na zbiorze walidacyjnym: 0.0964


In [10]:
# Targets for both splits.
y_train = pd.read_csv("data/y_train.csv")
y_val = pd.read_csv("data/y_valid.csv")

# Feature tables read from the feature-engineering output; no scaling is
# applied in this cell (presumably the files are already preprocessed - confirm).
X_train_scaled = pd.read_csv("data/feature_engineering/processed_x_train.csv")
X_val_scaled = pd.read_csv("data/feature_engineering/processed_x_valid.csv")

# Wrap the frames in datasets and batch them; only the training data is shuffled.
train_dataset = HackatonDataset(X=X_train_scaled, y_df=y_train)
val_dataset = HackatonDataset(X=X_val_scaled, y_df=y_val)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

# --- 3. Definicja nowego modelu sieci neuronowej ---

class BetterMLP(nn.Module):
    """Two-hidden-layer MLP (64 -> 32 units) with batch norm and dropout."""

    def __init__(self, input_dim, num_classes):
        """Create the layers; each hidden stage is Linear + BatchNorm1d + Dropout(0.5)."""
        super().__init__()
        # Hidden stage 1: input_dim -> 64
        self.fc1 = nn.Linear(input_dim, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(0.5)

        # Hidden stage 2: 64 -> 32
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.5)

        # Output layer: 32 -> num_classes
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return class logits; each hidden stage applies Linear, BatchNorm, ReLU, Dropout."""
        hidden = self.fc1(x)
        hidden = self.dropout1(F.relu(self.bn1(hidden)))
        hidden = self.fc2(hidden)
        hidden = self.dropout2(F.relu(self.bn2(hidden)))
        return self.fc3(hidden)

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# anything else drawing from the global generator (such as the shuffling
# DataLoader) is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

input_dim = X_train_scaled.shape[1]
# Assumes 10 unique classes; F.cross_entropy additionally requires the labels
# to be class indices in the range [0, num_classes).
num_classes = 10
model = BetterMLP(input_dim, num_classes)

# Optimiser used for training.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# --- 4. Definicja customowego lossu ---

def custom_loss_fn(logits, target, clicked, lambda_val):
    """Per-sample cross-entropy shifted by a click-dependent constant, then averaged.

    Shift added to each sample's cross-entropy:

    - clicked == 0: +lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - clicked == 1: -lambda_val if the argmax prediction is correct,
      +lambda_val / 10 otherwise.
    - any other ``clicked`` value: no shift.

    The shift is built from ``argmax`` and constants, so it carries no gradient;
    only the cross-entropy term is differentiable with respect to ``logits``.

    Args:
        logits: Model output logits, one row per sample.
        target: True class indices (long tensor).
        clicked: Click flags (float tensor with values 0 or 1).
        lambda_val: The lambda parameter (float).

    Returns:
        Scalar tensor with the mean shifted loss over the batch.
    """
    per_sample_ce = F.cross_entropy(logits, target, reduction="none")
    # True where the highest-scoring class equals the true label.
    is_hit = torch.argmax(logits, dim=1).eq(target)

    miss_penalty = lambda_val / 10
    # Casting to the loss dtype mirrors writing the values into a
    # zeros_like(per_sample_ce) buffer.
    shift_no_click = torch.where(is_hit, lambda_val, miss_penalty).to(per_sample_ce.dtype)
    shift_click = torch.where(is_hit, -lambda_val, miss_penalty).to(per_sample_ce.dtype)

    # Pick the shift matching each sample's click flag; anything that is
    # neither 0 nor 1 keeps a zero shift.
    shift = torch.where(
        clicked == 0,
        shift_no_click,
        torch.where(clicked == 1, shift_click, torch.zeros_like(per_sample_ce)),
    )
    return (per_sample_ce + shift).mean()

# --- 5. Training loop driven by the custom loss ---

num_epochs = 20
lambda_val = 0.1  # fixed value of the lambda parameter - open to experimentation

for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    for X_batch, y_batch, clicked_batch in train_loader:
        # Standard optimisation step: reset grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    # Reported value is the plain mean of the per-batch losses of this epoch.
    print(f"Epoch {epoch+1:2d}/{num_epochs}, Training Loss: {np.mean(epoch_losses):.4f}")

# --- 6. Model validation ---

model.eval()
val_losses, all_preds, all_targets, all_clicked = [], [], [], []

# No gradients are needed while scoring the validation split.
with torch.no_grad():
    for X_batch, y_batch, clicked_batch in val_loader:
        logits = model(X_batch)
        loss = custom_loss_fn(logits, y_batch, clicked_batch, lambda_val)
        val_losses.append(loss.item())
        # Predicted class = index of the largest logit.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())
        all_clicked.extend(clicked_batch.cpu().numpy())

avg_val_loss = np.mean(val_losses)
print(f"\nValidation Loss: {avg_val_loss:.4f}")


# Copy of the validation targets extended with the model's predictions, used
# to compute L_score. val_loader is not shuffled, so all_preds follows the row
# order of y_val.
val_df = y_val.assign(y_pred=all_preds)

l_score_val = L_score(val_df[['label','clicked']], val_df['y_pred'])
print(f"L_score na zbiorze walidacyjnym: {l_score_val:.4f}")

Epoch  1/20, Training Loss: 2.3484
Epoch  2/20, Training Loss: 2.3210
Epoch  3/20, Training Loss: 2.3198
Epoch  4/20, Training Loss: 2.3195
Epoch  5/20, Training Loss: 2.3192
Epoch  6/20, Training Loss: 2.3187
Epoch  7/20, Training Loss: 2.3188
Epoch  8/20, Training Loss: 2.3187
Epoch  9/20, Training Loss: 2.3182
Epoch 10/20, Training Loss: 2.3184
Epoch 11/20, Training Loss: 2.3184
Epoch 12/20, Training Loss: 2.3183
Epoch 13/20, Training Loss: 2.3182
Epoch 14/20, Training Loss: 2.3181
Epoch 15/20, Training Loss: 2.3180
Epoch 16/20, Training Loss: 2.3181
Epoch 17/20, Training Loss: 2.3179
Epoch 18/20, Training Loss: 2.3182
Epoch 19/20, Training Loss: 2.3180
Epoch 20/20, Training Loss: 2.3179

Validation Loss: 2.3016
L_score na zbiorze walidacyjnym: 0.1031
