# Enviornment setup

The goal of this first partical is to verify that your environment is correctly set up.

## Import package

It is good practice to import all necessary packages at the top of Python files or in the first code cell of a Python notebook.

In [1]:
import torch
import sklearn
import mlc
from mlc.datasets.dataset_factory import get_dataset
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


We check the correct version are installed.

In [2]:
for pkg, version in [(torch, "1.12.1"), (sklearn, "1.2.1"), (mlc, "0.1.0")]:
    if version in pkg.__version__:
        print(f"OK: {pkg.__name__}=={pkg.__version__}.")
    else:
        print(f"Version mismatch: expected version {version} for package {pkg.__name__} but is currently {pkg.__version__}")

OK: torch==1.12.1+cu102.
OK: sklearn==1.2.1.
OK: mlc==0.1.0.


## Retrieve data

In this section we will download and load a feature engineered version of the popular Lending Club Loan Data dataset ([LCLD](https://www.kaggle.com/datasets/wordsforthewise/lending-club)).

In [3]:
dataset = get_dataset("lcld_v2_iid")
x, y = dataset.get_x_y()
metadata = dataset.get_metadata(only_x=True)

In [4]:
# Splitting the data
splits = dataset.get_splits()
x_train, x_val, x_test = x.iloc[splits["train"]], x.iloc[splits["val"]], x.iloc[splits["test"]]
y_train, y_val, y_test = y[splits["train"]], y[splits["val"]], y[splits["test"]]


In [5]:
# Scaling the data, the metadata contains for each feature, its name (feature) and its type (type).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), metadata[metadata["type"] != "cat"]["feature"]),
        ('cat', OneHotEncoder(), metadata[metadata["type"] == "cat"]["feature"])
    ])

preprocessor.fit(x)
x_train = preprocessor.transform(x_train)
x_val =  preprocessor.transform(x_val)
x_test =  preprocessor.transform(x_test)

## Fit sklearn model

In [6]:
model = RandomForestClassifier(n_estimators=10, class_weight="balanced", n_jobs=-1, verbose=2)
model.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.2s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.4s finished


In [7]:
# Model prediction
y_score = model.predict_proba(x_test)


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.4s remaining:    0.9s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.4s finished


In [8]:
# Model scoring
auc = roc_auc_score(y_test, y_score[:, 1])
print(f"The AUROC score of the model is {auc}")

The AUROC score of the model is 0.650728917611888


In [9]:
x_train.shape

(494088, 50)

## Fit torch Neural Network

In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = nn.Linear(50, 64)
        self.l2 = nn.Linear(64, 32)
        self.l3 = nn.Linear(32, 16)
        self.l4 = nn.Linear(16, 2)

    def forward(self, x):
        x = self.l1(x)
        x = self.l2(x)
        x = self.l3(x)
        x = self.l4(x)
        return x



In [11]:
class_weight = torch.Tensor(
    1 - torch.unique(torch.tensor(y_train), return_counts=True)[1] / len(y_train)
)
print(f"Class weight {class_weight}")

Class weight tensor([0.2009, 0.7991])


In [12]:
model = Net()
optimizer = optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.01,
)

In [13]:
def train_loop(dataloader, model, loss_fn, optimizer, batch_size):
    size = len(dataloader.dataset)
    for batch, (X, y) in tqdm(enumerate(dataloader), total=int(size/batch_size)):
        # if batch % 10 == 0:
        #     print(f"Batch {batch}.")
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def val_loop(dataloader, model, loss_fn, epoch_i):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y[:, 1]).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Epoch {epoch_i}, Val Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")



def train_model(model, x_train, y_train, x_val, y_val, optimizer, batch_size, loss_func, epochs):
    # Data processing
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
    )
    val_dataset = TensorDataset(x_val, y_val)
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=2000,
        shuffle=True,
        num_workers=2,
    )

    # Main train loop
    for epoch in range(epochs):
        train_loop(train_loader, model, loss_func, optimizer, batch_size)
        val_loop(val_loader, model, loss_func, epoch)




In [14]:
loss = nn.CrossEntropyLoss(weight=class_weight)
train_model(
    model,
    torch.from_numpy(x_train).float(),
    torch.from_numpy(np.array([1 - y_train, y_train]).T).float(),
    torch.from_numpy(x_val).float(),
    torch.from_numpy(np.array([1 - y_val, y_val]).T).float(),
    optimizer,
    1024,
    loss,
    2
)

483it [00:02, 200.60it/s]                         


Epoch 0, Val Error: Accuracy: 70.2%, Avg loss: 0.200828


483it [00:02, 234.66it/s]                         


Epoch 1, Val Error: Accuracy: 65.9%, Avg loss: 0.199612


In [15]:
# Model prediction
y_score = model(torch.from_numpy(x_test).float()).detach().numpy()


In [16]:
# Model scoring
auc = roc_auc_score(y_test, y_score[:, 1])
print(f"The AUROC score of the model is {auc}")

The AUROC score of the model is 0.7125688608134522


## Future practical

If you reach this section without trouble, you should be able to complete future practicals.
Simply open the folder of the other practicals in VSCode, install the dependencies with `uv sync`, and select the correct Python interpreter.

If any questions remain to setup your environment, do not hesitate to contact the teaching team.