# Introduction
How to open and understand the dataset

In [1]:
from time import  time_ns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import random
import torch.nn as nn
import torch.nn.utils.prune as prune

from torch.utils.tensorboard.writer import SummaryWriter
from datetime import datetime

from torchvision import models, transforms

  from .autonotebook import tqdm as notebook_tqdm


## Basic information
1. Hyperspectral data:
    1. `hsi_path` contains the path to a masked NumPy array holding hyperspectral data that underwent masking (i.e., the field area is masked, whereas all irrelevant areas are not masked)
    2. The name of the file (e.g., _'1989.npz'_) refers to the index of the corresponding training sample in the ground-truth table.
2. Ground-truth data:
    1. `gt_path` contains the path to the ground-truth .csv file.
    2. Additionally, `wavelength_path` contains the mapping between a band number and the corresponding wavelength.


In [2]:
# Root folder of the dataset; every path below is derived from it.
base_path = "/mnt/SoilEstimation/"

# One example hyperspectral cube, the ground-truth table and the
# band-number -> wavelength table.
hsi_path = f"{base_path}train_data/1570.npz"
gt_path = f"{base_path}train_gt.csv"
wavelength_path = f"{base_path}wavelengths.csv"

In [3]:
use_AdamW = True  # Flag utilized for switching between AdamW and SGD optimizer.

# Settings that are identical for both optimizer setups.
_common_settings = {
    'train_test_split': 0.1,  # train test split ratio
    'tr_val_split': 0.2,  # train validation split ratio
    'seed': 42,  # random seed
    'mini_batch_size': 128,
}

# AdamW together with the lr_scheduler CosineAnnealingWarmRestarts.
_adamw_settings = {
    'epochs': 75,
    'lr': 5e-4,  # learning rate
    'weight_decay': 5e-4,
    't_0': 5,  # Number of iterations for the first restart.
    'eta_min': 1e-5,  # Minimum learning rate
}

# SGD together with the lr_scheduler MultiStepLR.
_sgd_settings = {
    'epochs': 100,
    'lr': 1e-4,  # learning rate
    'weight_decay': 5e-4,
    'momentum': 0.9,
    'milestones': [50, 75, 90],  # List of epoch indices.
    'gamma': 0.2,  # Multiplicative factor of learning rate decay.
}

config = {**_common_settings, **(_adamw_settings if use_AdamW else _sgd_settings)}

In [4]:
def set_random_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    seeders = (
        random.seed,        # Python's built-in RNG
        np.random.seed,     # global NumPy RNG
        torch.manual_seed,  # PyTorch RNG for all devices (both CPU and CUDA)
    )
    for seed_fn in seeders:
        seed_fn(seed)

# Seed every RNG with the value stored in the configuration before anything
# random (weight initialisation, dataset split, shuffling) happens.
set_random_seed(seed=config['seed'])

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Read the ground-truth table and the band/wavelength table into DataFrames.
gt_df, wavelength_df = (
    pd.read_csv(csv_path) for csv_path in (gt_path, wavelength_path)
)

In [7]:
num_classes = 4
model101_path = "./model101"
model50_path = "./model50"


def _resnet_regressor(builder):
    """Build an untrained ResNet with ``builder`` and give it a ``num_classes``-output head."""
    net = builder(weights=None)
    net.fc = nn.Linear(net.fc.in_features, num_classes)
    return net.to(device)


# https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html
#model = models.resnet50(weights="DEFAULT")
model101 = _resnet_regressor(models.resnet101)
model50 = _resnet_regressor(models.resnet50)

# Last expression of the cell: displays the ResNet-50 architecture.
model50

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [8]:
# Regression objective: squared error averaged over every predicted value.
criterion = nn.MSELoss(reduction="mean")

## Load the data

In [9]:
import os
from glob import glob
from torchvision.transforms.functional import to_pil_image

def load_data(directory: str, bands=(0, 75, -1), size=(128, 128)):
    """Load every ``*.npz`` cube in ``directory`` as a masked, resized tensor.

    Files are processed in ascending order of the integer in their file name
    (``"2.npz"`` before ``"10.npz"``). For each file the selected ``bands`` are
    taken from the ``data`` and ``mask`` arrays, both are resized to ``size``
    and the data is multiplied by the inverted mask.

    Args:
        directory (str): Directory to either train or test set.
        bands: Indices kept along the first axis of ``data`` and ``mask``.
            Defaults to the previously hard-coded ``(0, 75, -1)``.
        size: Target ``(height, width)`` handed to ``transforms.Resize``.
            Defaults to the previously hard-coded ``(128, 128)``.
    Returns:
        list[torch.Tensor]: One float32 tensor of shape
        ``(len(bands), *size)`` per file; empty when no file matches.
    """
    band_idx = list(bands)
    # The resize transform does not depend on the file, so build it only once.
    resize = transforms.Resize(tuple(size), antialias=True)

    file_names = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda path: int(os.path.basename(path).replace(".npz", "")),
    )

    data = []
    for file_name in file_names:
        with np.load(file_name) as npz:
            cube = npz["data"][band_idx, :, :]
            masked_out = npz["mask"][band_idx, :, :]

        cube = resize(torch.tensor(cube, dtype=torch.float32))
        # Inverted mask as floats: 1.0 where the stored mask is False, 0.0 where
        # it is True. Resizing interpolates it, so border values may be fractional.
        keep = resize(torch.tensor(~masked_out, dtype=torch.float32))

        data.append(cube * keep)
    return data


def load_gt(file_path: str):
    """Read the soil-property targets from the ground truth file.

    Args:
        file_path (str): Path to the ground truth .csv file.
    Returns:
        numpy.ndarray: 2D array with one row per sample and the columns
        ``P``, ``K``, ``Mg`` and ``pH``, in that order.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    table = pd.read_csv(file_path)
    return table.loc[:, target_columns].to_numpy()


In [10]:
# Training cubes with their labels, plus the unlabeled test cubes.
X_train = load_data(f"{base_path}train_data")
y_train = load_gt(f"{base_path}train_gt.csv")
X_test = load_data(f"{base_path}test_data")

In [11]:
# Stack the per-sample tensors into one tensor and wrap the labels in a Tensor.
tensor_x = torch.stack(X_train, dim=0)
tensor_y = torch.Tensor(y_train)

# Number of training samples (first dimension of the stacked tensor).
batch_size = tensor_x.shape[0]

print(tensor_x.shape, tensor_y.shape, sep="\n")


torch.Size([1732, 3, 128, 128])
torch.Size([1732, 4])


In [12]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(tensor_x, tensor_y)

# `tr_val_split` is interpreted as the fraction of samples held out for
# validation, so the remaining (larger) share must go to the training subset.
# Passing the small share first would train on 20% and validate on 80%.
n_val = int(len(dataset) * config['tr_val_split'])
n = len(dataset) - n_val  # number of training samples
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [n, n_val])

# NOTE(review): the loaders use a batch size of 32 while
# config['mini_batch_size'] is 128 and unused — confirm which one is intended.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)

test_x = torch.stack(X_test)
testset = TensorDataset(test_x)

# No batch_size is given, so the test loader yields one sample per batch,
# and shuffle=False keeps the samples in their loading order.
test_dataloader = DataLoader(testset, shuffle=False)

## Train the models, make predictions and generate submission file

In [13]:
def train_one_epoch(model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``train_dataloader``, ``criterion`` and ``device``.

    Args:
        model: Network to train; it is called on every input batch.
        optimizer: Optimizer that updates ``model``'s parameters.
    Returns:
        float: Sample-weighted mean of the batch losses for this epoch, or
        ``nan`` when the loader yields no samples.
    """
    weighted_loss = 0.
    num_samples = 0

    for inputs, labels in train_dataloader:
        optimizer.zero_grad()

        outputs = model(inputs.to(device))

        loss = criterion(outputs, labels.to(device))
        loss.backward()
        optimizer.step()

        # The loss is a per-batch value; weight it by the batch size so that a
        # smaller final batch does not count as much as a full one.
        batch_len = inputs.size(0)
        weighted_loss += loss.item() * batch_len
        num_samples += batch_len

    if num_samples == 0:
        # Nothing was trained on; avoid a ZeroDivisionError.
        return float("nan")

    return weighted_loss / num_samples

In [14]:
def train(model, path):
    """Train ``model`` and save the weights of its best validation epoch to ``path``.

    Relies on the module-level ``config``, ``use_AdamW``, ``criterion``,
    ``device``, ``val_dataloader`` and ``train_one_epoch``. The training and
    validation loss of every epoch are printed and logged to a TensorBoard run
    directory named ``./eurosat_trainer_<timestamp>``.

    Args:
        model: Network to optimise.
        path: Destination of ``torch.save(model.state_dict(), path)``; it is
            rewritten every time the validation loss improves.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('./eurosat_trainer_{}'.format(timestamp))
    best_vloss = float('inf')

    if use_AdamW:
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'], momentum=config['momentum'], weight_decay=config['weight_decay'])
    # NOTE(review): `config` also carries lr-scheduler settings (t_0/eta_min,
    # milestones/gamma) but no scheduler is created here — confirm whether
    # that is intended.

    try:
        for epoch in range(config['epochs']):
            epoch_number = epoch + 1
            print('============= EPOCH {} ============='.format(epoch_number))

            # Make sure gradient tracking is on, and do a pass over the data
            model.train(True)
            avg_loss = train_one_epoch(model, optimizer)

            # Set the model to evaluation mode, disabling dropout and using
            # population statistics for batch normalization.
            model.eval()

            running_vloss = 0.0
            # Disable gradient computation and reduce memory consumption.
            with torch.no_grad():
                for vinputs, vlabels in val_dataloader:
                    voutputs = model(vinputs.to(device))
                    # .item() keeps the running sum a plain Python float.
                    running_vloss += criterion(voutputs, vlabels.to(device)).item()

            avg_vloss = running_vloss / len(val_dataloader)
            print('LOSS : train {} | valid {}'.format(round(avg_loss, 4), round(avg_vloss, 4)))

            # Log the running loss averaged per epoch for both training and validation
            writer.add_scalars('Training vs. Validation Loss',
                               {'Training': avg_loss, 'Validation': avg_vloss},
                               epoch_number)
            writer.flush()

            # Track best performance, and save the model's state
            if avg_vloss < best_vloss:
                best_vloss = avg_vloss
                torch.save(model.state_dict(), path)
    finally:
        # Release the event file even when training is interrupted.
        writer.close()

In [15]:
def train_knowledge_distillation_regression(teacher, student, T, soft_target_loss_weight, ce_loss_weight, device, student_path):
    """Train ``student`` against a frozen ``teacher`` and the true labels.

    Uses the module-level ``config``, ``train_dataloader`` and
    ``val_dataloader``. The per-batch objective is
    ``soft_target_loss_weight * MSE(student / T, teacher / T)
    + ce_loss_weight * MSE(student, labels)``.

    Args:
        teacher: Model providing the soft targets; only run without gradients.
        student: Model being optimised with AdamW.
        T: Divisor applied to both teacher and student outputs.
        soft_target_loss_weight: Weight of the teacher-matching term.
        ce_loss_weight: Weight of the true-label term.
        device: Device the batches are moved to.
        student_path: Where the student's state dict is saved whenever the
            validation loss improves.
    """
    regression_loss = nn.MSELoss()
    student_optimizer = torch.optim.AdamW(
        student.parameters(),
        lr=config['lr'],
        weight_decay=config['weight_decay'],
    )

    # The teacher only provides targets, so it stays in evaluation mode.
    teacher.eval()

    lowest_val_loss = float('inf')

    for epoch_idx in range(config['epochs']):
        student.train()

        print('============= EPOCH {} ============='.format(epoch_idx + 1))

        # ---- training pass ----
        train_total = 0.0
        for batch in train_dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            student_optimizer.zero_grad()
            with torch.no_grad():
                teacher_out = teacher(inputs)
            student_out = student(inputs)

            # Teacher-matching term on temperature-scaled outputs.
            # No need to scale by T**2 for regression.
            distill_term = regression_loss(student_out / T, teacher_out / T)
            # True-label term on the unscaled student outputs.
            supervised_term = regression_loss(student_out, labels)

            batch_loss = (
                soft_target_loss_weight * distill_term
                + ce_loss_weight * supervised_term
            )
            batch_loss.backward()
            student_optimizer.step()

            train_total += batch_loss.item()

        avg_loss = train_total / len(train_dataloader)

        # ---- validation pass: evaluation mode, no gradient tracking ----
        student.eval()
        val_total = 0.0
        with torch.no_grad():
            for vinputs, vlabels in val_dataloader:
                predictions = student(vinputs.to(device))
                val_total += regression_loss(predictions, vlabels.to(device)).item()

        avg_vloss = val_total / len(val_dataloader)
        print('LOSS : train {} | valid {}'.format(round(avg_loss, 4), round(avg_vloss, 4)))

        # Keep the checkpoint of the best validation epoch only.
        if avg_vloss < lowest_val_loss:
            lowest_val_loss = avg_vloss
            torch.save(student.state_dict(), student_path)


In [16]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

def size(path):
    """Return the size of the file at ``path`` in mebibytes (bytes / 1024**2)."""
    bytes_per_mib = 1024 ** 2
    return os.stat(path).st_size / bytes_per_mib

In [17]:
def predict(model, path):
    """Run ``model`` over ``test_dataloader`` and write the predictions to a CSV.

    Uses the module-level ``test_dataloader`` and ``device``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        path: Destination CSV with the columns ``P``, ``K``, ``Mg``, ``pH``
            and an index column labelled ``sample_index``.
    Returns:
        int: Nanoseconds spent in the inference loop (writing the CSV is not
        included).
    """
    model.eval()

    started_at = time_ns()
    with torch.no_grad():
        # Predict batch by batch and keep each result as a NumPy array on the CPU.
        batch_outputs = [
            model(batch[0].to(device)).cpu().numpy()
            for batch in test_dataloader
        ]
    elapsed_ns = time_ns() - started_at

    target_columns = ["P", "K", "Mg", "pH"]
    submission_df = pd.DataFrame(data=np.concatenate(batch_outputs), columns=target_columns)
    submission_df.to_csv(path, index_label="sample_index")

    return elapsed_ns

In [18]:
# train(model101,model101_path)
# train(model50,model50_path)

## Baseline

In [19]:
num_classes = 4

# Rebuild the ResNet-101 regressor and restore its trained weights.
baseline_model = models.resnet101(weights=None)
baseline_model.fc = nn.Linear(baseline_model.fc.in_features, num_classes)
# map_location keeps the load working on a CPU-only machine in case the
# checkpoint was saved from GPU tensors.
baseline_model.load_state_dict(torch.load(model101_path, map_location=device))
baseline_model.to(device)


### 1.35955
# NOTE(review): presumably the score recorded for this baseline submission — confirm.

# Reference measurements that the compressed models are compared against.
baseline_time = predict(baseline_model, "baseline_submission.csv")
baseline_size = size(model101_path)
baseline_parameters = count_parameters(baseline_model)

## Pruning

In [20]:
pruned_model = models.resnet101(weights=None)
pruned_model.fc = nn.Linear(pruned_model.fc.in_features, num_classes)
# map_location keeps the load working on a CPU-only machine in case the
# checkpoint was saved from GPU tensors.
pruned_model.load_state_dict(torch.load(model101_path, map_location=device))
pruned_model.to(device)
pruned_model.eval()

# Single source of truth for the pruning ratio. The output file names are
# derived from it so that they always match the amount that was applied
# (previously the files were tagged "0_2" while 0.4 was being pruned).
prune_amount = 0.4
amount_tag = str(prune_amount).replace(".", "_")  # 0.4 -> "0_4"

pruned_path = f"./pruned_model_{amount_tag}"

# Prune the weights of every Conv2d layer plus the final fully connected layer.
parameters_to_prune = [
    (module, "weight")
    for module in pruned_model.modules()
    if isinstance(module, torch.nn.Conv2d)
]
parameters_to_prune.append((pruned_model.fc, "weight"))

# Results recorded from earlier runs:
#
## send pruning 0.5
# The pruned model:
#  - has 100.00% of parameters
#  - is 1.00 times smaller
#  - is 1.01 times faster
#
## to send pruning 0.4
# The pruned model:
#  - has 100.00% of parameters
#  - is 1.00 times smaller
#  - is 1.02 times faster
#
## to send pruning 0.3
# The pruned model:
#  - has 100.00% of parameters
#  - is 1.00 times smaller
#  - is 0.96 times faster
#
## to send pruning 0.2
# The pruned model:
#  - has 100.00% of parameters
#  - is 1.00 times smaller
#  - is 1.15 times faster
#
# Unstructured pruning zeroes individual weights instead of removing them, so
# the parameter count and the size of the dense checkpoint stay the same.

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=prune_amount,
)

# Make the pruning permanent by dropping the re-parametrization of each
# pruned tensor, so the saved state dict has plain `weight` entries again.
for module, name in parameters_to_prune:
    prune.remove(module, name)

torch.save(pruned_model.state_dict(), pruned_path)

pruned_time = predict(pruned_model, f"pruned_submission_{amount_tag}.csv")
pruned_size = size(pruned_path)
pruned_parameters = count_parameters(pruned_model)

print(f"The pruned model:")
print(f" - has {(pruned_parameters/baseline_parameters)*100:.2f}% of parameters")
print(f" - is {baseline_size/pruned_size:.2f} times smaller")
print(f" - is {baseline_time/pruned_time:.2f} times faster")

The pruned model:
 - has 100.00% of parameters
 - is 1.00 times smaller
 - is 1.15 times faster


## Distillation

In [88]:
def _report_against_baseline(label, model, weights_path, submission_csv):
    """Time ``model`` on the test set and print its ratios against the baseline.

    Returns:
        tuple: ``(inference time in ns, checkpoint size in MiB, trainable parameter count)``.
    """
    elapsed = predict(model, submission_csv)
    megabytes = size(weights_path)
    n_params = count_parameters(model)
    print(f"The {label} model:")
    print(f" - has {(n_params/baseline_parameters)*100:.2f}% of parameters")
    print(f" - is {baseline_size/megabytes:.2f} times smaller")
    print(f" - is {baseline_time/elapsed:.2f} times faster")
    return elapsed, megabytes, n_params


# Teacher: the trained ResNet-101. map_location keeps every load below working
# on a CPU-only machine in case the checkpoints were saved from GPU tensors.
teacher_model = models.resnet101(weights=None)
teacher_model.fc = nn.Linear(teacher_model.fc.in_features, num_classes)
teacher_model.load_state_dict(torch.load(model101_path, map_location=device))
teacher_model.to(device)
teacher_model.eval()

# Reference ResNet-50 loaded from the checkpoint at `model50_path`.
small_model = models.resnet50(weights=None)
small_model.fc = nn.Linear(small_model.fc.in_features, num_classes)
small_model.load_state_dict(torch.load(model50_path, map_location=device))
small_model.to(device)
small_model.eval()

# Student: a ResNet-50 whose weights come from knowledge distillation.
student_path = "./student_model"
student_model = models.resnet50(weights=None)
student_model.fc = nn.Linear(student_model.fc.in_features, num_classes)
student_model.to(device)

# Uncomment to re-run the distillation instead of loading the saved student.
#train_knowledge_distillation_regression(teacher=teacher_model, student=student_model, T=2, soft_target_loss_weight=0.25, ce_loss_weight=0.75, device=device,student_path=student_path)
student_model.load_state_dict(torch.load(student_path, map_location=device))

small_time, small_size, small_parameters = _report_against_baseline(
    "small", small_model, model50_path, "small_submission.csv"
)

student_time, student_size, student_parameters = _report_against_baseline(
    "student", student_model, student_path, "student_submission.csv"
)

The small model:
 - has 55.32% of parameters
 - is 1.81 times smaller
 - is 1.69 times faster
The student model:
 - has 55.32% of parameters
 - is 1.81 times smaller
 - is 1.49 times faster


## Quantization

In [89]:
def cal(model,dataloader):
    with torch.no_grad():
        for _, data in enumerate(dataloader):
            inputs = data[0].to(device)
            model(inputs)

In [90]:
# The whole quantization comparison runs on the CPU (the "x86" qconfig is used below).
device = "cpu"

num_classes = 4
# From here on the baseline_* names refer to the float ResNet-50, which is the
# reference the quantized model is compared against.
baseline_model = models.resnet50(weights=None)
baseline_model.fc = nn.Linear(baseline_model.fc.in_features, num_classes)
# map_location keeps the load working on a CPU-only machine in case the
# checkpoint was saved from GPU tensors.
baseline_model.load_state_dict(torch.load(model50_path, map_location=device))
baseline_model.to(device)

baseline_time = predict(baseline_model, "baseline_submission.csv")
baseline_size = size(model50_path)
baseline_parameters = count_parameters(baseline_model)

quantized_path = "./quantized_model"

# Quantizable ResNet-50 variant initialised with the same float weights.
quantized_model=models.quantization.resnet50()
quantized_model.fc = nn.Linear(quantized_model.fc.in_features, num_classes)
quantized_model.load_state_dict(torch.load(model50_path, map_location=device))
quantized_model.to(device)
quantized_model.eval()

# Post-training static quantization; the order of the steps below matters:
# 1) attach the qconfig and fuse modules, 2) prepare (insert observers),
# 3) calibrate on data, 4) convert to the quantized modules.
quantized_model.qconfig = torch.quantization.get_default_qconfig("x86")
quantized_model.fuse_model(is_qat=False)
torch.quantization.prepare(quantized_model,inplace=True)

# Calibration pass: run the training batches through the prepared model.
cal(quantized_model,train_dataloader)

quantized_model.eval()
torch.quantization.convert(quantized_model,inplace=True)

torch.save(quantized_model.state_dict(), quantized_path)



In [91]:
# Measure the quantized model and compare it with the float ResNet-50 baseline.
quantized_time = predict(quantized_model, "quantized_submission.csv")
quantized_size = size(quantized_path)
quantized_parameters = count_parameters(quantized_model)

# NOTE(review): the recorded output shows 0.00% of parameters — presumably the
# converted modules no longer expose their weights through parameters(), so
# this figure does not reflect the real model size; verify.
report_lines = [
    "The quantized model:",
    f" - has {(quantized_parameters/baseline_parameters)*100:.2f}% of parameters",
    f" - is {baseline_size/quantized_size:.2f} times smaller",
    f" - is {baseline_time/quantized_time:.2f} times faster",
]
print("\n".join(report_lines))

The quantized model:
 - has 0.00% of parameters
 - is 3.91 times smaller
 - is 3.41 times faster
