# Creating Training Loop

## VERSIONS
- 00_00: 
    - Initial Version

## Imports

In [2]:
# from importlib.metadata import version
import pandas as pd
# import seaborn as sn
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn import Module # For type hinting


## Data Preparation

### Custom Dataset

In [3]:
class WeatherDataset(Dataset):
    """Dataset class For the CA Weather Fire Dataset"""
    def __init__(self, csv_file="../Data/CA_Weather_Fire_Dataset_Cleaned.csv"):
        try:
            self.data = pd.read_csv(csv_file)   # Assign a pandas data frame
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {csv_file}")

        # Define feature and label columns
        self.feature_columns = self.data.columns.drop("MAX_TEMP")
        self.label_column = "MAX_TEMP"
        

    def __getitem__(self, index):
        features = self.data.loc[index, self.feature_columns].values
        
        label = self.data.loc[index, self.label_column] # Extract the label for the given index
        return (
            torch.tensor(features, dtype=torch.float),
            torch.tensor(label, dtype=torch.float)
        )

    def __len__(self):
        return len(self.data)

### Data Pipeline

In [4]:
def data_pipeline(root_data_dir: str= "../Data", data_file_path: str="CA_Weather_Fire_Dataset_Cleaned.csv", data_splits_dir: str="DataSplits", batch_size: int=64, num_workers=0, pin_memory: bool=False, drop_last: bool=True) -> tuple[Dataset, Dataset, Dataset, DataLoader, DataLoader, DataLoader]:
    """This function prepares the train, test, and validation datasets.
    Args:
        root_data_dir (str): The root of the Data Directory
        data_file_path (str): The name of the original dataset (with .csv file extension).
        data_splits_dir (str): Path to the train, test, and validation datasets.
        batch_size (int): The dataloader's batch_size.
        num_workers (int): The dataloader's number of workers.
        pin_memory (bool): The dataloader's pin memory option.
        drop_last (bool): The dataloader's drop_last option.

    Returns: 
        train_dataset (Dataset): Dataset Class for the training dataset.
        test_dataset (Dataset): Dataset Class for the test dataset.
        validation_dataset (Dataset): Dataset Class for the validation dataset.
        train_dataloader (DataLoader): The train dataloader.
        test_dataloader (DataLoader): The test dataloader.
        validation_dataloader (DataLoader): The validation dataloader.
        """
    
    if not root_data_dir or not data_file_path or not data_splits_dir:  # Check for empty strings at the beginning
        raise ValueError("File and directory paths cannot be empty strings.")
    
    WEATHER_DATA_DIR = Path(root_data_dir)                  # Set the Data Root Directory

    WEATHER_DATA_CLEAN_PATH = WEATHER_DATA_DIR / data_file_path # Set the path to the complete dataset

    if WEATHER_DATA_CLEAN_PATH.exists():
        print(f"CSV file detected, reading from {WEATHER_DATA_CLEAN_PATH}")
        df = pd.read_csv(WEATHER_DATA_CLEAN_PATH)
    else:
        print(f"Downloading csv file from HuggingFace")
        try:
            df = pd.read_csv("hf://datasets/MaxPrestige/CA_Weather_Fire_Dataset_Cleaned/Data/CA_Weather_Fire_Dataset_Cleaned.csv")  # Download and read the data into a pandas dataframe
            os.makedirs(WEATHER_DATA_DIR, exist_ok=True)        # Create the Data Root Directory
            df.to_csv(WEATHER_DATA_CLEAN_PATH, index=False)     # Save the file, omitting saving the index
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred during data download or saving: {e}")
    
    DATA_SPLITS_DIR = WEATHER_DATA_DIR / data_splits_dir
    TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
    TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
    VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"

    if os.path.exists(TRAIN_DATA_PATH) and os.path.exists(TEST_DATA_PATH) and os.path.exists(VALIDATION_DATA_PATH) :
        print(f"Train, Test, and Validation csv datasets detected in '{DATA_SPLITS_DIR}', skipping generation")
    else:
        print(f"Datasets not found in '{DATA_SPLITS_DIR}' or incomplete. Generating datasets...")
        os.makedirs(DATA_SPLITS_DIR, exist_ok=True)     # Create the Data Splits Parent Directory

        shuffled_data = df.sample(frac=1, random_state=42).reset_index(drop=True)   # Shuffle data around to allow data splits to incorporate different years
        # Data Splitting
        num_samples = len(shuffled_data)   # Get the number of samples
        train_size=.80          # 80% of the data will be used for training
        test_size=.10
        val_size=.10

        train_index = int(num_samples * train_size)
        test_end_index = int(num_samples * (train_size + test_size))

        train_data_frame = shuffled_data.iloc[:train_index]
        test_data_frame = shuffled_data.iloc[train_index:test_end_index]
        validation_data_frame = shuffled_data.iloc[test_end_index:]
        # Saving the split data to csv files
        train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
        test_data_frame.to_csv(TEST_DATA_PATH, index=False)
        validation_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)

    print(f"Initializing DataLoaders and Returning")
    # Initialize the Different Datasets
    train_dataset = WeatherDataset(TRAIN_DATA_PATH)
    test_dataset = WeatherDataset(TEST_DATA_PATH)
    validation_dataset = WeatherDataset(VALIDATION_DATA_PATH)
    # Initialize the Different DataLoaders using the Datasets
    print(f"Creating DataLoaders with batch_size ({batch_size}), num_workers ({num_workers}), pin_memory ({pin_memory}). Training dataset drop_last: ({drop_last})")
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last)
    validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last)

    return (train_dataset, test_dataset, validation_dataset, train_dataloader, test_dataloader, validation_dataloader)
        

## Agent Architecture

### Layer Block

In [5]:
class LayerBlock(torch.nn.Module):
    """Class for the individual layer blocks."""
    def __init__(self, intermediate_dim=32):
        super().__init__()
        self.Layer1 = torch.nn.Linear(in_features=intermediate_dim, out_features=intermediate_dim)
        self.ReLu = torch.nn.ReLU()
        # self.Layer_Norm1 = torch.nn.LayerNorm(normalized_shape=intermediate_dim)

    def forward(self, x):
        x = self.Layer1(x)
        x = self.ReLu(x)
        # x = self.Layer_Norm1(x)
        return x

### Weather Agent

In [6]:
class WeatherAgent(torch.nn.Module):
    """Class for Agent Structure using multiple Layer Blocks."""
    def __init__(self, cfg):
        super().__init__()
        self.L1 = torch.nn.Linear(in_features=cfg["in_dim"], out_features=cfg["intermediate_dim"])
        
        self.Layers = torch.nn.Sequential(
            *[LayerBlock(cfg["intermediate_dim"]) for _ in range(cfg["num_blocks"])]
        )
        self.out = torch.nn.Linear(in_features=cfg["intermediate_dim"], out_features=cfg["out_dim"])

    def forward(self, x):
        x = self.L1(x)
        x = self.Layers(x)
        x = self.out(x)
        return x

## Main

### Log Iteration Functions

In [7]:
def log_iteration(batch_idx: int, total_batches: int, loss_value: float):
    """Logs the loss of the current batch."""
    print(f"Epoch batch [{batch_idx}/{total_batches}] | Loss: {loss_value:.7f}")

In [8]:
def log_epoch_iteration(epoch: int, avg_epoch_loss: float):
    """Log Current Metrics accumulated in the current epoch iteration.
    Args:
        epoch (int): the current iteration
        avg_epoch_loss (float): The average loss of the current epoch
    Returns:
        N/A
        """
    if avg_epoch_loss:
        print(f"=====================  [EPOCH ({epoch}) LOGGING]  =====================")
        print("| AVERAGES of THIS EPOCH:")
        print(f"| ACCUMULATED LOSS: {avg_epoch_loss:.7f}")
        print(f"===========================================================")
    
    else:
        print("No Data collected for this epoch to log")

### Evaluate Model Function

In [9]:
def evaluate_model(model: Module, dataloader: DataLoader, current_epoch: int = None, max_epochs: int=None, device: str = 'cpu') -> float:
    """
    Evaluates the model on a given dataset and returns the average loss.
    Args:
        model (Module): The Model.
        dataloader (DataLoader): The dataloader to calculate average loss with.
        current_epoch (int): The current epoch [optional].
        max_epochs (int): The maximum number of epochs [optional].
        device (str): The device that the calculations will take place on.
    Returns:
        avg_loss (float): The calculated average loss.
    """
    model.eval()
    total_loss = 0.0
    loss_fn = torch.nn.MSELoss(reduction='sum') # Use reduction='sum' instead of 'mean' for total loss

    if len(dataloader.dataset) == 0:
        print("Warning: Evaluation dataset is empty. Skipping evaluation.")
        return float('nan')
    
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.unsqueeze(dim=-1).to(device)
            outputs = model(batch_inputs)
            loss = loss_fn(outputs, batch_labels)
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader.dataset)     # Calculate the average loss on the dataset

    if current_epoch and max_epochs:   # If the function was called in the training loop
        print(f"===================  [Epoch ({current_epoch}/{max_epochs})]  ===================")
        print(f"Entire Validation Dataset Average Loss: {avg_loss:.4f}")
        print(f"====================================================")

    else:   # If the function was called outside of the training loop
        print(f"===============================================")
        print(f"Entire Dataset Average Loss: {avg_loss:.4f} ")
        print(f"=====================================================")
            
    return avg_loss

### Train Model Function

In [10]:
def train_model(model_config: dict, train_dataloader: DataLoader, validation_dataloader: DataLoader, model: Module = None, epochs=32, learning_rate=0.0003, max_grad_norm=0.5, log_iterations=10, eval_iterations=10, device="cpu") -> Module:
    """The Model Training function.

    Args:
        model_config (dict): The base configurations for building the policies.
        train_dataloader (DataLoader): The dataloader for the training loop.
        validation_dataloader (DataLoader): The dataloader for the validation loop.
        model (WeatherAgent): The model to be trained.
        epochs (int): The number of times the outer loop is performed.
        learning_rate (float): The hyperparameter that affects how much the model's parameters learn on each update iteration.
        max_grad_norm (float): Used to promote numerical stability and prevent exploding gradients.
        log_iterations (int): Used to log information about the state of the Agent.
        eval_iterations (int): Used to run an evaluation of the Agent.
        device (str): The device that the model will be trained on.

    Returns: 
        agent (Module): The Trained Model in evaluation mode.
    """
    print(f"Training Model on {device} with {epochs} main epochs, {learning_rate} learning rate, max_grad_norm={max_grad_norm}.")
    print(f"Logging every {log_iterations} epoch iterations, evaluating every {eval_iterations} epoch iterations.")

    agent = (model if model is not None else WeatherAgent(model_config)).to(device) # Create agent if nothing was passed, otherwise, create the agent. Send agent to device.

    optimizer = torch.optim.Adam(params=agent.parameters(), lr=learning_rate)   # Define the model optimization algorithm
    loss_fn = torch.nn.MSELoss(reduction='mean')       # Define the Loss function

    agent.train()   # Set agent to training mode
    history = {'train_loss': [], 'val_loss': []}

    train_dataloader_length= len(train_dataloader)
    for epoch in tqdm(range(epochs), desc=f">>>>>>>>>>>>>>>>>>>>>\nMain Epoch (Outer Loop)", leave=True):

        epoch_loss_total = 0.0
        for batch_idx, (inputs, labels) in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Training", leave=False)):           # Get a mini-batch of training examples from the dataloader
            optimizer.zero_grad(set_to_none=True)       # Clear the gradients built up; Setting to None to improve performance
            inputs, labels = inputs.to(device), labels.unsqueeze(dim=-1).to(device)   # Move the inputs and labels to the device

            agent_outputs = agent(inputs)       # Pass the inputs to the model and get the outputs.

            print(f"Model Outputs:")
            print(f"{agent_outputs.squeeze()}")

            print(f"labels:")
            print(f"{labels.squeeze()}")

            loss = loss_fn(agent_outputs, labels)      # Calculate the mini-batch loss
            # print(f"Current Loss {loss.item()}")
            epoch_loss_total += loss.item()
            
            loss.backward()         # Calculate the loss with respect to the model parameters
            torch.nn.utils.clip_grad_norm_(parameters=agent.parameters(), max_norm=max_grad_norm)   # Prevent the gradients from affecting the model parameters too much and reduce the risk of exploding gradients
            optimizer.step()      # Update the model's parameters using the learning rate

            # LOGGING LOSS OF CURRENT ITERATION
            if (batch_idx + 1) % log_iterations == 0:
                log_iteration(batch_idx=(batch_idx + 1), total_batches=train_dataloader_length, loss_value=loss.item())

        # CALCULATE AND STORE THE AVERAGE EPOCH LOSS
        epoch_avg_loss = epoch_loss_total / train_dataloader_length
        history["train_loss"].append(epoch_avg_loss)

        # LOG THE AVERAGE LOSS OF THE EPOCH
        log_epoch_iteration(epoch=epoch, avg_epoch_loss=epoch_avg_loss)

        # EVALUATE THE MODEL
        if (epoch + 1) % eval_iterations == 0:
            val_loss = evaluate_model(model=agent, dataloader=validation_dataloader, current_epoch=(epoch + 1), max_epochs=epochs, device=device)
            history["val_loss"].append(val_loss)
        
    return agent.eval(), history

### Testing

In [60]:
model_config={
    "in_dim": 12,
    "intermediate_dim": 32,
    "out_dim": 1,
    "num_blocks": 2
}

In [48]:
agent = WeatherAgent(model_config)

In [49]:
agent

WeatherAgent(
  (L1): Linear(in_features=12, out_features=32, bias=True)
  (Layers): Sequential(
    (0): LayerBlock(
      (Layer1): Linear(in_features=32, out_features=32, bias=True)
      (ReLu): ReLU()
      (Layer_Norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (1): LayerBlock(
      (Layer1): Linear(in_features=32, out_features=32, bias=True)
      (ReLu): ReLU()
      (Layer_Norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (out): Linear(in_features=32, out_features=1, bias=True)
)

In [50]:
input=torch.rand(size=(2,12))

In [51]:
output = agent(input)

In [52]:
output

tensor([[-0.8667],
        [-0.7486]], grad_fn=<AddmmBackward0>)

## Main

In [11]:
try:
    (train_dataset, test_dataset, validation_dataset, train_dataloader, test_dataloader, validation_dataloader) = data_pipeline()
except ValueError as e:
    print(f"Caught an error: {e}")

CSV file detected, reading from ..\Data\CA_Weather_Fire_Dataset_Cleaned.csv
Train, Test, and Validation csv datasets detected in '..\Data\DataSplits', skipping generation
Initializing DataLoaders and Returning
Creating DataLoaders with batch_size (64), num_workers (0), pin_memory (False). Training dataset drop_last: (True)


In [12]:
model_config={
    "in_dim": 12,
    "intermediate_dim": 32,
    "out_dim": 1,
    "num_blocks": 2
}

In [134]:
trained_model, hist = train_model(model_config=model_config, train_dataloader=train_dataloader, validation_dataloader=validation_dataloader, epochs=4, log_iterations=1, eval_iterations=8)

Training Model on cpu with 4 main epochs, 0.0003 learning rate, max_grad_norm=0.5.
Logging every 1 epoch iterations, evaluating every 8 epoch iterations.


>>>>>>>>>>>>>>>>>>>>>
Main Epoch (Outer Loop):   0%|          | 0/4 [00:00<?, ?it/s]

Model Outputs:
tensor([0.9712, 2.1014, 2.0928, 1.8745, 1.6424, 1.6969, 1.7761, 1.0881, 1.9655,
        1.9245, 2.2639, 1.6529, 2.3663, 1.1462, 1.6119, 1.8086, 2.3574, 1.5116,
        1.4489, 1.9456, 1.8129, 1.1180, 1.6623, 1.5887, 1.5691, 1.6895, 1.7372,
        1.9484, 1.7330, 1.4150, 2.0094, 1.9068, 1.5276, 1.6267, 1.7209, 1.8517,
        1.3993, 1.7913, 1.7081, 1.2007, 1.8565, 2.0692, 1.5861, 1.8792, 1.7199,
        2.1682, 1.7515, 1.6666, 1.6152, 2.0716, 2.2667, 1.8011, 2.0560, 1.0895,
        1.5024, 1.6704, 2.5720, 2.1182, 1.7161, 1.8177, 1.6458, 1.9280, 2.0789,
        1.2272], grad_fn=<SqueezeBackward0>)
labels:
tensor([80., 76., 69., 59., 71., 67., 66., 69., 74., 66., 76., 75., 63., 71.,
        75., 65., 65., 86., 60., 64., 69., 69., 77., 72., 75., 81., 67., 69.,
        71., 70., 70., 69., 76., 68., 66., 69., 60., 67., 73., 68., 75., 69.,
        60., 69., 69., 67., 66., 75., 64., 67., 68., 60., 66., 68., 68., 68.,
        67., 66., 63., 66., 73., 61., 70., 73.])
Epoch batch




Model Outputs:
tensor([27.0251, 31.4300, 25.9500, 26.0372, 28.8741, 30.5213, 28.0536, 30.7344,
        28.5972, 27.6598, 30.3339, 31.0358, 29.8772, 26.4598, 29.2103, 27.3373,
        30.2389, 28.6838, 30.0393, 30.1053, 25.5657, 26.0165, 29.6810, 31.2094,
        28.3697, 28.9297, 30.8429, 28.7430, 30.2454, 28.1837, 26.5892, 31.6435,
        25.8303, 30.2490, 30.2336, 29.2801, 26.4494, 29.9087, 29.3909, 30.8259,
        27.5113, 28.5369, 26.4570, 26.2329, 26.5124, 26.7179, 31.3971, 27.9538,
        30.0552, 27.5029, 26.3138, 29.8874, 25.9315, 29.4649, 28.8213, 27.3196,
        27.0267, 31.1562, 28.5304, 26.9286, 27.2780, 29.0992, 29.2083, 30.3885],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([64., 67., 69., 76., 72., 86., 67., 91., 69., 66., 68., 64., 79., 72.,
        75., 72., 77., 72., 77., 78., 68., 73., 79., 81., 65., 74., 65., 70.,
        71., 69., 63., 67., 63., 74., 72., 85., 61., 76., 84., 75., 79., 67.,
        67., 62., 57., 63., 66., 68., 73., 67., 65., 83., 68., 76.,

Epoch 1/4 - Training:   7%|▋         | 14/187 [00:00<00:02, 70.92it/s][A

Model Outputs:
tensor([29.5419, 34.5384, 30.5360, 31.1161, 34.0509, 35.4796, 33.6448, 33.4452,
        31.3064, 35.1564, 33.1319, 33.0122, 35.1003, 33.9699, 32.1267, 34.5963,
        36.1908, 31.8496, 35.4293, 30.4583, 32.3788, 34.3738, 33.3562, 35.4959,
        35.9915, 35.2659, 31.8715, 29.6837, 32.1727, 35.3624, 31.9483, 31.8934,
        29.3255, 33.9258, 35.0233, 32.6319, 32.7159, 32.5164, 33.4365, 34.2131,
        32.3714, 29.4708, 33.3835, 34.6284, 34.4409, 29.7280, 30.7390, 36.7604,
        32.3351, 30.9804, 35.6604, 36.2622, 29.8704, 29.7553, 31.7191, 30.2014,
        33.1996, 31.1915, 30.1601, 35.5318, 36.5766, 36.9908, 33.8098, 30.9985],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([59., 74., 63., 64., 76., 67., 74., 73., 74., 74., 70., 75., 84., 74.,
        70., 68., 61., 62., 66., 61., 71., 75., 72., 67., 70., 71., 72., 81.,
        66., 68., 65., 72., 79., 81., 70., 68., 70., 71., 77., 84., 68., 65.,
        70., 72., 73., 66., 81., 54., 65., 57., 68., 70., 72., 59.,




tensor([65., 69., 68., 69., 68., 64., 61., 72., 82., 63., 63., 79., 87., 77.,
        66., 79., 62., 77., 62., 79., 74., 81., 60., 59., 59., 68., 62., 73.,
        77., 62., 72., 67., 61., 68., 82., 58., 58., 65., 66., 67., 85., 70.,
        74., 64., 62., 75., 64., 73., 84., 72., 79., 70., 65., 59., 95., 67.,
        70., 90., 80., 74., 66., 66., 67., 79.])
Epoch batch [31/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([62., 83., 65., 62., 65., 77., 72., 73., 73., 73., 67., 68., 76., 65.,
        61., 73., 72., 67., 70., 84., 79., 72., 64., 72., 67., 74., 60., 63.,
        60., 69., 69., 69., 71., 64., 61., 69.,

>>>>>>>>>>>>>>>>>>>>>  18%|█▊        | 33/187 [00:00<00:01, 84.74it/s][A
Main Epoch (Outer Loop):   0%|          | 0/4 [00:00<?, ?it/s]


Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([69., 66., 62., 73., 73., 85., 73., 94., 62., 65., 57., 65., 69., 75.,
        73., 65., 75., 71., 73., 78., 64., 60., 68., 66., 75., 68., 58., 79.,
        66., 65., 62., 73., 72., 63., 66., 63., 70., 60., 62., 55., 62., 63.,
        66., 82., 71., 68., 70., 83., 78., 66., 61., 76., 63., 66., 62., 68.,
        64., 62., 66., 69., 59., 81., 73., 65.])
Epoch batch [34/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

KeyboardInterrupt: 

In [13]:
trained_model, hist = train_model(model_config=model_config, train_dataloader=train_dataloader, validation_dataloader=validation_dataloader, epochs=4, learning_rate=0.001,log_iterations=1, eval_iterations=8)

Training Model on cpu with 4 main epochs, 0.001 learning rate, max_grad_norm=0.5.
Logging every 1 epoch iterations, evaluating every 8 epoch iterations.


>>>>>>>>>>>>>>>>>>>>>
Main Epoch (Outer Loop):   0%|          | 0/4 [00:00<?, ?it/s]

Model Outputs:
tensor([43.9409, 51.1966, 47.7560, 46.2796, 41.9933, 49.9041, 51.9464, 47.3646,
        50.8985, 48.6637, 50.4555, 44.7487, 46.6035, 52.0799, 43.8080, 43.7545,
        51.1411, 50.8476, 48.8306, 51.2988, 50.1218, 43.1358, 42.4134, 50.8181,
        45.1412, 50.2156, 43.0361, 45.6637, 45.4125, 43.9358, 43.5312, 47.3250,
        51.8932, 43.0604, 45.7408, 49.7098, 51.0858, 48.6915, 42.8022, 42.3638,
        46.2165, 49.8902, 43.8211, 44.1105, 44.5851, 43.0440, 50.4905, 45.2271,
        45.6699, 48.0892, 48.8325, 48.1900, 50.0847, 46.6864, 48.8519, 45.2711,
        44.5665, 43.0330, 43.2005, 44.7060, 48.2368, 45.7347, 51.4520, 45.6308],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([69., 68., 76., 69., 55., 73., 71., 71., 80., 75., 69., 69., 70., 64.,
        66., 57., 77., 69., 71., 79., 76., 71., 70., 78., 87., 81., 68., 70.,
        79., 61., 59., 72., 75., 71., 67., 75., 67., 76., 65., 69., 68., 73.,
        57., 69., 68., 62., 71., 67., 64., 75., 81., 80., 80., 72.,



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([67., 75., 69., 76., 69., 73., 71., 68., 66., 75., 69., 74., 62., 69.,
        68., 81., 72., 69., 72., 63., 60., 71., 76., 71., 62., 61., 72., 72.,
        66., 62., 74., 66., 73., 79., 78., 73., 55., 65., 72., 69., 76., 64.,
        78., 80., 62., 73., 62., 60., 70., 72., 67., 77., 58., 83., 69., 75.,
        64., 60., 70., 68., 73., 66., 76., 65.])
Epoch batch [17/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([72., 83., 79., 75., 79., 73., 70., 70., 85., 71., 67., 76., 69., 64.,
        72., 58., 67., 66., 69., 66., 80., 70., 79., 75., 70., 68., 67., 68.,
        71., 89., 74., 61., 67., 78., 74., 64., 62., 67., 68., 57., 90., 74.,
        69., 76., 80., 82., 71., 66., 60., 67., 69., 79., 64., 71., 76., 83.,
        64., 73., 72., 65., 72., 76., 65., 73.])
Epoch batch [34/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([67., 77., 61., 67., 64., 63., 67., 91., 78., 57., 62., 80., 62., 73.,
        76., 77., 74., 68., 63., 76., 61., 69., 71., 73., 76., 60., 73., 69.,
        75., 71., 75., 69., 72., 72., 71., 73., 74., 69., 67., 82., 73., 68.,
        74., 63., 66., 81., 70., 74., 64., 86., 72., 61., 68., 70., 79., 76.,
        62., 65., 74., 78., 75., 73., 72., 69.])
Epoch batch [54/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([67., 68., 72., 74., 64., 76., 80., 86., 62., 82., 70., 66., 70., 76.,
        74., 63., 72., 72., 69., 62., 71., 66., 72., 72., 81., 80., 70., 72.,
        71., 79., 73., 73., 76., 68., 58., 57., 67., 68., 69., 81., 77., 77.,
        62., 95., 62., 64., 69., 62., 78., 69., 71., 79., 76., 72., 67., 74.,
        63., 64., 69., 70., 67., 64., 66., 86.])
Epoch batch [75/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([73., 76., 71., 60., 81., 66., 72., 62., 81., 64., 73., 69., 66., 81.,
        60., 63., 75., 60., 60., 68., 77., 63., 66., 77., 70., 75., 74., 75.,
        60., 75., 72., 71., 67., 74., 73., 76., 79., 80., 76., 76., 69., 74.,
        74., 68., 59., 72., 80., 78., 65., 71., 87., 82., 62., 86., 64., 77.,
        68., 76., 76., 72., 79., 77., 83., 72.])
Epoch batch [94/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



tensor([65., 69., 65., 76., 72., 71., 73., 71., 81., 64., 69., 69., 58., 66.,
        64., 60., 65., 64., 76., 83., 70., 72., 68., 66., 73., 66., 72., 68.,
        66., 74., 70., 62., 70., 69., 70., 66., 61., 61., 58., 75., 68., 89.,
        69., 86., 72., 75., 72., 77., 68., 69., 60., 72., 63., 67., 79., 91.,
        65., 68., 70., 65., 70., 77., 67., 74.])
Epoch batch [113/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([79., 72., 67., 71., 73., 67., 78., 66., 58., 80., 73., 78., 77., 75.,
        64., 63., 72., 64., 61., 65., 63., 69., 71., 66., 75., 61., 76., 66.,
        68., 71., 82., 69., 63., 59., 81., 79.




Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([62., 67., 70., 73., 72., 64., 61., 56., 71., 83., 77., 78., 65., 67.,
        82., 67., 64., 61., 70., 77., 78., 66., 61., 78., 70., 72., 82., 72.,
        71., 75., 75., 59., 82., 78., 63., 80., 69., 75., 76., 79., 91., 77.,
        62., 82., 67., 66., 68., 66., 57., 68., 77., 77., 56., 69., 65., 68.,
        74., 70., 67., 75., 69., 70., 79., 62.])
Epoch batch [133/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

Epoch 1/4 - Training:  80%|███████▉  | 149/187 [00:01<00:00, 89.85it/s][A

Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([68., 60., 88., 58., 76., 69., 63., 76., 80., 84., 80., 70., 67., 70.,
        69., 74., 69., 73., 71., 66., 86., 76., 63., 63., 67., 68., 61., 76.,
        74., 78., 71., 84., 78., 67., 77., 83., 75., 67., 70., 72., 71., 56.,
        71., 75., 63., 69., 63., 66., 67., 70., 73., 63., 73., 61., 73., 77.,
        73., 56., 85., 57., 74., 76., 62., 71.])
Epoch batch [150/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

>>>>>>>>>>>>>>>>>>>>>
Main Epoch (Outer Loop):  25%|██▌       | 1/4 [00:02<00:07,  2.37s/it]

Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([ 71.,  65.,  74.,  68.,  85.,  72., 100.,  71.,  60.,  72.,  59.,  61.,
         73.,  69.,  62.,  74.,  76.,  62.,  66.,  65.,  82.,  67.,  84.,  72.,
         80.,  59.,  72.,  63.,  58.,  65.,  60.,  76.,  63.,  74.,  61.,  65.,
         79.,  72.,  67.,  62.,  72.,  67.,  75.,  55.,  58.,  67.,  71.,  62.,
         62.,  76.,  69.,  65.,  63.,  81.,  66.,  62.,  74.,  75.,  74.,  63.,
         78.,  72.,  74.,  76.])
Epoch batch [170/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:




tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([60., 67., 69., 84., 66., 67., 77., 64., 66., 60., 76., 76., 68., 82.,
        90., 70., 73., 65., 58., 73., 80., 72., 78., 77., 62., 71., 78., 71.,
        64., 61., 71., 73., 75., 79., 88., 76., 70., 63., 63., 72., 63., 72.,
        72., 89., 74., 72., 66., 70., 62., 71., 67., 78., 68., 70., 69., 79.,
        76., 61., 69., 77., 67., 73., 74., 71.])
Epoch batch [1/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([71., 83., 78., 64., 74., 69., 69., 83., 75., 68., 70., 74., 76., 79.,
        71., 87., 77., 62., 71., 67., 71., 65., 71., 66., 85., 98., 75., 78.,
        76., 72., 70., 72., 67., 67., 72., 71., 71., 70., 58., 67., 76., 70.,
        68., 69., 61., 68., 80., 73., 71., 70., 66., 69., 89., 67., 74., 74.,
        75., 72., 68., 66., 87., 68., 73., 93.])
Epoch batch [18/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Epoch batch [39/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([62., 63., 75., 70., 82., 73., 71., 73., 69., 72., 58., 77., 73., 69.,
        69., 67., 80., 78., 77., 90., 62., 66., 83., 69., 62., 84., 77., 62.,
        85., 80., 63., 72., 67., 82., 67., 73., 69., 67., 66., 78., 79., 69.,
        63., 64., 61., 62., 54., 68., 54., 73., 72., 68., 61., 74., 67., 74.,
        60., 68., 72., 64., 67., 77., 60., 57.])
Epoch batch [40/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan,



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([75., 79., 88., 63., 71., 78., 85., 69., 77., 66., 74., 72., 60., 62.,
        80., 59., 62., 66., 79., 69., 60., 72., 65., 70., 69., 69., 77., 80.,
        89., 70., 71., 74., 84., 58., 72., 71., 70., 58., 66., 74., 57., 62.,
        66., 71., 65., 68., 69., 67., 58., 73., 74., 72., 66., 71., 64., 65.,
        73., 62., 83., 65., 74., 78., 81., 79.])
Epoch batch [59/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([78., 77., 67., 73., 70., 71., 75., 60., 59., 84., 63., 82., 68., 72.,
        77., 60., 82., 68., 79., 69., 75., 67., 77., 63., 68., 65., 68., 65.,
        77., 72., 76., 95., 61., 79., 73., 78., 60., 75., 76., 61., 76., 72.,
        77., 70., 73., 67., 66., 71., 70., 76., 64., 66., 76., 78., 72., 60.,
        69., 69., 58., 67., 74., 60., 60., 78.])
Epoch batch [80/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([60., 70., 72., 77., 76., 82., 86., 72., 83., 72., 66., 73., 74., 75.,
        69., 82., 72., 72., 72., 70., 66., 69., 83., 70., 80., 58., 65., 77.,
        61., 73., 79., 57., 70., 68., 77., 74., 88., 70., 66., 74., 73., 69.,
        68., 69., 76., 78., 65., 69., 81., 77., 80., 92., 76., 82., 78., 73.,
        64., 76., 69., 71., 59., 76., 75., 76.])
Epoch batch [101/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([54., 73., 58., 69., 72., 69., 67., 62., 76., 79., 72., 68., 71., 71.,
        76., 73., 76., 79., 74., 78., 74., 72., 78., 66., 69., 70., 78., 74.,
        74., 84., 71., 67., 63., 74., 73., 73., 59., 68., 73., 85., 69., 69.,
        81., 60., 73., 65., 71., 74., 73., 64., 71., 77., 65., 72., 70., 78.,
        69., 74., 70., 62., 70., 69., 53., 65.])
Epoch batch [122/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n



Epoch batch [141/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([75., 74., 77., 75., 71., 62., 77., 65., 62., 70., 70., 61., 67., 69.,
        75., 72., 74., 67., 75., 68., 73., 75., 83., 73., 68., 80., 64., 64.,
        73., 74., 66., 78., 73., 62., 65., 58., 76., 65., 71., 70., 85., 73.,
        73., 66., 78., 52., 69., 79., 76., 59., 81., 76., 90., 73., 79., 63.,
        69., 68., 76., 68., 81., 90., 75., 61.])
Epoch batch [142/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, na



Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([73., 70., 74., 62., 66., 65., 63., 70., 65., 72., 69., 66., 73., 63.,
        67., 76., 59., 80., 79., 67., 79., 75., 77., 72., 60., 69., 63., 73.,
        71., 68., 65., 67., 69., 65., 60., 69., 66., 67., 66., 80., 80., 76.,
        70., 68., 60., 61., 61., 64., 75., 76., 67., 69., 74., 56., 74., 73.,
        78., 67., 64., 68., 80., 61., 67., 70.])
Epoch batch [160/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

>>>>>>>>>>>>>>>>>>>>>
Main Epoch (Outer Loop):  50%|█████     | 2/4 [00:04<00:04,  2.14s/it]

Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([66., 76., 61., 76., 62., 64., 70., 64., 64., 57., 72., 69., 73., 76.,
        69., 72., 59., 73., 79., 61., 76., 65., 72., 66., 74., 68., 73., 68.,
        84., 91., 87., 74., 69., 72., 65., 78., 72., 71., 66., 68., 80., 74.,
        73., 72., 73., 69., 66., 69., 63., 68., 69., 66., 66., 70., 60., 61.,
        58., 73., 62., 71., 75., 83., 67., 63.])
Epoch batch [179/187] | Loss: nan
Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

>>>>>>>>>>>>>>>>>>>>>
Main Epoch (Outer Loop):  50%|█████     | 2/4 [00:04<00:04,  2.18s/it]


Model Outputs:
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       grad_fn=<SqueezeBackward0>)
labels:
tensor([ 71.,  67.,  86.,  64.,  77.,  76.,  73.,  79.,  73.,  71.,  62.,  75.,
         75.,  63.,  67.,  73.,  76.,  72.,  62.,  60.,  56.,  76.,  85.,  57.,
         79.,  66., 102.,  86.,  82.,  76.,  62.,  64.,  71.,  63.,  72.,  72.,
         64.,  67.,  64.,  63.,  69.,  84.,  58.,  72.,  74.,  77.,  68.,  76.,
         68.,  76.,  61.,  82.,  62.,  93.,  72.,  65.,  66.,  56.,  62.,  69.,
         68.,  80.,  76.,  86.])
Epoch batch [1/187] | Loss: nan


KeyboardInterrupt: 