In [1]:
import math
from matplotlib import pyplot as plt
import numpy as np
import cv2
import pandas as pd
import soundfile as sf
import os
from pathlib import Path
from sklearn.metrics import confusion_matrix

import torch

from cnn_model import HeatmapFusionCNN
from getAudioSaliency import compute_audio_saliency_heatmap_vectorized, precompute_integrals
from getVideoLabels import filterDf, getModeTileIndex
from getVideoSaliency import compute_video_saliency_heatmap_vectorized

def normalize_heatmaps(heatmaps):
    """
    Min-max normalize every heatmap in a stack to the [0, 1] range.

    Each heatmap (index along axis 0) is scaled independently using its own
    minimum and maximum, which are taken over axes 1 and 2.

    Parameters:
        heatmaps: array-like stack of heatmaps; the reduction runs over
                  axes 1 and 2, so the input needs at least three dimensions

    Returns:
        Array of the same shape as the input. A heatmap whose values are all
        equal (max == min) is returned as all zeros instead of NaN.
    """
    heatmaps = np.asarray(heatmaps)

    # Per-heatmap extrema; keepdims lets them broadcast against the stack
    h_mins = np.min(heatmaps, axis=(1, 2), keepdims=True)
    h_ranges = np.max(heatmaps, axis=(1, 2), keepdims=True) - h_mins

    # A flat heatmap has zero range; dividing by it would give NaN (0 / 0).
    # Its numerator is already all zeros, so any non-zero divisor yields zeros.
    h_ranges[h_ranges == 0] = 1

    return (heatmaps - h_mins) / h_ranges


def getFrame(cap, output_height, output_width, frame_idx):
    """
    Seek to a single frame of an opened video and return it resized.

    Parameters:
        cap: opened cv2.VideoCapture to read from
        output_height: height of the returned frame (pixels)
        output_width: width of the returned frame (pixels)
        frame_idx: index of the frame to seek to before reading

    Returns:
        The frame resized to (output_height, output_width) using bilinear
        interpolation.

    Raises:
        ValueError: if the frame cannot be read (e.g. the index lies past the
                    end of the video or decoding fails)
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
    ret, frame = cap.read()

    # cap.read() signals failure through its flag and returns no image;
    # fail here with a clear message instead of inside cv2.resize
    if not ret or frame is None:
        raise ValueError(f"Could not read frame {frame_idx} from video")

    # cv2.resize expects the target size as (width, height)
    return cv2.resize(frame, (output_width, output_height), interpolation=cv2.INTER_LINEAR)

def tile_index_to_coords(idx, numCols):
    """Map a row-major linear tile index to its (x, y) = (column, row) position."""
    # divmod yields (idx // numCols, idx % numCols) in a single step
    row, col = divmod(idx, numCols)
    return col, row

def tile_distance(pred_idx, true_idx, numCols):
    """
    Euclidean distance, in tiles, between a predicted and a true tile.

    Columns wrap around (the left and right edges of the grid are treated as
    adjacent), rows do not.
    """
    pred_col, pred_row = tile_index_to_coords(pred_idx, numCols)
    true_col, true_row = tile_index_to_coords(true_idx, numCols)

    # Horizontal gap: take the shorter way around the wrap-around seam
    col_gap = abs(pred_col - true_col)
    wrapped_gap = numCols - col_gap
    if wrapped_gap < col_gap:
        col_gap = wrapped_gap

    # Vertical gap: no wrapping between the top and bottom rows
    row_gap = abs(pred_row - true_row)

    return (col_gap**2 + row_gap**2) ** 0.5

def printAndWriteLine(printedLine, file):
    """Append the line (newline-terminated) to the open file, then echo it to stdout."""
    terminated_line = printedLine + "\n"
    file.write(terminated_line)
    print(printedLine)


def process_360_video(video_name, video_path, audio_path, output_path, model_path,
                      csv_path, erp_height=1920, erp_width=3840,
                      sample_every_n_frames=5, numHeatmaps=7,
                      cols = 16, rows = 9, device = "cpu",
                      num_sampled_frames=35):
    """
    Evaluate the heatmap-fusion CNN on sampled frames of a 360 video.

    For every sampled frame, audio and video saliency heatmaps are computed,
    normalized and fed to the model. The predicted tile is compared with the
    tile returned by getModeTileIndex for the same timestamp. Per-frame results,
    a confusion matrix, the average tile distance and the accuracy are printed
    and written to a text log.

    Parameters:
        video_name: video identifier passed to filterDf to select label rows
        video_path: path to ERP format 360 video
        audio_path: path to first-order ambisonic audio file (4 channels)
        output_path: path of the text log file to write results to
        model_path: path to the saved HeatmapFusionCNN state dict
        csv_path: path to the head-tracking CSV passed to filterDf
        erp_height: height frames are resized to (pixels)
        erp_width: width frames are resized to (pixels)
        sample_every_n_frames: sample every N frames
        numHeatmaps: total heatmap count used to size the audio/video stacks
        cols: number of tile columns in the label grid
        rows: number of tile rows in the label grid
        device: torch device (or device string such as "cpu") to run on
        num_sampled_frames: maximum number of frames to evaluate; reduced
                            automatically if the video is too short

    Raises:
        ValueError: if the audio is not 4-channel or the video cannot be opened
    """

    # Accept both device strings (the default) and torch.device objects, so the
    # `device.type` check further down works either way
    device = torch.device(device)

    # Load audio
    print("Loading ambisonic audio...")
    audio_data, audio_samplerate = sf.read(audio_path)

    # Check for 4 channels
    if len(audio_data.shape) == 1:
        raise ValueError("Audio is mono. Expected 4-channel first-order ambisonics.")
    elif audio_data.shape[1] != 4:
        raise ValueError(f"Audio has {audio_data.shape[1]} channels. Expected 4-channel first-order ambisonics (W, X, Y, Z).")

    # Split into channels
    W = audio_data[:, 0]
    X = audio_data[:, 1]
    Y = audio_data[:, 2]
    Z = audio_data[:, 3]

    print(f"Audio shape: {audio_data.shape}")
    print(f"Audio sample rate: {audio_samplerate} Hz")
    print("Successfully loaded 4-channel first-order ambisonics audio")

    # Open video to get metadata
    print("Opening video...")
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video_path}")

    # Everything below runs under try/finally so the capture is released even
    # when a later step (model loading, inference, file I/O) raises
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"Video FPS: {video_fps}")
        print(f"Total frames: {total_frames}")
        print(f"Video dimensions: {video_width}x{video_height}")

        # Check if resizing is needed (informational only; getFrame always resizes)
        need_resize = video_width != erp_width or video_height != erp_height
        if need_resize:
            print(f"Video will be resized from {video_width}x{video_height} to {erp_width}x{erp_height}")

        # Precompute integrals for coarse tiles (20x20 degrees)
        tile_cache = precompute_integrals(tile_size_deg=20)

        # The last sampled frame index is sample_every_n_frames * num_sampled_frames
        # and must stay below total_frames. Only clamp when the container
        # reports a usable frame count.
        if total_frames > 0:
            max_samples = max(0, (total_frames - 1) // sample_every_n_frames)
            if num_sampled_frames > max_samples:
                print(f"Video only has enough frames for {max_samples} samples; "
                      f"reducing from {num_sampled_frames}")
                num_sampled_frames = max_samples

        (labelDf, participants) = filterDf(csv_path, video_name, video_name)

        print(f"Processing {num_sampled_frames} frames...")

        print("Loading model...")
        # Load the model state
        model = HeatmapFusionCNN()  # Create a new model instance first
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)  # Move to appropriate device
        # Inference only: switch layers such as dropout / batch norm to eval behavior
        model.eval()
        print("Model loaded!")

        numCorrect = 0
        numTotal = 0

        predictedLabels = []
        trueLabels = []
        totalDistance = 0

        with open(output_path, 'w') as file:

            # Frames are fetched one at a time instead of keeping the whole video in memory
            for sampled_frame_idx in range(num_sampled_frames):
                frame_idx = sample_every_n_frames * (sampled_frame_idx + 1)

                prevFrame = getFrame(cap, erp_height, erp_width, frame_idx - 1)
                frame = getFrame(cap, erp_height, erp_width, frame_idx)

                printedLine = f"Processing frame {frame_idx}/{total_frames} (sample {sampled_frame_idx}/{num_sampled_frames})"
                printAndWriteLine(printedLine, file)

                # Compute audio and video saliency heatmaps and stack them along axis 0.
                # NOTE(review): the audio branch is asked for numHeatmaps-2 maps and the
                # video branch for numHeatmaps-7, which is 0 with the default
                # numHeatmaps=7 — confirm this split against the helper signatures.
                audio_heatmaps = compute_audio_saliency_heatmap_vectorized(
                    W, X, Y, Z, audio_samplerate,
                    frame_idx, video_fps,
                    erp_height, erp_width,
                    tile_cache, sample_every_n_frames,
                    numHeatmaps-2, tile_size_deg=20)
                video_heatmaps = compute_video_saliency_heatmap_vectorized(
                    prevFrame, frame, frame_idx, video_fps,
                    erp_height, erp_width,
                    tile_cache, sample_every_n_frames,
                    numHeatmaps-7, tile_size_deg=20)
                saliency_heatmaps = np.concatenate([audio_heatmaps, video_heatmaps], axis=0)

                # Normalize heatmap
                saliency_heatmaps = normalize_heatmaps(saliency_heatmaps)

                heatmaps = torch.from_numpy(saliency_heatmaps).float().to(device)

                # Run inference (unsqueeze adds the batch dimension)
                with torch.no_grad():
                    outputs = model(heatmaps.unsqueeze(0))
                    predicted_tile = outputs[0].argmax(dim=0).item()

                targetTime = frame_idx / video_fps

                actual_tile = getModeTileIndex(targetTime, labelDf, participants, rows, cols)

                printedLine = f"Predicted tile was {predicted_tile}, actual tile was {actual_tile}!"
                printAndWriteLine(printedLine, file)

                predictedLabels.append(predicted_tile)
                trueLabels.append(actual_tile)

                if predicted_tile == actual_tile:
                    numCorrect += 1

                numTotal += 1

                printedLine = f"Num correct thus far is {numCorrect}, num total thus far is {numTotal}"
                printAndWriteLine(printedLine, file)

                distance = tile_distance(predicted_tile, actual_tile, cols)

                printedLine = f"Euclidean distance from true was {distance}"
                printAndWriteLine(printedLine, file)

                totalDistance += distance

                # Drop the tensor references first so the cache flush can
                # actually return their memory
                del heatmaps, outputs
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

            # Without any processed samples the summary metrics are undefined
            # (empty confusion matrix, division by zero)
            if numTotal == 0:
                printAndWriteLine("No frames were processed; skipping summary metrics.", file)
                return

            classes_present = np.unique(np.concatenate([predictedLabels, trueLabels]))

            cm = confusion_matrix(trueLabels, predictedLabels, labels=classes_present)

            # Format with labels
            cm_str = "Confusion Matrix (Predicted vs True):\n"
            cm_str += f"Classes: {classes_present}\n"
            cm_str += str(cm)

            printAndWriteLine(cm_str, file)

            printedLine = f"Avg distance was: {float(totalDistance) / numTotal:.2f}"
            printAndWriteLine(printedLine, file)

            printedLine = f"Accuracy was: {float(numCorrect) / numTotal:.2f}"
            printAndWriteLine(printedLine, file)
    finally:
        cap.release()


    
if __name__ == "__main__":
    # All paths below are relative to the directory two levels above the
    # current working directory
    os.chdir("./../..")

    # --- Run configuration - modify as needed ---
    FILE_NAME = "0004"
    SAMPLE_RATE = 5  # sample every 5 frames
    NUM_HEATMAPS = 9

    # Frame size passed to the pipeline as erp_width / erp_height
    ERP_WIDTH = 1920  # width
    ERP_HEIGHT = 960  # height

    # Tile grid used for the labels
    TILE_COLS = 16
    TILE_ROWS = 9

    # --- Input / output locations ---
    VIDEO_PATH = f"Data/Pre-Processed-Data/{FILE_NAME}/{FILE_NAME}_mono_60fps.mp4"  # ERP format 360 video
    AUDIO_PATH = f"Data/Pre-Processed-Data/{FILE_NAME}/{FILE_NAME}.wav"
    INPUT_CSV_PATH = f"Data/Pre-Processed-Data/head_data/head_video_{FILE_NAME}.csv"
    OUTPUT_PATH = f"FinalTestingResults/{FILE_NAME}_Results.txt"
    MODEL_PATH = "Scripts/FinalCode/cnn_model.pth"

    # Prefer the GPU when one is available
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Run the pipeline
    process_360_video(
        video_name=FILE_NAME,
        video_path=VIDEO_PATH,
        audio_path=AUDIO_PATH,
        output_path=OUTPUT_PATH,
        model_path=MODEL_PATH,
        csv_path=INPUT_CSV_PATH,
        erp_height=ERP_HEIGHT,
        erp_width=ERP_WIDTH,
        sample_every_n_frames=SAMPLE_RATE,
        numHeatmaps=NUM_HEATMAPS,
        cols=TILE_COLS,
        rows=TILE_ROWS,
        device=DEVICE,
    )

Repo path was: c:\Users\mahd\Documents\FOV Prediction\Scripts\finalCode\../../U-2-Net-Repo
Using device: cpu
U2 Net Model ready!
Successfully imported core RAFT modules from your fork.
Loading RAFT cpu model...
RAFT model loaded on cpu successfully.
Loading ambisonic audio...
Audio shape: (2880000, 4)
Audio sample rate: 48000 Hz
Successfully loaded 4-channel first-order ambisonics audio
Opening video...
Video FPS: 60.0
Total frames: 3598
Video dimensions: 3840x1920
Video will be resized from 3840x1920 to 1920x960
Precomputing integrals for 20° tiles...
Integral precomputation complete!
Reading CSV file: Data/Pre-Processed-Data/head_data/head_video_0004.csv
Processing 35 frames...
Loading model...
Model loaded!
Processing frame 5/3598 (sample 0/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8454725742340088 1.0559604167938232
After conv1: -0.7022640109062195 0.7890627980232239
After conv2: -4.248074054718018 4.467042446136475
After conv3: -2.8335306644439697 3.3739211559295654
After pool: 0.0 1.8778979778289795
After fc1: -1.508062720298767 5.066052436828613
After fc2 (output): -5.326374053955078 5.29329776763916
Predicted tile was 70, actual tile was 65!
Num correct thus far is 0, num total thus far is 1
Euclidean distance from true was 5.0
Processing frame 10/3598 (sample 1/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8078544735908508 0.9146549701690674
After conv1: -0.6568062901496887 0.6594387888908386
After conv2: -3.9136910438537598 2.9638423919677734
After conv3: -2.5999231338500977 3.427844285964966
After pool: 0.0 2.238872766494751
After fc1: -1.2006548643112183 4.9988694190979
After fc2 (output): -6.631930828094482 7.695465087890625
Predicted tile was 77, actual tile was 66!
Num correct thus far is 0, num total thus far is 2
Euclidean distance from true was 5.0
Processing frame 15/3598 (sample 2/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7961297631263733 1.0398343801498413
After conv1: -0.6788666844367981 0.784699022769928
After conv2: -4.762764930725098 3.7616491317749023
After conv3: -4.1221795082092285 4.719108581542969
After pool: 0.0 2.1463232040405273
After fc1: -1.346239447593689 4.882150173187256
After fc2 (output): -5.967714786529541 8.434220314025879
Predicted tile was 77, actual tile was 66!
Num correct thus far is 0, num total thus far is 3
Euclidean distance from true was 5.0
Processing frame 20/3598 (sample 3/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8189153671264648 1.049467921257019
After conv1: -0.6506867408752441 0.8171667456626892
After conv2: -5.226389408111572 4.087320804595947
After conv3: -4.242483615875244 4.276645660400391
After pool: 0.0 1.8206597566604614
After fc1: -1.2547770738601685 4.746786117553711
After fc2 (output): -5.641293525695801 6.732853889465332
Predicted tile was 77, actual tile was 66!
Num correct thus far is 0, num total thus far is 4
Euclidean distance from true was 5.0
Processing frame 25/3598 (sample 4/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8242143392562866 0.9872550368309021
After conv1: -0.6456568241119385 0.7460277080535889
After conv2: -4.983203411102295 3.7556979656219482
After conv3: -3.8796470165252686 4.503441333770752
After pool: 0.0 1.9387181997299194
After fc1: -1.4150677919387817 5.687047004699707
After fc2 (output): -5.009988784790039 5.5358991622924805
Predicted tile was 70, actual tile was 66!
Num correct thus far is 0, num total thus far is 5
Euclidean distance from true was 4.0
Processing frame 30/3598 (sample 5/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7996619939804077 1.0383710861206055
After conv1: -0.6407153606414795 0.8084741234779358
After conv2: -5.32012939453125 4.169215679168701
After conv3: -3.991791009902954 4.192553997039795
After pool: 0.0 1.8585739135742188
After fc1: -1.4473768472671509 4.643476486206055
After fc2 (output): -5.480376720428467 4.9850311279296875
Predicted tile was 77, actual tile was 65!
Num correct thus far is 0, num total thus far is 6
Euclidean distance from true was 4.0
Processing frame 35/3598 (sample 6/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7880097031593323 0.9311970472335815
After conv1: -0.6389310359954834 0.6872763633728027
After conv2: -4.152876853942871 3.099611282348633
After conv3: -2.6910295486450195 3.5702714920043945
After pool: 0.0 2.483474016189575
After fc1: -1.3829454183578491 5.1108222007751465
After fc2 (output): -6.043682098388672 6.130939960479736
Predicted tile was 70, actual tile was 65!
Num correct thus far is 0, num total thus far is 7
Euclidean distance from true was 5.0
Processing frame 40/3598 (sample 7/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8469880223274231 1.038453221321106
After conv1: -0.6878559589385986 0.7857849597930908
After conv2: -4.952780246734619 4.0932087898254395
After conv3: -3.5922584533691406 4.554183483123779
After pool: 0.0 1.8306047916412354
After fc1: -1.301669716835022 4.960685729980469
After fc2 (output): -5.992653846740723 5.045297622680664
Predicted tile was 70, actual tile was 65!
Num correct thus far is 0, num total thus far is 8
Euclidean distance from true was 5.0
Processing frame 45/3598 (sample 8/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8263257741928101 0.9607481360435486
After conv1: -0.6579660773277283 0.7085361480712891
After conv2: -5.21054220199585 3.7042837142944336
After conv3: -3.633204460144043 4.702797889709473
After pool: 0.0 2.1427536010742188
After fc1: -1.3971885442733765 6.018167018890381
After fc2 (output): -5.804516792297363 5.576107978820801
Predicted tile was 68, actual tile was 65!
Num correct thus far is 0, num total thus far is 9
Euclidean distance from true was 3.0
Processing frame 50/3598 (sample 9/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8211811780929565 1.031185507774353
After conv1: -0.6581156849861145 0.7895683646202087
After conv2: -5.098116874694824 3.868565797805786
After conv3: -4.025670528411865 3.9474360942840576
After pool: 0.0 1.798255205154419
After fc1: -1.3000388145446777 4.728382587432861
After fc2 (output): -4.718759536743164 4.186463832855225
Predicted tile was 77, actual tile was 65!
Num correct thus far is 0, num total thus far is 10
Euclidean distance from true was 4.0
Processing frame 55/3598 (sample 10/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8507259488105774 0.9140177965164185
After conv1: -0.6568363308906555 0.6548255681991577
After conv2: -4.275055408477783 3.494325637817383
After conv3: -2.6788179874420166 3.958364725112915
After pool: 0.0 2.3706448078155518
After fc1: -1.4574960470199585 5.415861129760742
After fc2 (output): -4.853543281555176 5.051483154296875
Predicted tile was 77, actual tile was 65!
Num correct thus far is 0, num total thus far is 11
Euclidean distance from true was 4.0
Processing frame 60/3598 (sample 11/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8610494136810303 0.9552416801452637
After conv1: -0.6754617691040039 0.6449794769287109
After conv2: -3.869720458984375 2.808363914489746
After conv3: -2.4848151206970215 3.5860700607299805
After pool: 0.0 2.3564207553863525
After fc1: -1.477203369140625 5.576220512390137
After fc2 (output): -6.072748184204102 6.44338321685791
Predicted tile was 77, actual tile was 65!
Num correct thus far is 0, num total thus far is 12
Euclidean distance from true was 4.0
Processing frame 65/3598 (sample 12/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8190039396286011 0.944896936416626
After conv1: -0.647247850894928 0.7003781199455261
After conv2: -4.451555252075195 3.3543386459350586
After conv3: -3.6820545196533203 4.410452365875244
After pool: 0.0 2.4186131954193115
After fc1: -1.4148991107940674 4.850283622741699
After fc2 (output): -5.743826866149902 5.308683395385742
Predicted tile was 77, actual tile was 64!
Num correct thus far is 0, num total thus far is 13
Euclidean distance from true was 3.0
Processing frame 70/3598 (sample 13/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8137699365615845 0.9747231602668762
After conv1: -0.6477314829826355 0.7309423089027405
After conv2: -4.760627269744873 3.3985066413879395
After conv3: -3.6668295860290527 4.271191120147705
After pool: 0.0 2.1453325748443604
After fc1: -1.4727156162261963 5.215339660644531
After fc2 (output): -5.534282684326172 4.820956230163574
Predicted tile was 70, actual tile was 64!
Num correct thus far is 0, num total thus far is 14
Euclidean distance from true was 6.0
Processing frame 75/3598 (sample 14/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7931197881698608 0.9288826584815979
After conv1: -0.6202314496040344 0.6969931125640869
After conv2: -4.390472888946533 3.0360894203186035
After conv3: -4.0927414894104 4.281890869140625
After pool: 0.0 2.5364394187927246
After fc1: -1.3514941930770874 4.998280048370361
After fc2 (output): -6.08750581741333 5.149755477905273
Predicted tile was 70, actual tile was 64!
Num correct thus far is 0, num total thus far is 15
Euclidean distance from true was 6.0
Processing frame 80/3598 (sample 15/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8037466406822205 0.9330234527587891
After conv1: -0.6530986428260803 0.6054633855819702
After conv2: -3.8748538494110107 3.2136857509613037
After conv3: -2.7724618911743164 3.3539304733276367
After pool: 0.0 2.040173053741455
After fc1: -1.512853980064392 5.951663017272949
After fc2 (output): -6.117144584655762 6.552178382873535
Predicted tile was 70, actual tile was 64!
Num correct thus far is 0, num total thus far is 16
Euclidean distance from true was 6.0
Processing frame 85/3598 (sample 16/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8074315786361694 1.0288177728652954
After conv1: -0.6716670393943787 0.7767760157585144
After conv2: -4.415172576904297 4.207357883453369
After conv3: -2.5057899951934814 2.67081618309021
After pool: 0.0 1.8920091390609741
After fc1: -1.5102304220199585 5.190557479858398
After fc2 (output): -6.08524227142334 5.242792129516602
Predicted tile was 77, actual tile was 79!
Num correct thus far is 0, num total thus far is 17
Euclidean distance from true was 2.0
Processing frame 90/3598 (sample 17/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8143611550331116 0.9419675469398499
After conv1: -0.6539565324783325 0.6926335096359253
After conv2: -4.431217193603516 3.4098260402679443
After conv3: -3.7269675731658936 4.257199764251709
After pool: 0.0 2.333878755569458
After fc1: -1.3510160446166992 4.741885185241699
After fc2 (output): -5.8296051025390625 8.596440315246582
Predicted tile was 77, actual tile was 78!
Num correct thus far is 0, num total thus far is 18
Euclidean distance from true was 1.0
Processing frame 95/3598 (sample 18/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7951516509056091 0.9583035111427307
After conv1: -0.6582075953483582 0.70401531457901
After conv2: -4.349826335906982 3.447521209716797
After conv3: -2.7118756771087646 3.641820192337036
After pool: 0.0 2.2144100666046143
After fc1: -1.1990078687667847 5.095314979553223
After fc2 (output): -6.601128101348877 5.218992233276367
Predicted tile was 71, actual tile was 78!
Num correct thus far is 0, num total thus far is 19
Euclidean distance from true was 7.0
Processing frame 100/3598 (sample 19/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.793669581413269 0.953285276889801
After conv1: -0.6707307696342468 0.6044131517410278
After conv2: -3.8634495735168457 2.922372579574585
After conv3: -2.5445754528045654 3.1495306491851807
After pool: 0.0 2.2541110515594482
After fc1: -1.2542086839675903 5.641135215759277
After fc2 (output): -4.581752777099609 5.431392669677734
Predicted tile was 77, actual tile was 77!
Num correct thus far is 1, num total thus far is 20
Euclidean distance from true was 0.0
Processing frame 105/3598 (sample 20/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8046830892562866 0.9792787432670593
After conv1: -0.6555736660957336 0.7309712767601013
After conv2: -4.686904430389404 3.419992685317993
After conv3: -3.4590399265289307 3.66986346244812
After pool: 0.0 2.282097101211548
After fc1: -1.4373537302017212 4.9780120849609375
After fc2 (output): -5.309762477874756 6.162893295288086
Predicted tile was 77, actual tile was 77!
Num correct thus far is 2, num total thus far is 21
Euclidean distance from true was 0.0
Processing frame 110/3598 (sample 21/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8145084977149963 1.0771766901016235
After conv1: -0.6887149214744568 0.8239284157752991
After conv2: -5.009619235992432 5.186785697937012
After conv3: -2.9430973529815674 3.3738574981689453
After pool: 0.0 2.5582501888275146
After fc1: -1.6640455722808838 4.874958515167236
After fc2 (output): -5.624250888824463 6.432581424713135
Predicted tile was 77, actual tile was 77!
Num correct thus far is 3, num total thus far is 22
Euclidean distance from true was 0.0
Processing frame 115/3598 (sample 22/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8002418875694275 1.0584672689437866
After conv1: -0.6547943949699402 0.8224382996559143
After conv2: -4.665722370147705 3.9468374252319336
After conv3: -4.7526750564575195 4.416706085205078
After pool: 0.0 2.563175916671753
After fc1: -1.4394502639770508 4.6763916015625
After fc2 (output): -5.789361476898193 5.818636894226074
Predicted tile was 77, actual tile was 77!
Num correct thus far is 4, num total thus far is 23
Euclidean distance from true was 0.0
Processing frame 120/3598 (sample 23/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8511996269226074 0.988135039806366
After conv1: -0.6734703183174133 0.7331549525260925
After conv2: -5.129940509796143 3.4790892601013184
After conv3: -3.738520860671997 4.664563179016113
After pool: 0.0 2.0602362155914307
After fc1: -1.3887616395950317 4.805315017700195
After fc2 (output): -5.755354881286621 7.051784515380859
Predicted tile was 77, actual tile was 77!
Num correct thus far is 5, num total thus far is 24
Euclidean distance from true was 0.0
Processing frame 125/3598 (sample 24/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8432187438011169 0.9080095887184143
After conv1: -0.6385507583618164 0.6882636547088623
After conv2: -5.143194198608398 3.457134485244751
After conv3: -3.123746156692505 3.4439234733581543
After pool: 0.0 2.1653363704681396
After fc1: -1.4097092151641846 6.112798690795898
After fc2 (output): -5.38360071182251 6.69924259185791
Predicted tile was 70, actual tile was 77!
Num correct thus far is 5, num total thus far is 25
Euclidean distance from true was 7.0
Processing frame 130/3598 (sample 25/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8233008980751038 0.9580976366996765
After conv1: -0.6721075177192688 0.647409200668335
After conv2: -3.7848689556121826 2.7108657360076904
After conv3: -2.6035714149475098 3.3103296756744385
After pool: 0.0 2.474409341812134
After fc1: -1.4355299472808838 5.496045112609863
After fc2 (output): -5.583592891693115 6.745416164398193
Predicted tile was 77, actual tile was 77!
Num correct thus far is 6, num total thus far is 26
Euclidean distance from true was 0.0
Processing frame 135/3598 (sample 26/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8818140029907227 1.027289867401123
After conv1: -0.7029741406440735 0.6805118918418884
After conv2: -3.2406482696533203 2.757063150405884
After conv3: -2.8996851444244385 3.699089288711548
After pool: 0.0 1.8814257383346558
After fc1: -1.3878676891326904 5.328784465789795
After fc2 (output): -5.5070271492004395 5.383902072906494
Predicted tile was 70, actual tile was 77!
Num correct thus far is 6, num total thus far is 27
Euclidean distance from true was 7.0
Processing frame 140/3598 (sample 27/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8182579874992371 0.9048880934715271
After conv1: -0.6502766013145447 0.6811156272888184
After conv2: -4.2318010330200195 3.217919111251831
After conv3: -2.8609752655029297 3.42128849029541
After pool: 0.0 1.9880540370941162
After fc1: -1.383681297302246 5.77836275100708
After fc2 (output): -6.266424179077148 4.965614318847656
Predicted tile was 70, actual tile was 77!
Num correct thus far is 6, num total thus far is 28
Euclidean distance from true was 7.0
Processing frame 145/3598 (sample 28/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8152509331703186 0.9016660451889038
After conv1: -0.6357071399688721 0.6497228741645813
After conv2: -3.9983835220336914 3.110067129135132
After conv3: -2.357842445373535 2.927055835723877
After pool: 0.0 2.1490931510925293
After fc1: -1.2776559591293335 5.63051700592041
After fc2 (output): -5.7190423011779785 5.096924781799316
Predicted tile was 77, actual tile was 77!
Num correct thus far is 7, num total thus far is 29
Euclidean distance from true was 0.0
Processing frame 150/3598 (sample 29/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8115295767784119 1.040634036064148
After conv1: -0.66253262758255 0.7503643035888672
After conv2: -5.024006366729736 4.224064826965332
After conv3: -3.0352602005004883 3.9121763706207275
After pool: 0.0 1.7575308084487915
After fc1: -1.330680012702942 4.974983215332031
After fc2 (output): -5.695553302764893 5.109551429748535
Predicted tile was 70, actual tile was 77!
Num correct thus far is 7, num total thus far is 30
Euclidean distance from true was 7.0
Processing frame 155/3598 (sample 30/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8511655926704407 0.9353945851325989
After conv1: -0.673244833946228 0.7013679146766663
After conv2: -3.8429079055786133 2.6511952877044678
After conv3: -3.2549049854278564 3.726616382598877
After pool: 0.0 2.0944113731384277
After fc1: -1.3299356698989868 5.468141078948975
After fc2 (output): -5.439357757568359 7.958605766296387
Predicted tile was 77, actual tile was 77!
Num correct thus far is 8, num total thus far is 31
Euclidean distance from true was 0.0
Processing frame 160/3598 (sample 31/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8086874485015869 0.9992552399635315
After conv1: -0.6313549280166626 0.7680572271347046
After conv2: -4.689962863922119 3.3202126026153564
After conv3: -2.368530750274658 3.0309879779815674
After pool: 0.0 2.1727728843688965
After fc1: -1.3482342958450317 5.140874862670898
After fc2 (output): -6.748669624328613 8.698079109191895
Predicted tile was 77, actual tile was 77!
Num correct thus far is 9, num total thus far is 32
Euclidean distance from true was 0.0
Processing frame 165/3598 (sample 32/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8965502381324768 1.0139579772949219
After conv1: -0.7121359705924988 0.7465569376945496
After conv2: -4.930568695068359 4.10792875289917
After conv3: -3.1869966983795166 4.27144718170166
After pool: 0.0 2.123541831970215
After fc1: -1.3602765798568726 4.519447326660156
After fc2 (output): -5.375112533569336 7.183988571166992
Predicted tile was 77, actual tile was 77!
Num correct thus far is 10, num total thus far is 33
Euclidean distance from true was 0.0
Processing frame 170/3598 (sample 33/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.8420531749725342 1.0294464826583862
After conv1: -0.684238076210022 0.7717120051383972
After conv2: -4.65862512588501 4.700159072875977
After conv3: -2.750641107559204 3.0219509601593018
After pool: 0.0 2.0994513034820557
After fc1: -1.331347942352295 4.4769792556762695
After fc2 (output): -6.507315635681152 7.323462009429932
Predicted tile was 77, actual tile was 77!
Num correct thus far is 11, num total thus far is 34
Euclidean distance from true was 0.0
Processing frame 175/3598 (sample 34/35)


  src = F.upsample(src,size=tar.shape[2:],mode='bilinear')



Trying scale 1.0...
Padded tensor shapes: t1=torch.Size([1, 3, 960, 1920]), t2=torch.Size([1, 3, 960, 1920])


  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):
  with autocast(enabled=self.args.mixed_precision):


✓ Flow computed successfully at scale 1.0
before fusion: 0.0 1.0
After fusion: -0.7951954007148743 1.0662610530853271
After conv1: -0.6767958998680115 0.8178372979164124
After conv2: -4.8527350425720215 4.024990081787109
After conv3: -3.932375907897949 3.1884214878082275
After pool: 0.0 2.1827688217163086
After fc1: -1.5382978916168213 4.975139617919922
After fc2 (output): -6.091174125671387 5.013524532318115
Predicted tile was 75, actual tile was 77!
Num correct thus far is 11, num total thus far is 35
Euclidean distance from true was 2.0
Confusion Matrix (Predicted vs True):
Classes: [64 65 66 68 70 71 75 77 78 79]
[[ 0  0  0  0  3  0  0  1  0  0]
 [ 0  0  0  1  3  0  0  4  0  0]
 [ 0  0  0  0  1  0  0  3  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  4  0  1 11  0  0]
 [ 0  0  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  0  0  1  0  0]]
Avg distance was: 3.26
Accuracy was: 0.31
