# Model Prototyping

Building the basis for our model experimentation

In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import numpy as np
import pandas as pd
import torch
import os
import json

from torch.utils import data
from torch.nn import Conv2d, AvgPool2d, ReLU, Dropout, Flatten, Linear, Sequential, Module
from torch.optim import Adam
from time import time

from tqdm import tqdm

# Use the first GPU when CUDA is available, otherwise fall back to the CPU so
# the notebook can still be executed on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# From here on, tensors and parameters created without an explicit dtype are float64.
torch.set_default_dtype(torch.float64)

# Directory that per-experiment model paths are joined onto.
MODELS_DIR  = '/home/cxw/sonos_rirs/models/'

In [2]:
# Metadata describing this experiment run.
model_dict = {
    'name': "testrun2_regularization",
    'notes': "same as test run but with regularization",
    'data_path': '/home/cxw/sonos_rirs/features/080122_5k_phase/feature_df.csv',
}
# Derived entry: this run's own folder under MODELS_DIR.
model_dict['model_path'] = os.path.join(MODELS_DIR, model_dict['name'])

In [3]:
# Enable IPython's autoreload extension (mode 2) when running inside IPython.
# The magics are issued through the shell API instead of `%` syntax so that
# this cell is also valid plain Python and the ImportError guard can take effect.
try:
    from IPython import get_ipython as _get_ipython
except ImportError:
    # IPython is not installed: nothing to configure.
    ipython_shell = None
else:
    # get_ipython() returns None when no IPython session is active.
    ipython_shell = _get_ipython()

if ipython_shell is not None:
    # Equivalent to `%load_ext autoreload` followed by `%autoreload 2`.
    ipython_shell.run_line_magic("load_ext", "autoreload")
    ipython_shell.run_line_magic("autoreload", "2")

In [4]:
# %autoreload 2
# # import volume_estimation.modeling as model_funcs
# model_funcs.train_model(model_funcs.Baseline_Model, model_dict,\
#                         overwrite=True, epochs=1,log=False) #######################################################

In [5]:
# Module-level list kept from the original cell; create_dataloader below
# builds its own local list and does not touch this one.
dataset = []

# Folder for this run under MODELS_DIR, then the table that lists one feature
# file per row.
model_path = os.path.join(MODELS_DIR, model_dict['name'])
feat_df = pd.read_csv(model_dict['data_path'])

    
def create_dataloader(feature_df, batch_size=1, log=True, shuffle=False):
    """Load every feature file listed in ``feature_df`` into memory and wrap the result in a DataLoader.

    Args:
        feature_df: DataFrame with a ``file_feature`` column of paths readable
            by ``np.load``; each archive must expose ``feat`` and ``vol`` entries.
        batch_size: Batch size passed to ``torch.utils.data.DataLoader``.
        log: When true, the target is stored as ``log10(vol)`` instead of ``vol``.
        shuffle: Passed to the DataLoader. Defaults to False, which keeps the
            row order of ``feature_df`` exactly as before.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``(feature, vol)`` pairs.
    """
    samples = []
    feature_paths = feature_df['file_feature']
    # Passing the total lets tqdm draw a real progress bar instead of a bare counter.
    for feat_file in tqdm(feature_paths, total=len(feature_paths)):
        # The context manager closes the archive's file handle immediately
        # instead of leaving one open handle per file to the garbage collector.
        with np.load(feat_file) as loaded:
            feature = loaded['feat']
            vol = loaded['vol']

        # Keep only the first two axes; reshape raises if the remaining axes
        # hold more than one element in total.
        feature = feature.reshape((feature.shape[0], feature.shape[1]))
        # Drop the imaginary part if the stored feature is complex.
        feature = np.real(feature)

        if log:
            vol = np.log10(vol)
        samples.append((feature, vol))

    return torch.utils.data.DataLoader(samples, batch_size=batch_size, shuffle=shuffle)

# Loads every row of feat_df with raw (non-log) targets, one sample per batch.
# NOTE(review): `dataloader` does not appear to be used again in the visible
# notebook, and the per-split loaders created later read the same files a
# second time — confirm whether this full pass is still needed.
dataloader = create_dataloader(feat_df, log=False)

32000it [00:19, 1649.73it/s]


In [6]:
# Round-trip the experiment metadata through a JSON file and echo what comes back.
savename = './testmodeldict.json'
with open(savename, 'w') as out_file:
    json.dump(model_dict, out_file)

with open(savename) as in_file:
    load_dict = json.load(in_file)

for key, value in load_dict.items():
    print(key, value)

name testrun2_regularization
notes same as test run but with regularization
data_path /home/cxw/sonos_rirs/features/080122_5k_phase/feature_df.csv
model_path /home/cxw/sonos_rirs/models/testrun2_regularization


In [7]:
# Partition the feature table by the value already stored in its `split` column.
split_column = feat_df['split']
train_df = feat_df[split_column == 'train']
val_df = feat_df[split_column == 'val']
test_df = feat_df[split_column == 'test']

# Only the training loader is batched; validation and test keep the
# create_dataloader default of one sample per batch.
print("Creating training dataloader")
train_dataloader = create_dataloader(train_df, batch_size=64)

print("Creating validation dataloader")
val_dataloader = create_dataloader(val_df)

print("Creating test dataloader")
test_dataloader = create_dataloader(test_df)

Creating training dataloader


19200it [00:11, 1633.60it/s]


Creating validation dataloader


6420it [00:03, 1618.61it/s]


Creating test dataloader


6380it [00:03, 1651.96it/s]


In [8]:
import torch.nn.functional as F

# Pull a single training batch to inspect tensor shapes.
sample_batch = next(iter(train_dataloader))
features, labels = sample_batch
# Zero-pad two extra entries at the end of the second-to-last axis; the last
# axis is left untouched.
features = F.pad(features, pad=(0, 0, 0, 2))
# features = features.squeeze(1)
# features = features.transpose(1,2)
print(f"Feature batch shape: {features.size()}")
print(f"Labels batch shape: {labels.size()}")


Feature batch shape: torch.Size([64, 32, 1997])
Labels batch shape: torch.Size([64])


In [9]:
# -*- coding: utf-8 -*-
# @Time    : 6/10/21 5:04 PM
# @Author  : Yuan Gong
# @Affiliation  : Massachusetts Institute of Technology
# @Email   : yuangong@mit.edu
# @File    : ast_models.py

import torch
import torch.nn as nn
from torch.cuda.amp import autocast
import os
import wget
os.environ['TORCH_HOME'] = '../../pretrained_models'
import timm
from timm.models.layers import to_2tuple,trunc_normal_

# override the timm package to relax the input shape constraint.
class PatchEmbed(nn.Module):
    """Convolutional patch embedding that performs no input-size check.

    ``img_size`` and ``patch_size`` are only used to record a nominal patch
    count; ``forward`` accepts any spatial size the convolution can process.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()

        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)
        # Nominal number of non-overlapping patches on an img_size input.
        patches_dim0 = self.img_size[0] // self.patch_size[0]
        patches_dim1 = self.img_size[1] // self.patch_size[1]
        self.num_patches = patches_dim1 * patches_dim0

        # Stride equals kernel size, i.e. patches do not overlap.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x):
        """Project ``x`` and return it as (batch, n_patches, embed_dim) for a batched 4-D input."""
        projected = self.proj(x)
        # Merge the spatial axes into one, then move the channel axis last.
        return projected.flatten(2).transpose(1, 2)

class ASTModel(nn.Module):
    """
    The AST model.

    Wraps a timm vision transformer (``self.v``) whose patch projection is replaced by a
    single-channel 16x16 convolution, and adds a LayerNorm + Linear head (``self.mlp_head``)
    that produces ``label_dim`` outputs.

    :param label_dim: the label dimension, i.e., the number of total classes, it is 527 for AudioSet, 50 for ESC-50, and 35 for speechcommands v2-35
    :param fstride: the stride of patch splitting on the frequency dimension, for 16*16 patches, fstride=16 means no overlap, fstride=10 means overlap of 6
    :param tstride: the stride of patch splitting on the time dimension, for 16*16 patches, tstride=16 means no overlap, tstride=10 means overlap of 6
    :param input_fdim: the number of frequency bins of the input spectrogram
    :param input_tdim: the number of time frames of the input spectrogram
    :param imagenet_pretrain: if use ImageNet pretrained model. NOTE(review): when audioset_pretrain is False the backbone is always created with pretrained=False, and this flag only decides whether the new patch projection is initialised from the backbone's existing projection weights.
    :param audioset_pretrain: if use full AudioSet and ImageNet pretrained model
    :param model_size: the model size of AST, should be in [tiny224, small224, base224, base384], base224 and base 384 are same model, but are trained differently during ImageNet pretraining. NOTE(review): only checked in the audioset_pretrain branch; the other branch always builds "vit_base_patch32_224".
    :param verbose: if True, print the summary lines (strides and patch count). The "Frozen CLIP Pretrainied" line is printed regardless of this flag.
    """
    def __init__(self, label_dim=1, fstride=10, tstride=10, input_fdim=224, input_tdim=1024, imagenet_pretrain=True, audioset_pretrain=False, model_size='base384', verbose=True):

        super(ASTModel, self).__init__()
        # assert timm.__version__ == '0.4.5', 'Please use timm == 0.4.5, the code might not be compatible with newer versions.'

        if verbose == True:
            print('---------------AST Model Summary---------------')
            print('ImageNet pretraining: {:s}, AudioSet pretraining: {:s}'.format(str(imagenet_pretrain),str(audioset_pretrain)))
        # override timm input shape restriction
        # (this rebinds the attribute on the timm module itself, so it affects every model created afterwards in this process)
        timm.models.vision_transformer.PatchEmbed = PatchEmbed

        # if AudioSet pretraining is not used (but ImageNet pretraining may still apply)
        if audioset_pretrain == False:

            print("Frozen CLIP Pretrainied")
            self.v = timm.create_model("vit_base_patch32_224",  pretrained=False)
            # NOTE(review): Block is imported but not referenced again in this class.
            from timm.models.vision_transformer import Block
            # Encoder weights are read from a path relative to the current working directory,
            # with no map_location, and loaded strictly into the transformer blocks only.
            ckpt = torch.load("Meta-Transformer_base_patch16_encoder.pth")
            self.v.blocks.load_state_dict(ckpt,strict=True)
            # Freeze every backbone parameter except pos_embed and the patch projection weight.
            # Both of those are replaced by fresh parameters further down, so the exemption
            # applies to objects that are discarded.
            for p in self.v.named_parameters():
                if p[0] == "pos_embed" or p[0] == "patch_embed.proj.weight":
                    continue
                else:
                    p[1].requires_grad = False
            self.original_num_patches = self.v.patch_embed.num_patches
            self.oringal_hw = int(self.original_num_patches ** 0.5)
            self.original_embedding_dim = self.v.pos_embed.shape[2]
            self.mlp_head = nn.Sequential(nn.LayerNorm(self.original_embedding_dim), nn.Linear(self.original_embedding_dim, label_dim))

            # automatically get the intermediate shape
            f_dim, t_dim = self.get_shape(fstride, tstride, input_fdim, input_tdim)
            # NOTE(review): get_shape returns the convolution's output grid (f_dim x t_dim), so this
            # count is smaller than the number of tokens the projection emits; the audioset branch
            # below uses f_dim * t_dim. Confirm the "-1" terms are intended.
            num_patches = (f_dim-1)* (t_dim-1)
            self.v.patch_embed.num_patches = num_patches
            # print("patch_embed.num_patches ", self.v.patch_embed.num_patches)
            if verbose == True:
                print('frequncey stride={:d}, time stride={:d}'.format(fstride, tstride))
                print('number of patches={:d}'.format(num_patches))

            # the linear projection layer
            # (single input channel, fixed 16x16 kernel, strides taken from fstride/tstride)
            new_proj = torch.nn.Conv2d(1, self.original_embedding_dim, kernel_size=(16, 16), stride=(fstride, tstride))
            if imagenet_pretrain == True:
                # Collapse the backbone's input-channel axis by summation to get a 1-channel kernel.
                new_proj.weight = torch.nn.Parameter(torch.sum(self.v.patch_embed.proj.weight, dim=1).unsqueeze(1))
                new_proj.bias = self.v.patch_embed.proj.bias
            self.v.patch_embed.proj = new_proj

            # Fresh positional embedding sized to num_patches, initialised with a truncated normal.
            # NOTE(review): forward() in this class never adds self.v.pos_embed to the tokens.
            new_pos_embed = nn.Parameter(torch.zeros(1, self.v.patch_embed.num_patches , self.original_embedding_dim))
            self.v.pos_embed = new_pos_embed
            self.v.dist_token = nn.Parameter(torch.zeros(1, 1, self.original_embedding_dim))
            trunc_normal_(self.v.pos_embed, std=.02)

        # now load a model that is pretrained on both ImageNet and AudioSet
        elif audioset_pretrain == True:
            if audioset_pretrain == True and imagenet_pretrain == False:
                raise ValueError('currently model pretrained on only audioset is not supported, please set imagenet_pretrain = True to use audioset pretrained model.')
            if model_size != 'base384':
                raise ValueError('currently only has base384 AudioSet pretrained model.')
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Download the checkpoint on first use.
            if os.path.exists('../../pretrained_models/audioset_10_10_0.4593.pth') == False:
                # this model performs 0.4593 mAP on the audioset eval set
                audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1'
                wget.download(audioset_mdl_url, out='../../pretrained_models/audioset_10_10_0.4593.pth')
            sd = torch.load('../../pretrained_models/audioset_10_10_0.4593.pth', map_location=device)
            # Build a 527-class model through the non-audioset branch to host the checkpoint weights;
            # DataParallel is applied before load_state_dict and unwrapped again via .module.
            # NOTE(review): the slicing/reshape below assumes pos_embed holds 2 + 1212 entries, while
            # the non-audioset branch allocates (f_dim-1)*(t_dim-1) entries — confirm this path still runs.
            audio_model = ASTModel(label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=False, audioset_pretrain=False, model_size='base384', verbose=False)
            audio_model = torch.nn.DataParallel(audio_model)
            audio_model.load_state_dict(sd, strict=False)
            self.v = audio_model.module.v
            self.original_embedding_dim = self.v.pos_embed.shape[2]
            self.mlp_head = nn.Sequential(nn.LayerNorm(self.original_embedding_dim), nn.Linear(self.original_embedding_dim, label_dim))
            f_dim, t_dim = self.get_shape(fstride, tstride, input_fdim, input_tdim)
            num_patches = f_dim * t_dim
            self.v.patch_embed.num_patches = num_patches
            if verbose == True:
                print('frequncey stride={:d}, time stride={:d}'.format(fstride, tstride))
                print('number of patches={:d}'.format(num_patches))

            # Drop the first two entries, then view the remaining 1212 positional vectors as a
            # 12 x 101 grid with 768 channels.
            new_pos_embed = self.v.pos_embed[:, 2:, :].detach().reshape(1, 1212, 768).transpose(1, 2).reshape(1, 768, 12, 101)
            # if the time grid is shorter than the 101 columns of the checkpoint, cut a centred window out of the positional embedding
            if t_dim < 101:
                new_pos_embed = new_pos_embed[:, :, :, 50 - int(t_dim/2): 50 - int(t_dim/2) + t_dim]
            # otherwise interpolate
            else:
                new_pos_embed = torch.nn.functional.interpolate(new_pos_embed, size=(12, t_dim), mode='bilinear')
            # same treatment along the frequency axis (12 rows in the checkpoint)
            if f_dim < 12:
                new_pos_embed = new_pos_embed[:, :, 6 - int(f_dim/2): 6 - int(f_dim/2) + f_dim, :]
            # otherwise interpolate
            elif f_dim > 12:
                new_pos_embed = torch.nn.functional.interpolate(new_pos_embed, size=(f_dim, t_dim), mode='bilinear')
            new_pos_embed = new_pos_embed.reshape(1, 768, num_patches).transpose(1, 2)
            # Re-attach the first two (non-patch) entries in front of the resized patch embeddings.
            self.v.pos_embed = nn.Parameter(torch.cat([self.v.pos_embed[:, :2, :].detach(), new_pos_embed], dim=1))

    def get_shape(self, fstride, tstride, input_fdim=224, input_tdim=1024):
        """
        Return the (frequency, time) size of the patch grid for a given input size.

        A random (1, 1, input_fdim, input_tdim) tensor is passed through a throwaway 16x16
        convolution with strides (fstride, tstride), and the last two output dimensions are returned.
        Requires ``self.original_embedding_dim`` to be set already.
        """
        test_input = torch.randn(1, 1, input_fdim, input_tdim)
        test_proj = nn.Conv2d(1, self.original_embedding_dim, kernel_size=(16, 16), stride=(fstride, tstride))
        test_out = test_proj(test_input)
        f_dim = test_out.shape[2]
        t_dim = test_out.shape[3]
        return f_dim, t_dim

    @autocast()
    def forward(self, x):
        """
        :param x: the input spectrogram, expected shape: (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        :return: prediction

        NOTE(review): the class and distillation tokens are never prepended and the positional
        embedding is never added, so the transformer blocks see the bare patch embeddings and the
        pooled vector is the mean of the first two patch tokens. Confirm this is intended.
        """
        # expect input x = (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        # Add a channel axis, then swap the last two axes before the patch convolution.
        x = x.unsqueeze(1)
        x = x.transpose(2, 3)
        B = x.shape[0]
        x = self.v.patch_embed(x)
        # print(x.shape)
        # Both expanded tokens are unused below, and `x = x` is a no-op.
        cls_tokens = self.v.cls_token.expand(B, -1, -1)
        dist_token = self.v.dist_token.expand(B, -1, -1)
        x = x
        x = self.v.pos_drop(x)
        for blk in self.v.blocks:
            x = blk(x)
        x = self.v.norm(x)
        # Average sequence positions 0 and 1 into one vector per sample.
        x = (x[:, 0] + x[:, 1]) / 2

        x = self.mlp_head(x)
        return x

# if __name__ == '__main__':
#     input_tdim = 1997
#     ast_mdl = ASTModel(input_tdim=input_tdim)
#     # input a batch of 10 spectrogram, each with 100 time frames and 128 frequency bins
#     test_input = torch.rand([10, input_tdim, 32])
#     test_output = ast_mdl(test_input)
#     # output should be in shape [10, 527], i.e., 10 samples, each with prediction of 527 classes.
#     print(test_output.shape)

    # input_tdim = 256
    # ast_mdl = ASTModel(input_tdim=input_tdim,label_dim=50, audioset_pretrain=True)
    # # input a batch of 10 spectrogram, each with 512 time frames and 128 frequency bins
    # test_input = torch.rand([10, input_tdim, 128])
    # test_output = ast_mdl(test_input)
    # # output should be in shape [10, 50], i.e., 10 samples, each with prediction of 50 classes.
    # print(test_output.shape)

In [10]:
# Size the model from the sample batch inspected earlier: axis 1 is passed as
# the frequency dimension and axis 2 as the time dimension.
input_height, input_width = features.shape[1], features.shape[2]
model = ASTModel(
    label_dim=1,
    input_fdim=input_height,
    input_tdim=input_width,
    imagenet_pretrain=False,
    audioset_pretrain=False,
)
model = model.to(device)
# Wrap the model so batches can be split across the visible GPUs.
model = torch.nn.DataParallel(model)
# model = Baseline_Model((input_height, input_width)).to(device)

---------------AST Model Summary---------------
ImageNet pretraining: False, AudioSet pretraining: False
Frozen CLIP Pretrainied
frequncey stride=10, time stride=10
number of patches=198


In [11]:
def MSE(output, target):
    """Return the mean squared difference between ``output`` and ``target``.

    The subtraction follows normal broadcasting rules, so ``target`` may be a
    scalar. Note that a ``(B, 1)`` output against a ``(B,)`` target broadcasts
    to ``(B, B)`` instead of comparing element-wise.
    """
    residual = output - target
    return torch.mean(residual ** 2)

def Bias(output, target):
    """Mean absolute difference after raising 10 to both inputs: mean(|10**output - 10**target|)."""
    linear_output = 10 ** output
    linear_target = 10 ** target
    return torch.abs(linear_output - linear_target).mean()

def CovStep(output, target, output_mean, target_mean):
    """Mean of the element-wise product of the deviations of ``output`` and ``target`` from the supplied means."""
    output_dev = output - output_mean
    target_dev = target - target_mean
    return (output_dev * target_dev).mean()

def MeanAbsLogStep(output, target, log=True):
    """Mean absolute natural-log ratio between predictions and targets.

    Args:
        output: Predictions.
        target: Reference values, broadcastable against ``output``.
        log: When true, both inputs are treated as base-10 exponents and are
            raised to linear scale first; otherwise they are used as given.
    """
    vol_pred, vol_target = (10**output, 10**target) if log else (output, target)
    ratio = vol_pred / vol_target
    return torch.log(ratio).abs().mean()

def compute_eval_metrics(dataloader, model, log=True):
    """Run ``model`` over ``dataloader`` once and return summary regression metrics.

    All statistics are computed per sample over the whole dataset, so the
    result does not depend on the loader's batch size. For loaders with one
    sample per batch the values match the previous per-batch implementation.

    Args:
        dataloader: Iterable of ``(x, y)`` batches; both are moved to the
            module-level ``device``.
        model: Callable returning one scalar prediction per sample (the output
            is flattened to 1-D, so ``(B, 1)`` and ``(B,)`` both work).
        log: Whether predictions and targets are base-10 logarithms. When
            false, ``bias`` and ``mean_mult`` use the values as given instead
            of exponentiating them.

    Returns:
        dict of Python floats with keys ``mse``, ``bias``, ``pearson_cor``
        and ``mean_mult``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    pred_chunks = []
    target_chunks = []

    # Evaluation needs no gradients; disabling autograd here keeps the cached
    # predictions from holding on to computation graphs when the caller did
    # not wrap the call in torch.no_grad() itself.
    with torch.no_grad():
        for (x, y) in dataloader:
            (x, y) = (x.to(device), y.to(device))
            # Flatten both sides so a (B, 1) prediction lines up element-wise
            # with a (B,) target instead of broadcasting to (B, B).
            pred_chunks.append(model(x).reshape(-1))
            target_chunks.append(y.reshape(-1))

    if not pred_chunks:
        raise ValueError("compute_eval_metrics received an empty dataloader")

    # A single forward pass is cached and reused for every statistic.
    preds = torch.cat(pred_chunks)
    targets = torch.cat(target_chunks)
    pred_mean = preds.mean()
    target_mean = targets.mean()

    cov = CovStep(preds, targets, pred_mean, target_mean)
    var_pred = MSE(preds, pred_mean)
    var_target = MSE(targets, target_mean)

    if log:
        mean_error = Bias(preds, targets)
    else:
        # Values are already on a linear scale; exponentiating them again
        # (as Bias does) would be meaningless and overflow.
        mean_error = torch.mean(torch.abs(preds - targets))

    out_dict = {}
    out_dict['mse'] = MSE(preds, targets).item()
    out_dict['bias'] = mean_error.item()
    out_dict['pearson_cor'] = (cov / (torch.sqrt(var_pred) * torch.sqrt(var_target))).item()
    out_dict['mean_mult'] = torch.exp(MeanAbsLogStep(preds, targets, log=log)).item()

    return out_dict
    
# with torch.no_grad():
#     eval_dict = compute_eval_metrics(val_dataloader, model)
#     print(eval_dict)

In [12]:
# Adam over all model parameters; weight_decay adds an L2 penalty on the weights.
opt = Adam(model.parameters(),lr=0.0005, weight_decay=1e-2)

def save_checkpoint(epoch, model, opt, filename='checkpoint.pth.tar'):
    """Write the epoch counter plus the model and optimizer state dicts to ``filename`` with ``torch.save``."""
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=opt.state_dict(),
        ),
        filename,
    )

def load_checkpoint(filename='checkpoint.pth.tar', map_location='cpu'):
    """Read a checkpoint file written by ``save_checkpoint``.

    Args:
        filename: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``. The default ``'cpu'`` lets a
            checkpoint saved on a GPU be opened on any machine; the subsequent
            ``load_state_dict`` calls copy the tensors onto the devices of the
            receiving model and optimizer. Pass ``None`` to restore tensors to
            the devices they were saved from.

    Returns:
        Tuple ``(epoch, model_state_dict, optimizer_state_dict)``.
    """
    checkpoint = torch.load(filename, map_location=map_location)
    return checkpoint['epoch'], checkpoint['model_state_dict'], checkpoint['optimizer_state_dict']

# Folder and file name used for the periodic training checkpoints.
save_dir = 'model_checkpoints'
# exist_ok makes this a no-op when the folder is already there, without a
# separate existence check that could race with another process.
os.makedirs(save_dir, exist_ok=True)
checkpoint_path = os.path.join(save_dir, 'checkpoint.pth.tar')

# Resume from the stored checkpoint when one exists; otherwise start at epoch 0.
start_epoch = 0
if os.path.isfile(checkpoint_path):
    print(f"Loading checkpoint from '{checkpoint_path}'")
    start_epoch, model_state_dict, optimizer_state_dict = load_checkpoint(checkpoint_path)
    model.load_state_dict(model_state_dict)
    opt.load_state_dict(optimizer_state_dict)
    print(f"Resuming from epoch {start_epoch}")

# Per-epoch history for this session. Values are plain Python floats so they
# can be handed straight to numpy. The history is not part of the checkpoint,
# so a resumed run starts with empty lists.
hist = {
    "train_loss": [],
    "val_loss": [],
    "val_bias": [],
    "val_pearson_cor": [],
    "val_mean_mult": []
}

# Start at start_epoch (0 for a fresh run) so a resumed run continues where the
# checkpoint left off instead of repeating all 150 epochs from the beginning.
for ep in range(start_epoch, 150):
    t_start = time()
    model.train()

    train_loss = 0.0
    train_steps = 0

    for (x, y) in train_dataloader:
        (x, y) = (x.to(device), y.to(device))
        pred = model(x)
        # Give the target a trailing axis so it matches the (batch, 1) prediction.
        loss = MSE(pred, y.reshape((y.shape[0], 1)))

        opt.zero_grad()
        loss.backward()
        opt.step()

        # .item() detaches the value; summing the loss tensors themselves would
        # keep autograd references alive across the whole epoch.
        train_loss += loss.item()
        train_steps += 1

    model.eval()
    with torch.no_grad():
        val_metrics = compute_eval_metrics(val_dataloader, model)

    mean_train_loss = train_loss / train_steps

    hist['train_loss'].append(mean_train_loss)
    hist['val_loss'].append(val_metrics['mse'])
    hist['val_bias'].append(val_metrics['bias'])
    hist['val_pearson_cor'].append(val_metrics['pearson_cor'])
    hist['val_mean_mult'].append(val_metrics['mean_mult'])

    t_end = time()

    t_elapsed = t_end - t_start
    print("Epoch: {}\tDuration: {:.2f}s\tTrain loss: {:.4f}\tVal loss: {:.4f}\tVal bias:{:.4f}\tVal Pearson correlation: {:.4e}\tVal MeanMult: {:.4f}"\
          .format(ep, t_elapsed, mean_train_loss, val_metrics['mse'],\
                  val_metrics['bias'], val_metrics['pearson_cor'],val_metrics['mean_mult']))

    # Checkpoint every fifth epoch; the stored value is the number of completed epochs.
    if (ep + 1) % 5 == 0:
        save_checkpoint(ep + 1, model, opt, checkpoint_path)
        print(f"Checkpoint saved at epoch {ep + 1}")
    
    
    

Epoch: 0	Duration: 2555.70s	Train loss: 0.7632	Val loss: 0.6263	Val bias:2600.0717	Val Pearson correlation: 4.3090e-01	Val MeanMult: 4.5628
Epoch: 1	Duration: 2545.63s	Train loss: 0.6019	Val loss: 0.5796	Val bias:2536.5032	Val Pearson correlation: 5.0809e-01	Val MeanMult: 4.0615
Epoch: 2	Duration: 2545.79s	Train loss: 0.5638	Val loss: 0.5671	Val bias:2537.3250	Val Pearson correlation: 5.3077e-01	Val MeanMult: 3.9327
Epoch: 3	Duration: 2546.88s	Train loss: 0.5287	Val loss: 0.5272	Val bias:2512.8939	Val Pearson correlation: 5.6043e-01	Val MeanMult: 3.8499
Epoch: 4	Duration: 2545.22s	Train loss: 0.5006	Val loss: 0.5089	Val bias:2463.1379	Val Pearson correlation: 5.8169e-01	Val MeanMult: 3.7031
Checkpoint saved at epoch 5
Epoch: 5	Duration: 2546.27s	Train loss: 0.4933	Val loss: 0.5052	Val bias:2456.6187	Val Pearson correlation: 5.8799e-01	Val MeanMult: 3.6161
Epoch: 6	Duration: 2547.02s	Train loss: 0.4747	Val loss: 0.4980	Val bias:2426.9070	Val Pearson correlation: 5.9598e-01	Val MeanMult:

KeyboardInterrupt: 

In [13]:
# Release cached GPU memory that PyTorch's allocator holds but is not currently using.
torch.cuda.empty_cache()

In [None]:
import csv

# Rows of [prediction, target] collected for the CSV export below.
data_to_save = []

# Five randomly drawn test rows for a quick spot check.
test1_df = test_df.sample(5)

# Only referenced by the commented-out MAE code further down in this cell.
mae = 0.0
total_samples = 0

# Switch off train-time behaviour explicitly (an interrupted training run can
# leave the model in train mode) and skip autograd bookkeeping so the pass over
# the test set does not hold on to computation graphs.
model.eval()
with torch.no_grad():
    test_dataloader1 = create_dataloader(test1_df)
    test_random = compute_eval_metrics(test_dataloader1, model)
    print(test_random)

    for (x, y) in test_dataloader:
        (x, y) = (x.to(device), y.to(device))
        pred = model(x)
        # Flatten so every sample yields one [prediction, target] row,
        # whatever the batch size.
        for pred_value, target_value in zip(pred.reshape(-1).tolist(), y.reshape(-1).tolist()):
            data_to_save.append([pred_value, target_value])

# Name of the CSV file to write.
csv_filename = 'predictions.csv'

# Write a header row followed by one row per sample.
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Prediction', 'Actual'])
    csv_writer.writerows(data_to_save)

print(f'Data saved to {csv_filename}')
#     print(pred,'///',y)

#     # 计算绝对误差
#     absolute_error = torch.abs(pred-y)

#     # 累加绝对误差和样本数
#     mae += absolute_error.sum().item()
    

# #     计算平均绝对误差


# mae /= 5 
# print("MAE:", mae)


In [14]:
# Final metrics on the test split. The training loop calls model.train() at the
# start of every epoch, so after an interrupted run the model may still be in
# train mode; switch to eval mode explicitly before measuring.
model.eval()
with torch.no_grad():
    eval_test = compute_eval_metrics(test_dataloader, model)
    print(eval_test)

{'mse': 0.4021832471723209, 'bias': 2114.7526902020213, 'pearson_cor': 0.6933627519262174, 'mean_mult': 2.953152021092662}


In [None]:
# Scratch inspection of the recorded history; only the last expression is displayed.
hist.keys()
np.asarray(hist['val_loss'][15:]).std()
np.arange(90, 100)

In [None]:
# Spot check on 20 randomly drawn rows of the full feature table.
random_df = feat_df.sample(20)

# Make sure the model is in eval mode: an interrupted training run can leave it
# in train mode.
model.eval()
with torch.no_grad():
    random_dataloader = create_dataloader(random_df)
    eval_random = compute_eval_metrics(random_dataloader, model)
    print(eval_random)

    # The loader yields one sample per batch, so .item() is valid on each prediction.
    for (x, y) in random_dataloader:
        (x, y) = (x.to(device), y.to(device))
        pred = model(x)
        print(pred.item())

In [None]:
class print_Model(Module):
    """Debug wrapper that applies an iterable of layers one by one, printing each layer and the size of its output."""

    def __init__(self, seq):
        super().__init__()
        self.net = seq

    def forward(self, x):
        """Feed ``x`` through every layer of ``self.net`` in order and return the final output."""
        print("Start\n{}".format(x.size()))
        out = x
        for step in self.net:
            out = step(out)
            print(step)
            print(out.size())
        return out

In [None]:
%load_ext autoreload


In [None]:
# Draw 20 random rows of the full feature table and load them with raw (non-log) targets.
random_df = feat_df.sample(20)

random_dataloader = create_dataloader(random_df, log=False)


# NOTE(review): Baseline_Model is not defined or imported anywhere in the
# visible notebook (its import is commented out in an earlier cell), so this
# cell presumably raises NameError on a fresh kernel — confirm.
load_model = Baseline_Model((input_height, input_width)).to(device)
model_name = 'testrun2_regularization'
# The saved weights are deserialised onto the CPU and then loaded into load_model.
load_model.load_state_dict(torch.load(os.path.join(MODELS_DIR,model_name,'model_state.pt'), map_location=torch.device('cpu')))

# NOTE(review): test_dataloader was created with the default log=True, while
# the metrics here are computed with log=False — verify which target scale the
# saved model expects.
load_metrics = compute_eval_metrics(test_dataloader, load_model, log=False)
for key in load_metrics.keys():
    print(key, load_metrics[key])

# One sample per batch, so .item() is valid on each prediction.
for (x, y) in random_dataloader:
    (x, y) = (x.to(device), y.to(device))
    pred = load_model(x)
    print(pred.item())


In [None]:
%load_ext autoreload


In [None]:
# Histogram of the `vol` column over the whole feature table.
feat_df['vol'].hist()

### 