In [1]:
!pip install omegaconf
!pip install conformer
!pip install ml-collections
!pip install librosa

Collecting ml-collections
  Downloading ml_collections-0.1.1.tar.gz (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ml-collections
  Building wheel for ml-collections (setup.py) ... [?25l[?25hdone
  Created wheel for ml-collections: filename=ml_collections-0.1.1-py3-none-any.whl size=94507 sha256=04a3f362e143cc08e6e97f92f33655aa82ac269c8e5616ea9559f99f371cc73f
  Stored in directory: /root/.cache/pip/wheels/7b/89/c9/a9b87790789e94aadcfc393c283e3ecd5ab916aed0a31be8fe
Successfully built ml-collections
Installing collected packages: ml-collections
Successfully installed ml-collections-0.1.1


Required Classes and definition from IELT

In [2]:
# coding=utf-8

import copy
import math
from os.path import join as pjoin
import ml_collections
import torch
import torch.nn as nn
from torch.nn import Dropout, Softmax, Linear, Conv2d, LayerNorm
from torch.nn.modules.utils import _pair

ATTENTION_Q = "MultiHeadDotProductAttention_1/query"
ATTENTION_K = "MultiHeadDotProductAttention_1/key"
ATTENTION_V = "MultiHeadDotProductAttention_1/value"
ATTENTION_OUT = "MultiHeadDotProductAttention_1/out"
FC_0 = "MlpBlock_3/Dense_0"
FC_1 = "MlpBlock_3/Dense_1"
ATTENTION_NORM = "LayerNorm_0"
MLP_NORM = "LayerNorm_2"


def np2th(weights, conv=False):
	"""Possibly convert HWIO to OIHW."""
	if conv:
		weights = weights.transpose([3, 2, 0, 1])
	return torch.from_numpy(weights)


def swish(x):
	return x * torch.sigmoid(x)


ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish}


class Mlp(nn.Module):
	def __init__(self, config):
		super(Mlp, self).__init__()
		self.fc1 = Linear(config.hidden_size, config.mlp_dim)
		self.fc2 = Linear(config.mlp_dim, config.hidden_size)
		self.act_fn = ACT2FN["gelu"]
		self.dropout = Dropout(config.dropout_rate)

		self._init_weights()

	def _init_weights(self):
		nn.init.xavier_uniform_(self.fc1.weight)
		nn.init.xavier_uniform_(self.fc2.weight)
		nn.init.normal_(self.fc1.bias, std=1e-6)
		nn.init.normal_(self.fc2.bias, std=1e-6)

	def forward(self, x):
		x = self.fc1(x)
		x = self.act_fn(x)
		x = self.dropout(x)
		x = self.fc2(x)
		x = self.dropout(x)
		return x


class Embeddings(nn.Module):
	"""Construct the embeddings from patch, position embeddings.
	"""

	def __init__(self, config, img_size, in_channels=3):
		super(Embeddings, self).__init__()
		img_size = _pair(img_size)

		patch_size = _pair(config.patches)
		n_patches = (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1])
		self.patch_embeddings = Conv2d(in_channels=in_channels,
		                               out_channels=config.hidden_size,
		                               kernel_size=patch_size,
		                               stride=patch_size)
		self.position_embeddings = nn.Parameter(torch.zeros(1, n_patches + 1, config.hidden_size))
		self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

		self.dropout = Dropout(config.dropout_rate)

	def forward(self, x):
		B = x.shape[0]
		cls_tokens = self.cls_token.expand(B, -1, -1)

		x = self.patch_embeddings(x)
		x = x.flatten(2)
		x = x.transpose(-1, -2)
		x = torch.cat((cls_tokens, x), dim=1)

		embeddings = x + self.position_embeddings
		embeddings = self.dropout(embeddings)
		return embeddings


class Encoder(nn.Module):
	def __init__(self, config):
		super(Encoder, self).__init__()
		self.layer = nn.ModuleList()
		# for _ in range(config.num_layers):
		for _ in range(config.num_layers + 1):
			layer = Block(config)
			self.layer.append(copy.deepcopy(layer))

	def forward(self, hidden_states):
		# attmap = []
		for layer in self.layer:
			hidden_states, weights = layer(hidden_states)
		# print(weights.shape)
		# attmap.append(weights)
		return hidden_states


class Transformer(nn.Module):
	def __init__(self, config, img_size):
		super(Transformer, self).__init__()
		self.embeddings = Embeddings(config, img_size=img_size)
		self.encoder = Encoder(config)

	def forward(self, input_ids):
		embedding_output = self.embeddings(input_ids)
		part_encoded = self.encoder(embedding_output)
		return part_encoded


class LabelSmoothing(nn.Module):
	"""
	NLL loss with label smoothing.
	"""

	def __init__(self, smoothing=0.0):
		"""
		Constructor for the LabelSmoothing module.
		param smoothing: label smoothing factor
		"""
		super(LabelSmoothing, self).__init__()
		self.confidence = 1.0 - smoothing
		self.smoothing = smoothing

	def forward(self, x, target):
		logprobs = torch.nn.functional.log_softmax(x, dim=-1)
		nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
		nll_loss = nll_loss.squeeze(1)
		smooth_loss = -logprobs.mean(dim=-1)
		loss = self.confidence * nll_loss + self.smoothing * smooth_loss
		return loss.mean()


class Attention(nn.Module):
	def __init__(self, config, assess=False):
		super(Attention, self).__init__()
		self.assess = assess
		self.num_attention_heads = config.num_heads
		self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
		self.all_head_size = self.num_attention_heads * self.attention_head_size

		self.query = Linear(config.hidden_size, self.all_head_size)
		self.key = Linear(config.hidden_size, self.all_head_size)
		self.value = Linear(config.hidden_size, self.all_head_size)

		self.out = Linear(config.hidden_size, config.hidden_size)
		self.attn_dropout = Dropout(config.att_dropout)
		self.proj_dropout = Dropout(config.att_dropout)

		self.softmax = Softmax(dim=-1)

	def transpose_for_scores(self, x):
		new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
		x = x.view(*new_x_shape)
		return x.permute(0, 2, 1, 3)

	def forward(self, hidden_states):
		mixed_query_layer = self.query(hidden_states)
		mixed_key_layer = self.key(hidden_states)
		mixed_value_layer = self.value(hidden_states)

		query_layer = self.transpose_for_scores(mixed_query_layer)
		key_layer = self.transpose_for_scores(mixed_key_layer)
		value_layer = self.transpose_for_scores(mixed_value_layer)

		attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
		attention_scores = attention_scores / math.sqrt(self.attention_head_size)
		attention_probs = self.softmax(attention_scores)
		weights = attention_probs
		attention_probs = self.attn_dropout(attention_probs)

		context_layer = torch.matmul(attention_probs, value_layer)
		context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
		new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
		context_layer = context_layer.view(*new_context_layer_shape)
		attention_output = self.out(context_layer)
		attention_output = self.proj_dropout(attention_output)
		if self.assess:
			return attention_output, weights, attention_scores
		else:
			return attention_output, weights


class Block(nn.Module):
	def __init__(self, config, assess=False):
		super(Block, self).__init__()
		self.assess = assess
		self.hidden_size = config.hidden_size
		self.attention_norm = LayerNorm(config.hidden_size, eps=1e-6)
		self.ffn_norm = LayerNorm(config.hidden_size, eps=1e-6)
		self.ffn = Mlp(config)
		self.attn = Attention(config, self.assess)

	def forward(self, x):
		h = x
		x = self.attention_norm(x)
		if self.assess:
			x, weights, score = self.attn(x)
		else:
			x, weights = self.attn(x)
		x = x + h

		h = x
		x = self.ffn_norm(x)
		x = self.ffn(x)
		x = x + h
		return x, weights

	def load_from(self, weights, n_block):
		ROOT = f"Transformer/encoderblock_{n_block}"
		with torch.no_grad():
			query_weight = np2th(weights[pjoin(ROOT, ATTENTION_Q, "kernel")]).view(self.hidden_size,
			                                                                       self.hidden_size).t()
			key_weight = np2th(weights[pjoin(ROOT, ATTENTION_K, "kernel")]).view(self.hidden_size, self.hidden_size).t()
			value_weight = np2th(weights[pjoin(ROOT, ATTENTION_V, "kernel")]).view(self.hidden_size,
			                                                                       self.hidden_size).t()
			out_weight = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "kernel")]).view(self.hidden_size,
			                                                                       self.hidden_size).t()

			query_bias = np2th(weights[pjoin(ROOT, ATTENTION_Q, "bias")]).view(-1)
			key_bias = np2th(weights[pjoin(ROOT, ATTENTION_K, "bias")]).view(-1)
			value_bias = np2th(weights[pjoin(ROOT, ATTENTION_V, "bias")]).view(-1)
			out_bias = np2th(weights[pjoin(ROOT, ATTENTION_OUT, "bias")]).view(-1)

			self.attn.query.weight.copy_(query_weight)
			self.attn.key.weight.copy_(key_weight)
			self.attn.value.weight.copy_(value_weight)
			self.attn.out.weight.copy_(out_weight)
			self.attn.query.bias.copy_(query_bias)
			self.attn.key.bias.copy_(key_bias)
			self.attn.value.bias.copy_(value_bias)
			self.attn.out.bias.copy_(out_bias)

			mlp_weight_0 = np2th(weights[pjoin(ROOT, FC_0, "kernel")]).t()
			mlp_weight_1 = np2th(weights[pjoin(ROOT, FC_1, "kernel")]).t()
			mlp_bias_0 = np2th(weights[pjoin(ROOT, FC_0, "bias")]).t()
			mlp_bias_1 = np2th(weights[pjoin(ROOT, FC_1, "bias")]).t()

			self.ffn.fc1.weight.copy_(mlp_weight_0)
			self.ffn.fc2.weight.copy_(mlp_weight_1)
			self.ffn.fc1.bias.copy_(mlp_bias_0)
			self.ffn.fc2.bias.copy_(mlp_bias_1)

			self.attention_norm.weight.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "scale")]))
			self.attention_norm.bias.copy_(np2th(weights[pjoin(ROOT, ATTENTION_NORM, "bias")]))
			self.ffn_norm.weight.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "scale")]))
			self.ffn_norm.bias.copy_(np2th(weights[pjoin(ROOT, MLP_NORM, "bias")]))

# if __name__ == '__main__':
# from core.vit import *
# config = get_b16_config()
# import ml_collections
def get_b16_config():
	"""Returns the ViT-B/16 configuration."""
	config = ml_collections.ConfigDict()
	config.patches = (16, 16)
	config.hidden_size = 768
	config.mlp_dim = 3072
	config.num_heads = 12
	config.num_layers = 12
	config.att_dropout = 0.0
	config.dropout_rate = 0.1
	config.classifier = 'token'
	return config


This is for Image exact Implementation

In [5]:
import time

import numpy as np
from scipy import ndimage
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
# from models.modules import *
# from models.vit import get_b16_config


class InterEnsembleLearningTransformer(nn.Module):
	def __init__(self, config, img_size=448, num_classes=2, dataset='cub', smooth_value=0.,
	             loss_alpha=0.4, cam=True, dsm=True, fix=True, update_warm=500,
	             vote_perhead=24, total_num=126, assess=False):
		super(InterEnsembleLearningTransformer, self).__init__()
		self.assess = assess
		self.smooth_value = smooth_value
		self.num_classes = num_classes
		self.loss_alpha = loss_alpha
		self.cam = cam

		self.embeddings = Embeddings(config, img_size=img_size)
		self.encoder = IELTEncoder(config, update_warm, vote_perhead, dataset, cam, dsm,
		                           fix, total_num, assess)
		self.head = Linear(config.hidden_size, num_classes)
		self.softmax = Softmax(dim=-1)


	def forward(self, x, labels=None):
		test_mode = False if labels is not None else True
		x = self.embeddings(x)
		if self.assess:
			x, xc, assess_list = self.encoder(x, test_mode)
		else:
			x, xc = self.encoder(x, test_mode)

		if self.cam:
			complement_logits = self.head(xc)
			probability = self.softmax(complement_logits)
			weight = self.head.weight
			assist_logit = probability * (weight.sum(-1))
			part_logits = self.head(x) + assist_logit
		else:
			part_logits = self.head(x)

		if self.assess:
			return part_logits, assess_list

		elif test_mode:
			return part_logits

		else:
			if self.smooth_value == 0:
				loss_fct = CrossEntropyLoss()
			else:
				loss_fct = LabelSmoothing(self.smooth_value)

			if self.cam:
				loss_p = loss_fct(part_logits.view(-1, self.num_classes), labels.view(-1))
				loss_c = loss_fct(complement_logits.view(-1, self.num_classes), labels.view(-1))
				alpha = self.loss_alpha
				loss = (1 - alpha) * loss_p + alpha * loss_c
			else:
				loss = loss_fct(part_logits.view(-1, self.num_classes), labels.view(-1))
			return part_logits, loss

	def get_eval_data(self):
		return self.encoder.select_num

	def load_from(self, weights):
		with torch.no_grad():
			nn.init.zeros_(self.head.weight)
			nn.init.zeros_(self.head.bias)

			self.embeddings.patch_embeddings.weight.copy_(np2th(weights["embedding/kernel"], conv=True))
			self.embeddings.patch_embeddings.bias.copy_(np2th(weights["embedding/bias"]))
			self.embeddings.cls_token.copy_(np2th(weights["cls"]))
			# self.encoder.patch_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"]))
			# self.encoder.patch_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"]))
			# self.encoder.clr_encoder.patch_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"]))
			# self.encoder.clr_encoder.patch_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"]))

			posemb = np2th(weights["Transformer/posembed_input/pos_embedding"])
			posemb_new = self.embeddings.position_embeddings
			if posemb.size() == posemb_new.size():
				self.embeddings.position_embeddings.copy_(posemb)
			else:
				ntok_new = posemb_new.size(1)

				posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:]
				ntok_new -= 1

				gs_old = int(np.sqrt(len(posemb_grid)))
				gs_new = int(np.sqrt(ntok_new))
				# print('load_pretrained: grid-size from %s to %s' % (gs_old, gs_new))
				posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1)

				zoom = (gs_new / gs_old, gs_new / gs_old, 1)
				posemb_grid = ndimage.zoom(posemb_grid, zoom, order=1)
				posemb_grid = posemb_grid.reshape((1, gs_new * gs_new, -1))
				posemb = np.concatenate([posemb_tok, posemb_grid], axis=1)
				self.embeddings.position_embeddings.copy_(np2th(posemb))

			for bname, block in self.encoder.named_children():
				for uname, unit in block.named_children():
					if not bname.startswith('key') and not bname.startswith('clr'):
						if uname == '12':
							uname = '11'
						unit.load_from(weights, n_block=uname)


class MultiHeadVoting(nn.Module):
	def __init__(self, config, vote_perhead=24, fix=True):
		super(MultiHeadVoting, self).__init__()
		self.fix = fix
		self.num_heads = config.num_heads
		self.vote_perhead = vote_perhead

		if self.fix:
			self.kernel = torch.tensor([[1, 2, 1],
			                            [2, 4, 2],
			                            [1, 2, 1]], device='cuda').unsqueeze(0).unsqueeze(0).half()
			self.conv = F.conv2d
		else:
			self.conv = nn.Conv2d(1, 1, 3, 1, 1)

	def forward(self, x, select_num=None, last=False):
		B, patch_num = x.shape[0], x.shape[3] - 1
		select_num = self.vote_perhead if select_num is None else select_num
		count = torch.zeros((B, patch_num), dtype=torch.int, device='cuda').half()
		score = x[:, :, 0, 1:]
		_, select = torch.topk(score, self.vote_perhead, dim=-1)
		select = select.reshape(B, -1)

		for i, b in enumerate(select):
			count[i, :] += torch.bincount(b, minlength=patch_num)

		if not last:
			count = self.enhace_local(count)
			pass

		patch_value, patch_idx = torch.sort(count, dim=-1, descending=True)
		patch_idx += 1
		return patch_idx[:, :select_num], count

	def enhace_local(self, count):
		B, H = count.shape[0], math.ceil(math.sqrt(count.shape[1]))
		count = count.reshape(B, H, H)
		if self.fix:
			count = self.conv(count.unsqueeze(1), self.kernel, stride=1, padding=1).reshape(B, -1)
		else:
			count = self.conv(count.unsqueeze(1)).reshape(B, -1)
		return count


class CrossLayerRefinement(nn.Module):
	def __init__(self, config, clr_layer):
		super(CrossLayerRefinement, self).__init__()
		self.clr_layer = clr_layer
		self.clr_norm = LayerNorm(config.hidden_size, eps=1e-6)

	def forward(self, x, cls):
		out = [torch.stack(token) for token in x]
		out = torch.stack(out).squeeze(1)
		out = torch.cat((cls, out), dim=1)
		out, weights = self.clr_layer(out)
		out = self.clr_norm(out)
		return out, weights


class IELTEncoder(nn.Module):
	def __init__(self, config, update_warm=500, vote_perhead=24, dataset='cub',
	             cam=True, dsm=True, fix=True, total_num=126, assess=False):
		super(IELTEncoder, self).__init__()
		self.assess = assess
		self.warm_steps = update_warm
		self.layer = nn.ModuleList()
		self.layer_num = config.num_layers
		self.vote_perhead = vote_perhead
		self.dataset = dataset
		self.cam = cam
		self.dsm = dsm

		for _ in range(self.layer_num - 1):
			self.layer.append(Block(config, assess=self.assess))

		if self.dataset == 'dog' or self.dataset == 'nabrids':
			self.layer.append(Block(config, assess=self.assess))
			self.clr_layer = self.layer[-1]
			if self.cam:
				self.layer.append(Block(config, assess=self.assess))
				self.key_layer = self.layer[-1]
		else:
			self.clr_layer = Block(config)
			if self.cam:
				self.key_layer = Block(config)

		if self.cam:
			self.key_norm = LayerNorm(config.hidden_size, eps=1e-6)

		self.patch_select = MultiHeadVoting(config, self.vote_perhead, fix)

		self.total_num = total_num
		## for CUB and NABirds
		self.select_rate = torch.tensor([16, 14, 12, 10, 8, 6, 8, 10, 12, 14, 16], device='cuda') / self.total_num
		## for Others
		# self.select_rate = torch.ones(self.layer_num-1,device='cuda')/(self.layer_num-1)

		self.select_num = self.select_rate * self.total_num
		self.clr_encoder = CrossLayerRefinement(config, self.clr_layer)
		self.count = 0

	def forward(self, hidden_states, test_mode=False):
		if not test_mode:
			self.count += 1
		B, N, C = hidden_states.shape
		complements = [[] for i in range(B)]
		class_token_list = []
		if self.assess:
			layer_weights = []
			layer_selected = []
			layer_score = []
		else:
			pass

		for t in range(self.layer_num - 1):
			layer = self.layer[t]
			select_num = torch.round(self.select_num[t]).int()
			hidden_states, weights = layer(hidden_states)
			select_idx, select_score = self.patch_select(weights, select_num)
			for i in range(B):
				complements[i].extend(hidden_states[i, select_idx[i, :]])
			class_token_list.append(hidden_states[:, 0].unsqueeze(1))
			if self.assess:
				layer_weights.append(weights)
				layer_score.append(select_score)
				layer_selected.extend(select_idx)
		cls_token = hidden_states[:, 0].unsqueeze(1)

		clr, weights = self.clr_encoder(complements, cls_token)
		sort_idx, _ = self.patch_select(weights, select_num=24, last=True)

		if not test_mode and self.count >= self.warm_steps and self.dsm:
			# if not test_mode and self.count >= 500 and self.dsm:
			layer_count = self.count_patch(sort_idx)
			self.update_layer_select(layer_count)

		class_token_list = torch.cat(class_token_list, dim=1)

		if not self.cam:
			return clr[:, 0], None
		else:
			out = []
			for i in range(B):
				out.append(clr[i, sort_idx[i, :]])
			out = torch.stack(out).squeeze(1)
			out = torch.cat((cls_token, out), dim=1)
			out, _ = self.key_layer(out)
			key = self.key_norm(out)

		if self.assess:
			assess_list = [layer_weights, layer_selected, layer_score, sort_idx]
			return key[:, 0], clr[:, 0], assess_list
		else:

			# fused = torch.cat((class_token_list, clr[:, 0].unsqueeze(1)), dim=1)
			# clr[:, 0] = fused.mean(1)
			return key[:, 0], clr[:, 0]

	def update_layer_select(self, layer_count):
		alpha = 1e-3  # if self.dataset != 'dog' and self.dataset == 'nabirds' else 1e-4
		new_rate = layer_count / layer_count.sum()

		self.select_rate = self.select_rate * (1 - alpha) + alpha * new_rate
		self.select_rate /= self.select_rate.sum()
		self.select_num = self.select_rate * self.total_num

	def count_patch(self, sort_idx):
		layer_count = torch.cumsum(self.select_num, dim=-1)
		sort_idx = (sort_idx - 1).reshape(-1)
		for i in range(self.layer_num - 1):
			mask = (sort_idx < layer_count[i])
			layer_count[i] = mask.sum()
		cum_count = torch.cat((torch.tensor([0], device='cuda'), layer_count[:-1]))
		layer_count -= cum_count
		return layer_count.int()

	## Old Implementation
	# layer_count = torch.zeros(self.layer_num, device='cuda').int()
	# sort_idx = (sort_idx - 1).reshape(-1)
	# sorted, _ = torch.sort(sort_idx)
	# for j in range(self.layer_num):
	# 	if j == (self.layer_num - 1):
	# 		layer_count[j] = len(sorted)
	# 		break
	# 	a = self.select_num[:j + 1].sum()
	# 	for i, val in enumerate(sorted):
	# 		flag = True
	# 		if flag and val > a:
	# 			layer_count[j] += i
	# 			sorted = sorted[i:]
	# 			flag = False
	# 		if not flag:
	# 			break
	# return layer_count

# !pip install ml-collections
if __name__ == '__main__':
	start = time.time()
	config = get_b16_config()
	# com = clrEncoder(config,)
	# com.to(device='cuda')
	net = InterEnsembleLearningTransformer(config).cuda()
	# hidden_state = torch.arange(400*768).reshape(2,200,768)/1.0
	x = torch.rand(4, 3, 448, 448, device='cuda')
	y = net(x)
	print(y.shape)

torch.Size([4, 2])


This is for Audio data


In [6]:
import time

import numpy as np
from scipy import ndimage
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers import Wav2Vec2Model

# from models.modules import *
# from models.vit import get_b16_config

class InterEnsembleLearningTransformer(nn.Module):
	def __init__(self, config, img_size=448, num_classes=200, dataset='cub', smooth_value=0.,
	             loss_alpha=0.4, cam=True, dsm=True, fix=True, update_warm=500,
	             vote_perhead=24, total_num=126, assess=False):
		super(InterEnsembleLearningTransformer, self).__init__()
		self.assess = assess
		self.smooth_value = smooth_value
		self.num_classes = num_classes
		self.loss_alpha = loss_alpha
		self.cam = cam
        # anmol changed 1
        # self.embeddings = Embeddings(config, img_size=img_size)
		self.embeddings = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

		self.encoder = IELTEncoder(config, update_warm, vote_perhead, dataset, cam, dsm,
		                           fix, total_num, assess)
		self.head = Linear(config.hidden_size, num_classes)
		self.softmax = Softmax(dim=-1)

	def forward(self, x, labels=None):
		test_mode = False if labels is not None else True
        # anmol changed 2
		x = self.embeddings(x).last_hidden_state
		if self.assess:
			x, xc, assess_list = self.encoder(x, test_mode)
		else:
			x, xc = self.encoder(x, test_mode)

		if self.cam:
			complement_logits = self.head(xc)
			probability = self.softmax(complement_logits)
			weight = self.head.weight
			assist_logit = probability * (weight.sum(-1))
			part_logits = self.head(x) + assist_logit
		else:
			part_logits = self.head(x)

		if self.assess:
			return part_logits, assess_list

		elif test_mode:
			return part_logits

		else:
			if self.smooth_value == 0:
				loss_fct = CrossEntropyLoss()
			else:
				loss_fct = LabelSmoothing(self.smooth_value)

			if self.cam:
				loss_p = loss_fct(part_logits.view(-1, self.num_classes), labels.view(-1))
				loss_c = loss_fct(complement_logits.view(-1, self.num_classes), labels.view(-1))
				alpha = self.loss_alpha
				loss = (1 - alpha) * loss_p + alpha * loss_c
			else:
				loss = loss_fct(part_logits.view(-1, self.num_classes), labels.view(-1))
			return part_logits, loss

	def get_eval_data(self):
		return self.encoder.select_num

	def load_from(self, weights):
		with torch.no_grad():
			nn.init.zeros_(self.head.weight)
			nn.init.zeros_(self.head.bias)

			# self.embeddings.patch_embeddings.weight.copy_(np2th(weights["embedding/kernel"], conv=True))
			# self.embeddings.patch_embeddings.bias.copy_(np2th(weights["embedding/bias"]))
			# self.embeddings.cls_token.copy_(np2th(weights["cls"]))
			# self.encoder.patch_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"]))
			# self.encoder.patch_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"]))
			# self.encoder.clr_encoder.patch_norm.weight.copy_(np2th(weights["Transformer/encoder_norm/scale"]))
			# self.encoder.clr_encoder.patch_norm.bias.copy_(np2th(weights["Transformer/encoder_norm/bias"]))

			# posemb = np2th(weights["Transformer/posembed_input/pos_embedding"])
			# posemb_new = self.embeddings.position_embeddings
			# if posemb.size() == posemb_new.size():
			# 	self.embeddings.position_embeddings.copy_(posemb)
			# else:
			# 	ntok_new = posemb_new.size(1)

			# 	posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:]
			# 	ntok_new -= 1

			# 	gs_old = int(np.sqrt(len(posemb_grid)))
			# 	gs_new = int(np.sqrt(ntok_new))
			# 	# print('load_pretrained: grid-size from %s to %s' % (gs_old, gs_new))
			# 	posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1)

			# 	zoom = (gs_new / gs_old, gs_new / gs_old, 1)
			# 	posemb_grid = ndimage.zoom(posemb_grid, zoom, order=1)
			# 	posemb_grid = posemb_grid.reshape((1, gs_new * gs_new, -1))
			# 	posemb = np.concatenate([posemb_tok, posemb_grid], axis=1)
			# 	self.embeddings.position_embeddings.copy_(np2th(posemb))

			for bname, block in self.encoder.named_children():
				for uname, unit in block.named_children():
					if not bname.startswith('key') and not bname.startswith('clr'):
						if uname == '12':
							uname = '11'
						unit.load_from(weights, n_block=uname)

class MultiHeadVoting(nn.Module):
	def __init__(self, config, vote_perhead=24, fix=True):
		super(MultiHeadVoting, self).__init__()
		self.fix = fix
		self.num_heads = config.num_heads
		self.vote_perhead = vote_perhead

        # anmol changed 3
		# if self.fix:
		# 	self.kernel = torch.tensor([[1, 2, 1],
		# 	                            [2, 4, 2],
		# 	                            [1, 2, 1]], device='cuda').unsqueeze(0).unsqueeze(0).half()
		# 	self.conv = F.conv2d
		# else:
		# 	self.conv = nn.Conv2d(1, 1, 3, 1, 1)
		if self.fix:
			self.kernel = torch.tensor([1, 2, 1], device='cuda').unsqueeze(0).unsqueeze(0).half()
		else:
			self.kernel = torch.ones(1, 1, 3, device='cuda').half()

	def forward(self, x, select_num=None, last=False):
        # anmol changed 4
		B, seq_len = x.shape[0], x.shape[1]  # Adapt for sequence length
		select_num = self.vote_perhead if select_num is None else select_num
		count = torch.zeros((B, seq_len), dtype=torch.int, device='cuda').half()
    # anmol changed 5
		score = x[:, :, 0]  # Removed spatial dimension handling for audio
		_, select = torch.topk(score, self.vote_perhead, dim=-1)
		select = select.reshape(B, -1)

		for i, b in enumerate(select):
			# count[i, :] += torch.bincount(b, minlength=seq_len)
            # anmol changed 6
			bincount_result = torch.bincount(b, minlength=seq_len)
			count[i, :seq_len] += bincount_result[:seq_len]

		if not last:
			count = self.enhace_local(count)

		patch_value, patch_idx = torch.sort(count, dim=-1, descending=True)
		patch_idx += 1
		return patch_idx[:, :select_num], count

	def enhace_local(self, count):
        # anmol changed 7
		B, seq_len = count.shape[0], count.shape[1]
		if self.fix:
			count = F.conv1d(count.unsqueeze(1), self.kernel, stride=1, padding=1).reshape(B, -1)
		else:
			count = F.conv1d(count.unsqueeze(1), self.kernel, stride=1, padding=1).reshape(B, -1)
		return count


class IELTEncoder(nn.Module):
	def __init__(self, config, update_warm=500, vote_perhead=24, dataset='cub',
	             cam=True, dsm=True, fix=True, total_num=126, assess=False):
		super(IELTEncoder, self).__init__()
		self.assess = assess
		self.warm_steps = update_warm
		self.layer = nn.ModuleList()
		self.layer_num = config.num_layers
		self.vote_perhead = vote_perhead
		self.dataset = dataset
		self.cam = cam
		self.dsm = dsm

		for _ in range(self.layer_num - 1):
			self.layer.append(Block(config, assess=self.assess))

		if self.dataset == 'dog' or self.dataset == 'nabrids':
			self.layer.append(Block(config, assess=self.assess))
			self.clr_layer = self.layer[-1]
			if self.cam:
				self.layer.append(Block(config, assess=self.assess))
				self.key_layer = self.layer[-1]
		else:
			self.clr_layer = Block(config)
			if self.cam:
				self.key_layer = Block(config)

		if self.cam:
			self.key_norm = LayerNorm(config.hidden_size, eps=1e-6)

		self.patch_select = MultiHeadVoting(config, self.vote_perhead, fix)

		self.total_num = total_num
		## for CUB and NABirds
		self.select_rate = torch.tensor([16, 14, 12, 10, 8, 6, 8, 10, 12, 14, 16], device='cuda') / self.total_num
		## for Others
		# self.select_rate = torch.ones(self.layer_num-1,device='cuda')/(self.layer_num-1)

		self.select_num = self.select_rate * self.total_num
		self.clr_encoder = CrossLayerRefinement(config, self.clr_layer)
		self.count = 0

	def forward(self, hidden_states, test_mode=False):
		if not test_mode:
			self.count += 1
		B, N, C = hidden_states.shape
		complements = [[] for i in range(B)]
		class_token_list = []
		if self.assess:
			layer_weights = []
			layer_selected = []
			layer_score = []

		for t in range(self.layer_num - 1):
			layer = self.layer[t]
			select_num = torch.round(self.select_num[t]).int()
			hidden_states, weights = layer(hidden_states)
			select_idx, select_score = self.patch_select(weights, select_num)
			for i in range(B):
				complements[i].extend(hidden_states[i, select_idx[i, :]])
			class_token_list.append(hidden_states[:, 0].unsqueeze(1))
			if self.assess:
				layer_weights.append(weights)
				layer_score.append(select_score)
				layer_selected.extend(select_idx)
		cls_token = hidden_states[:, 0].unsqueeze(1)

		clr, weights = self.clr_encoder(complements, cls_token)
		sort_idx, _ = self.patch_select(weights, select_num=24, last=True)

		if not test_mode and self.count >= self.warm_steps and self.dsm:
			layer_count = self.count_patch(sort_idx)
			self.update_layer_select(layer_count)

		class_token_list = torch.cat(class_token_list, dim=1)

		if not self.cam:
			return clr[:, 0], None
		else:
			out = []
			for i in range(B):
				out.append(clr[i, sort_idx[i, :]])
			out = torch.stack(out).squeeze(1)
			out = torch.cat((cls_token, out), dim=1)
			out, _ = self.key_layer(out)
			key = self.key_norm(out)

		if self.assess:
			assess_list = [layer_weights, layer_selected, layer_score, sort_idx]
			return key[:, 0], clr[:, 0], assess_list
		else:
			return key[:, 0], clr[:, 0]

# !pip install ml-collections
if __name__ == '__main__':
	start = time.time()
	config = get_b16_config()
	net = InterEnsembleLearningTransformer(config).cuda()
    # anmol changed 8
	x = torch.rand(4, 16000, device='cuda')  # 1-second audio samples at 16kHz
	y = net(x)
	print(y.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([4, 200])


In [9]:
import time
import librosa
import torch
import numpy as np

if __name__ == '__main__':
    start = time.time()
    config = get_b16_config()
    net = InterEnsembleLearningTransformer(config).cuda()

    duration = 10
    sampling_rate = 16000

    audio = np.random.normal(0, 1, sampling_rate * duration).astype(np.float32)

    audio = librosa.util.fix_length(audio, size=sampling_rate * duration)

    x = torch.tensor([audio] * 4, device='cuda')

    y = net(x)
    print(y.shape)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  x = torch.tensor([audio] * 4, device='cuda')


torch.Size([4, 200])
