In [1]:
import argparse
import datetime
import numpy as np
import time
import torch
import torch.backends.cudnn as cudnn
import json

from pathlib import Path
import timm
from timm.data import Mixup
from timm.models import create_model
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.scheduler import create_scheduler
from timm.optim import create_optimizer
from timm.utils import NativeScaler, get_state_dict, ModelEma

from datasets import build_dataset
from engine import train_one_epoch, evaluate
from losses import DistillationLoss
from samplers import RASampler
from augment import new_data_aug_generator

import models
import models_v2
from Growth import GrowthBlock
from growth_utils import *
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from regression_utils_add import return_layers
import torch.nn as nn

In [3]:
import os
path = os.path.dirname(timm.__file__)

In [4]:
path

'C:\\Users\\akash\\FILES\\Research\\Growth\\Transformer\\deit-main\\timm'

In [5]:
# Instantiate the timm model registered under "deit_small_patch16_224" with
# freshly initialised weights (pretrained=False), a 1000-way classification
# head, stochastic depth rate 0.1 and a 224-pixel input size.
model = create_model(
        "deit_small_patch16_224",
        pretrained=False,
        num_classes=1000,
        drop_rate=0.0,
        drop_path_rate=0.1,
        drop_block_rate=None,
        img_size=224
    )

In [6]:
sum(p.numel() for p in model.parameters())

11417704

In [7]:
len(model.blocks)

12

In [8]:
rand = torch.rand(1,3,224,224)

In [9]:
model(rand)

tensor([[ 3.9423e-01,  1.0607e-01, -4.2674e-01,  3.5378e-01, -3.0509e-01,
         -3.5112e-01, -1.0366e-01, -2.7353e-01, -3.9596e-01, -1.7731e-01,
         -2.3314e-01,  1.7888e-01,  1.6268e-01,  3.8845e-01, -4.7584e-01,
         -5.0599e-01, -4.8835e-02, -7.9453e-02,  3.0131e-01, -5.0275e-01,
          1.1758e-01, -1.4041e+00,  9.8829e-02,  4.0085e-01, -3.7122e-02,
         -5.9900e-02, -3.2752e-01,  3.1380e-01,  1.9434e-01,  3.9211e-01,
          4.6031e-01, -3.5910e-01, -8.9669e-01, -5.2335e-01, -3.5155e-01,
         -3.3317e-01,  1.2243e-01, -7.1191e-02, -2.5903e-01, -1.9185e-01,
         -1.4236e-01, -2.8565e-01, -6.0403e-02, -2.9026e-01, -2.5405e-03,
         -2.8210e-01, -6.2894e-02,  5.1871e-01,  4.0110e-01,  3.1361e-01,
         -3.0044e-01,  3.9687e-01, -1.4883e-01,  1.3035e-01,  7.2563e-02,
         -2.3867e-01,  1.2066e-01, -1.2972e-02,  1.0374e+00, -1.0325e-01,
         -7.5619e-02,  1.6402e-01,  3.0383e-04,  3.0912e-02,  1.5658e-01,
          1.5838e-01, -4.0331e-01, -2.

In [5]:
# Load a previously saved object from the working directory and bind it to m2.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source. The name m2 is rebound by a later cell
# (m2=growth_wrapper(model)), so this value only matters if that cell is skipped.
m2=torch.load("splitted model3.pt")

In [9]:
# Read a single training image from a local tiny-imagenet-200 copy and resize
# it to 224x224.
# NOTE(review): the absolute Windows path ties this cell to one machine, and
# the result of cv2.imread is not checked before it is passed to cv2.resize —
# presumably a missing file makes the resize fail; confirm and guard if needed.
import cv2
image =  cv2.imread(r"C:\Users\akash\FILES\Research\intrinsic\Datasets\tiny-imagenet-200\train\n01629819\images\n01629819_0.JPEG")
image = cv2.resize(image,(224,224))

In [10]:
# Add a leading batch axis to the image array, convert it to a float tensor and
# move the original last axis to position 1 (axis order 0,3,1,2).
# assumes `image` is laid out as (height, width, channels) — TODO confirm.
# NOTE(review): pixel values are passed through unscaled (no normalisation).
img=torch.Tensor(np.expand_dims(image,axis=0)).permute([0,3,1,2])

In [11]:
len(model.blocks)

12

In [12]:
o1=model(img)
print(torch.sum(img))

tensor(20289604.)


In [13]:
def growth_wrapper(model):
    """Wrap the four linear sub-modules of every transformer block in a ``GrowthBlock``.

    For each entry of ``model.blocks`` the attention ``qkv`` and ``proj``
    sub-modules are wrapped with ``act_on=False``; the MLP ``fc1`` and ``fc2``
    sub-modules are wrapped with ``GrowthBlock``'s default ``act_on``.

    Args:
        model: object exposing ``blocks``, each with ``attn.qkv``,
            ``attn.proj``, ``mlp.fc1`` and ``mlp.fc2``.

    Returns:
        The same ``model`` object, modified in place.
    """
    for block_idx in range(len(model.blocks)):
        attention = model.blocks[block_idx].attn
        attention.qkv = GrowthBlock(attention.qkv, act_on=False)
        attention.proj = GrowthBlock(attention.proj, act_on=False)

        feed_forward = model.blocks[block_idx].mlp
        feed_forward.fc1 = GrowthBlock(feed_forward.fc1)
        feed_forward.fc2 = GrowthBlock(feed_forward.fc2)
    return model

In [14]:
m2=growth_wrapper(model)

In [10]:
def get_all_linear_layers_transformer(model):
    """Collect the linear layers of every transformer block with their positions.

    For each block, ``get_all_linear_layers(..., typ='list')`` is called on
    ``attn.qkv``, ``attn.proj``, ``mlp.fc1`` and ``mlp.fc2`` (in that order)
    and the results are concatenated.

    Args:
        model: object exposing ``blocks`` with the four sub-modules above.

    Returns:
        A pair ``(layers, positions)``. ``layers`` is the flat list of all
        collected layers; ``positions[k]`` is ``[block_index, slot, index]``
        for ``layers[k]``, where ``slot`` is 0 for ``attn.qkv``, 1 for
        ``attn.proj``, 2 for ``mlp.fc1``, 3 for ``mlp.fc2`` and ``index`` is
        the layer's position inside that sub-module's own list.
    """
    # Slot number == position in this tuple.
    slot_paths = (("attn", "qkv"), ("attn", "proj"), ("mlp", "fc1"), ("mlp", "fc2"))

    layers = []
    positions = []
    for block_idx in range(len(model.blocks)):
        block = model.blocks[block_idx]
        for slot, (parent, child) in enumerate(slot_paths):
            sub_module = getattr(getattr(block, parent), child)
            found = get_all_linear_layers(sub_module, typ='list')
            layers += found
            positions += [[block_idx, slot, inner] for inner in range(len(found))]
    return layers, positions

In [8]:
def calc_all_eigs(layers):
    """Compute, per layer, the minimum over axis 0 of the eigen-decomposition of its split matrix.

    For every layer the weight is passed to ``split_matrix``, ``.eig()`` is
    called on the result, and ``torch.min(..., axis=0).values`` of that is
    collected. The collected values are packed with ``torch.Tensor(...)``.

    Args:
        layers: iterable of objects exposing a ``weight`` attribute.

    Returns:
        ``torch.Tensor`` built from the per-layer minima.

    NOTE(review): ``.eig()`` is called on whatever ``split_matrix`` returns;
    if that is a ``torch.Tensor``, recent PyTorch releases no longer provide
    ``Tensor.eig`` (``torch.linalg.eig`` replaces it) — confirm against the
    installed version.
    NOTE(review): ``torch.Tensor(list)`` presumably only accepts per-layer
    minima that are single-element; callers that unpack a 2-D ``.shape`` from
    the result would need ``torch.stack`` and equal widths — confirm.
    """
    return torch.Tensor(
        [
            torch.min(split_matrix(layer.weight).eig(), axis=0).values
            for layer in layers
        ]
    )

In [12]:
l,la=get_all_linear_layers_transformer(m2)

In [13]:
l,la=get_all_linear_layers_transformer(m2)

In [9]:
def find_split_layers(model,percent=20,layer_percent=20):
    """Select the layers that hold enough of the globally smallest eigenvalue entries.

    The layers and their positions come from
    ``get_all_linear_layers_transformer(model)`` and the eigenvalue table from
    ``calc_all_eigs``. The ``percent`` % smallest entries of the flattened
    table are attributed to their row, and a row's layer is selected when its
    hit count reaches ``layer_percent`` % of that layer's ``out_features``.

    Assumes ``calc_all_eigs`` returns a 2-D tensor whose row ``i`` belongs to
    the ``i``-th collected layer (required by the ``shape`` unpacking and by
    indexing the layer list with the row number) — TODO confirm.

    Args:
        model: transformer accepted by ``get_all_linear_layers_transformer``.
        percent: share (0-100) of all table entries to consider, smallest first.
        layer_percent: share (0-100) of a layer's ``out_features`` that must be
            hit for the layer to be selected.

    Returns:
        List of ``[layer, block_index, slot, index_in_slot, hit_count]``
        entries, ordered by the first time each layer was hit.
    """
    l, la = get_all_linear_layers_transformer(model)
    eigs = calc_all_eigs(l)
    _, c = eigs.shape
    rank = eigs.flatten().argsort()

    # shape is indexed, not called, and the cut-off must be an int for slicing.
    total = rank.shape[0]
    limit = max(0, min(total, int(total * percent / 100)))

    # Count hits per row. tolist() yields plain ints so they can be used as
    # dict keys and list indices; a flat position maps to its row via the
    # number of columns (pos // c), not the number of layers.
    hits_per_layer = {}
    for pos in rank[:limit].tolist():
        layer_num = pos // c
        hits_per_layer[layer_num] = hits_per_layer.get(layer_num, 0) + 1

    sel_layer_data = []
    for layer_num, hits in hits_per_layer.items():
        layer = l[layer_num]
        # out_features matches the attribute used everywhere else in this file.
        if hits >= layer.out_features * (layer_percent / 100):
            sel_layer_data.append([layer] + la[layer_num] + [hits])
    return sel_layer_data

In [None]:
def ret_growth_model(block, num):
    """Return the sub-module of ``block`` selected by the slot code ``num``.

    Slot codes: 0 -> ``attn.qkv``, 1 -> ``attn.proj``, 2 -> ``mlp.fc1``;
    any other value -> ``mlp.fc2``.
    """
    slot_paths = (("attn", "qkv"), ("attn", "proj"), ("mlp", "fc1"))
    for code, (parent, child) in enumerate(slot_paths):
        if num == code:
            return getattr(getattr(block, parent), child)
    # Fallback for every code that is not 0, 1 or 2.
    return block.mlp.fc2

def assign_model(model,block,num,gb):
    """Store ``gb`` in the sub-module slot ``num`` of ``model.blocks[block]``.

    Slot codes: 0 -> ``attn.qkv``, 1 -> ``attn.proj``, 2 -> ``mlp.fc1``;
    any other value -> ``mlp.fc2``.

    Returns:
        The same ``model`` object, modified in place.
    """
    slot_paths = (("attn", "qkv"), ("attn", "proj"), ("mlp", "fc1"))
    target = model.blocks[block]
    for code, (parent, child) in enumerate(slot_paths):
        if num == code:
            setattr(getattr(target, parent), child, gb)
            break
    else:
        # No code matched: everything else lands in the second MLP layer.
        target.mlp.fc2 = gb
    return model

def split_nodewise(model,percent=20,layer_percent=20,act_on=True):
    """Split every layer chosen by ``find_split_layers`` and install the rebuilt growth module.

    For each selected entry ``[layer, block, slot, index, count]`` the
    function follows the same steps as ``GrowthStep``: it derives the node
    ``choices`` from ``calculate_max_layer``, builds the reduced ``old_layer``
    and the replacement layers from ``return_layers``, and swaps a new
    ``GrowthModel`` into the corresponding block slot via ``assign_model``.

    Args:
        model: transformer whose ``blocks`` hold the growth modules.
        percent: forwarded to ``find_split_layers``.
        layer_percent: forwarded to ``find_split_layers``.
        act_on: forwarded to ``GrowthModel``.

    Returns:
        ``model`` with the selected slots replaced.

    Raises:
        ValueError: if a selected layer is not found inside its growth module.
    """
    # Forward the thresholds; previously the defaults were always used.
    sel_layers_data = find_split_layers(model, percent=percent, layer_percent=layer_percent)
    for layer_data in sel_layers_data:
        layer = layer_data[0]
        block = layer_data[1]
        growth_block = layer_data[2]

        _, _, neg_index = calculate_max_layer([layer])
        # Keep only as many candidate nodes as were counted for this layer.
        neg_index = neg_index[:layer_data[-1]]
        # Same derivation as GrowthStep: first element of each entry, sorted.
        choices = sorted(n[0] for n in neg_index)
        perm = permutation(choices, layer.out_features)

        growth_module = ret_growth_model(model.blocks[block], growth_block)
        # "list" matches the other call sites; the result is sliced below.
        layers = get_all_linear_layers(growth_module, "list")

        # Locate the selected layer by identity and remove it from the list
        # (the list itself must survive for the re-assembly below).
        s_layer = None
        for i, candidate in enumerate(layers):
            if candidate is layer:
                s_layer = i
                break
        if s_layer is None:
            raise ValueError("selected layer not found in its growth module")
        layers.pop(s_layer)
        print(s_layer)

        _, ow = ret_new_weights(layer.weight, choices)
        _, ob = ret_new_bias(layer.bias, choices)
        ow.requires_grad = True
        ob.requires_grad = True
        old_layer = nn.Linear(layer.in_features, layer.out_features - len(neg_index))
        old_layer.weight = nn.Parameter(ow)
        old_layer.bias = nn.Parameter(ob)

        new_layer, feature_bottleneck, old_split, skip_fc, int_metrics = return_layers(
            layer, choices, num_nodes=len(neg_index) + 10,
            samp_size=10000, reg_epochs=100, verbose=True)

        layers = layers[:s_layer] + [old_layer, new_layer, feature_bottleneck, old_split, skip_fc] + layers[s_layer:]
        layers = {str(i): layers[i] for i in range(len(layers))}

        # NOTE(review): GrowthStep reads architecture_array from the growth
        # module it is given, so the same object is used here rather than the
        # whole transformer — confirm the attribute lives on the growth module.
        _, _, architecture_array = return_arc_array(growth_module.architecture_array, 0, s_layer, perm)
        model = assign_model(model, block, growth_block, GrowthModel(layers, architecture_array, act_on))
    return model
        
    

In [17]:
def split(model):
    """Run one ``GrowthStep`` on each of the four linear slots of every block.

    ``GrowthStep`` returns a tuple whose first element is the rebuilt growth
    module; only that element is stored back into the block (the remaining
    weights/bias/permutation/metrics entries are discarded here). The
    attention slots use ``act_on=False``; the MLP slots use ``GrowthStep``'s
    default.

    Args:
        model: transformer whose ``blocks`` expose ``attn.qkv``,
            ``attn.proj``, ``mlp.fc1`` and ``mlp.fc2``.

    Returns:
        The same ``model`` object, modified in place.
    """
    for block in model.blocks:
        # [0] picks the new module out of GrowthStep's result tuple; assigning
        # the whole tuple would not give the block a usable sub-module.
        block.attn.qkv = GrowthStep(block.attn.qkv, act_on=False)[0]
        block.attn.proj = GrowthStep(block.attn.proj, act_on=False)[0]
        block.mlp.fc1 = GrowthStep(block.mlp.fc1)[0]
        block.mlp.fc2 = GrowthStep(block.mlp.fc2)[0]
    return model

In [18]:
def GrowthStep(model,follow_neg=True,nodes=5,act_on=True):
    """Split the layer picked by ``calculate_max_layer`` and rebuild the growth module.

    The layer list of ``model`` is obtained with
    ``get_all_linear_layers(model, "list")``. The picked layer is removed and
    replaced, at the same position, by five layers: a reduced ``old_layer``
    plus the four layers returned by ``return_layers``. A new ``GrowthModel``
    is built from the resulting list and an updated architecture array.

    Args:
        model: growth module exposing ``architecture_array``.
        follow_neg: when equal to ``False`` only the first ``nodes`` entries
            of the candidate list are kept.
        nodes: cap used when ``follow_neg`` is ``False``.
        act_on: forwarded to ``GrowthModel``.

    Returns:
        Tuple ``(new_model, nw, ow, nb, ob, perm, int_metrics)`` where
        ``nw``/``ow`` come from ``ret_new_weights``, ``nb``/``ob`` from
        ``ret_new_bias``, ``perm`` from ``permutation`` and ``int_metrics``
        from ``return_layers``.
    """
    linear_layers = get_all_linear_layers(model, "list")
    s_layer, layer, neg_index = calculate_max_layer(linear_layers)

    if follow_neg == False:
        neg_index = neg_index[:nodes]
    # Never split off more than half of the layer's outputs: halve the list.
    if len(neg_index) > layer.out_features / 2:
        neg_index = neg_index[:len(neg_index) // 2]

    choices = sorted(entry[0] for entry in neg_index)
    print(choices)

    # Result is not used below; the call is kept as in the original flow.
    chosen_index = c_index(layer.out_features, choices)
    perm = permutation(choices, layer.out_features)
    print(s_layer)

    layer = linear_layers.pop(s_layer)
    nw, ow = ret_new_weights(layer.weight, choices)
    nb, ob = ret_new_bias(layer.bias, choices)
    ow.requires_grad = True
    ob.requires_grad = True

    split_count = len(neg_index)
    old_layer = nn.Linear(layer.in_features, layer.out_features - split_count)
    old_layer.weight = nn.Parameter(ow)
    old_layer.bias = nn.Parameter(ob)

    new_layer, feature_bottleneck, old_split, skip_fc, int_metrics = return_layers(
        layer,
        choices,
        num_nodes=split_count + 10,
        samp_size=10000,
        reg_epochs=1000,
        verbose=True,
    )

    # Put the five replacement layers where the removed layer used to be.
    replacement = [old_layer, new_layer, feature_bottleneck, old_split, skip_fc]
    rebuilt = linear_layers[:s_layer] + replacement + linear_layers[s_layer:]
    layer_dict = {str(idx): module for idx, module in enumerate(rebuilt)}

    _, _, architecture_array = return_arc_array(model.architecture_array, 0, s_layer, perm)
    model1 = GrowthModel(layer_dict, architecture_array, act_on)
    return model1, nw, ow, nb, ob, perm, int_metrics

In [19]:
def return_arc_array(a_array,i_num,sel_layer,positional):
    """Number the leaves of a nested architecture array and mark one of them as split.

    An architecture array is a list whose entries are either ``0`` (a leaf)
    or a pair ``[child_array, tag]``. Leaves are numbered consecutively in
    traversal order, starting at ``i_num``.

    Args:
        a_array: nested architecture array.
        i_num: number given to the first leaf.
        sel_layer: leaf number to replace in the rebuilt array.
        positional: tag stored with the replacement entry.

    Returns:
        Tuple ``(numbered, next_num, rebuilt)``: ``numbered`` mirrors
        ``a_array`` with every leaf replaced by its number, ``next_num`` is
        the number following the last leaf, and ``rebuilt`` mirrors
        ``a_array`` except that leaf ``sel_layer`` becomes
        ``[[0, 0, 0, 0, 0], positional]``.
    """
    def _walk(nodes, counter):
        numbered = []
        rebuilt = []
        for node in nodes:
            if node == 0:
                numbered.append(counter)
                rebuilt.append(0 if counter != sel_layer else [[0, 0, 0, 0, 0], positional])
                counter += 1
                continue
            # Nested entry: recurse into the child array, keep its tag as is.
            child_numbered, counter, child_rebuilt = _walk(node[0], counter)
            numbered.append([child_numbered, node[1]])
            rebuilt.append([child_rebuilt, node[1]])
        return numbered, counter, rebuilt

    return _walk(a_array, i_num)


In [20]:
return_arc_array([[[[[0,0,0,0,0],[1,2]],0,0,0,0],[3,4]],0],0,6,[5,6])

([[[[[0, 1, 2, 3, 4], [1, 2]], 5, 6, 7, 8], [3, 4]], 9],
 10,
 [[[[[0, 0, 0, 0, 0], [1, 2]], 0, [[0, 0, 0, 0, 0], [5, 6]], 0, 0], [3, 4]],
  0])

In [9]:
print(sum([p.numel() for p in model.parameters()]))

5910800


In [10]:
print(sum([p.numel() for p in m2.parameters()]))

7134032


In [21]:
l[0].weight.shape

torch.Size([2304, 768])

In [3]:
a = torch.tensor([[1,2,3,4],[0,7,9,8]])
r,c=a.shape

In [7]:
torch.min(a,axis=0).values

tensor([0, 2, 3, 4])

In [8]:
a.flatten().argsort().view(2,4)

tensor([[4, 0, 1, 2],
        [3, 5, 7, 6]])

In [17]:
7//4  

1

In [19]:
7- (7//4 *c)

3

In [6]:
split_1 = torch.load(r"C:\Users\akash\Downloads\1.pt")

In [7]:
split_2 =torch.load(r"C:\Users\akash\Downloads\2.pt")

In [8]:
split_1

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=576, bias=True)
          )
        )
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=192, bias=True)
          )
        )
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=768, bias=True)
          )
        )
        

In [9]:
split_2

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=576, bias=True)
          )
        )
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=192, bias=True)
          )
        )
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=768, bias=True)
          )
        )
        

In [6]:
one = torch.load(r"C:\Users\akash\Downloads\1.pt")

In [16]:
inp=torch.rand((1,3,224,224)).cuda()

In [17]:
op = one(inp)

In [18]:
op.shape

torch.Size([1, 10])

In [19]:
two = torch.load(r"C:\Users\akash\Downloads\2.pt")

In [21]:
op2=two(inp)

RuntimeError: mat1 dim 1 must match mat2 dim 0

In [23]:
two

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=576, bias=True)
          )
        )
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=192, bias=True)
          )
        )
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=768, bias=True)
          )
        )
        

In [24]:
one

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=576, bias=True)
          )
        )
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=192, bias=True)
          )
        )
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): GrowthModel(
          (layer_dict): ModuleDict(
            (0): Linear(in_features=192, out_features=768, bias=True)
          )
        )
        