In [33]:
from util import *
from pprint import pp
# Configuration files for the two-layer fully-connected example:
# the hardware description, the workload (einsums), and the fused mapping.
architecture, workload, mapping = (
    'architecture.yaml',
    'Configs/two_fc.workload.yaml',
    'Configs/data_parallel_fc.mapping.yaml',
)

# Display the architecture description before running anything.
show_config(architecture)

variables:
  global_cycle_seconds: 1e-9
  technology: "45nm"

architecture:
  version: 0.4
  nodes:
  - !Component
    name: MainMemory
    class: DRAM
    attributes: {width: 256, block_size: 32, word_bits: 8, datawidth: 8}
    required_actions: ['read', 'write']
  - !Component
    name: GlobalBuffer
    class: SRAM
    attributes:
        depth: 8192
        width: 256
        block_size: 32
        word_bits: 8
        datawidth: 8
        n_rdwr_ports: 2
        n_rd_ports: 0
        n_wr_ports: 0
    required_actions: ['read', 'write']
  - !Component
    name: MACC
    class: intmac
    attributes:
        datawidth: 8
        width: 16
        cycle_time: 1e-9
    required_actions: ['compute']


In [34]:
# Display the workload config currently selected by `workload`
# (the recorded output lists the einsum shapes and their data spaces).
show_config(workload)

problem:
  - shape:
      name: Fc1
      dimensions: [ P1, M1, C1 ]
      data_spaces:
      - name: Fmap1
        dimensions: [ Fmap1_C, Fmap1_P ]
        projection: '[ C1, P1 ]'
      - name: Filter1
        dimensions: [ Filter1_C, Filter1_M ]
        projection: '[ C1, M1 ]'
      - name: Fmap2
        dimensions: [ Fmap2_C, Fmap2_P ]
        projection: '[ M1, P1 ]'
        read_write: True

    instance: >-
      0 <= P1 < 128 and 0 <= M1 < 64 and 0 <= C1 < 64

  - shape:
      name: Fc2
      dimensions: [ P2, M2, C2 ]
      data_spaces:
      - name: Fmap2
        dimensions: [ Fmap2_C, Fmap2_P ]
        projection: '[ C2, P2 ]'
      - name: Filter2
        dimensions: [ Filter2_C, Filter2_M ]
        projection: '[ C2, M2 ]'
      - name: Fmap3
        dimensions: [ Fmap3_C, Fmap3_P ]
        projection: '[ M2, P2 ]'
        read_write: True

    instance: >-
      0 <= P2 < 128 and 0 <= M2 < 64 and 0 <= C2 < 64



In [35]:
# Display the mapping config currently selected by `mapping`
# (the recorded output is a tree of storage / temporal / spatial / compute nodes).
show_config(mapping)

mapping:
  type: fused
  nodes:
  - type: storage
    target: 0  # level 0 is bound to global DRAM
    dspace: [Filter1, Filter2, Fmap1, Fmap3] #-------- node 2.a
  - type: storage
    target: 1  # level 1 is bound to GlobalBuffer
    dspace: [Filter1, Filter2]
  - type: temporal #---------------------------------- node 2.b
    rank: P2
    tile_shape: 1
  - type: storage  #---------------------------------- node 2.c
    target: 1  # level 1 is bound to GlobalBuffer
    dspace: [Fmap1, Fmap2, Fmap3]
  - type: sequential  #------------------------------- node 2.d
    branches:
    - - type: spatial
        rank: C1
        tile_shape: 16 #---------------------- split into 4 tiles along channels
      - type: temporal
        rank: M1
        tile_shape: 1
      - type: compute
        einsum: Fc1
        target: 2  # level 2 is bound to MACC
    - - type: spatial
        rank: C2
        tile_shape: 16
      - type: temporal
        rank: M2
        tile_shape: 1
      - type: compute
        einsum: F

In [36]:
from pytimeloop.looptree.run import run_looptree

# Map each `target` level index used in the mapping file to the name of the
# architecture component it is bound to (index 0 is the outermost memory).
bindings = dict(enumerate(('MainMemory', 'GlobalBuffer', 'MACC')))

In [37]:
# Evaluate the selected architecture / workload / mapping with LoopTree.
# `bindings` ties the mapping's target levels to architecture components;
# `call_accelergy=True` is passed through so energy numbers are reported.
config_files = [architecture, workload, mapping]
stats = run_looptree(CONFIG_DIR, config_files, TMP_DIR, bindings, call_accelergy=True)

# Report latency, then energy keyed by (component, action).
print('Latency:', stats.latency)
print('Energy:')
pp(stats.energy)

Latency: 16384
Energy:
{('MainMemory', 'read'): 33554432.0,
 ('GlobalBuffer', 'read'): 255179358.208,
 ('GlobalBuffer', 'write'): 3771662.336,
 ('MainMemory', 'write'): 16777216.0,
 ('MACC', 'compute'): 3434086.4}


In [44]:
# Switch the experiment to the transformer workload and its data-parallel
# mapping; the architecture description stays the same.
architecture = 'architecture.yaml'
workload, mapping = (
    'Configs/transformer.workload.yaml',
    'Configs/data_parallel_transformer.mapping.yaml',
)
# To return to the two-layer FC example, use instead:
#   workload = 'Configs/two_fc.workload.yaml'
#   mapping = 'Configs/data_parallel_fc.mapping.yaml'

In [45]:
# Re-run the LoopTree evaluation with the currently selected config files,
# reusing the same level-to-component `bindings` as before.
# NOTE(review): the recorded output of this cell is a yaml-cpp RuntimeError,
# so one of the selected YAML files presumably fails to parse -- verify the
# transformer workload/mapping configs before relying on this cell.
run_args = (CONFIG_DIR, [architecture, workload, mapping], TMP_DIR, bindings)
stats = run_looptree(*run_args, call_accelergy=True)

# Report latency, then the energy breakdown.
print('Latency:', stats.latency)
print('Energy:')
pp(stats.energy)

RuntimeError: yaml-cpp: error at line 