# Imports & open Device

In [1]:
import time
import torch
import ttnn

torch.manual_seed(0)
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening user mode device driver
[32m2024-01-04 22:37:38.426[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 1 PCI device
[32m2024-01-04 22:37:38.441[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 0 device_id: 0xfaca revision: 0)
[32m2024-01-04 22:37:38.526[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Disable PCIE DMA
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | AI CLK for device 0 is:   1202 MHz


# Enable program cache

In [2]:
ttnn.enable_program_cache()

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | Program Cache: enabled.


# Matrix Multiplications 

# Constants

In [3]:
b = 8
n = 12
s = 384
h = 1024

In [4]:
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
A = ttnn.to_device(A, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

In [5]:
B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

In [6]:
C = torch.randn((h, s), dtype=torch.bfloat16)
C = ttnn.from_torch(C)
C = ttnn.to_layout(C, ttnn.TILE_LAYOUT)
C = ttnn.to_device(C, device, memory_config=ttnn.L1_MEMORY_CONFIG)

In [7]:
D = torch.randn((s, s), dtype=torch.bfloat16)
D = ttnn.from_torch(D)
D = ttnn.to_layout(D, ttnn.TILE_LAYOUT)
D = ttnn.to_device(D, device, memory_config=ttnn.L1_MEMORY_CONFIG)

# Matmul 1

In [8]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat16, 
        #core_grid=(b, n),
        core_grid=(6,6),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 1.2992770671844482 seconds!
[8, 384, 1024]


In [9]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat16, 
        #core_grid=(b, n),
        core_grid=(8,8),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 0.7126712799072266 seconds!
[8, 384, 1024]


# Rerun to utilize program cache for speedup

In [10]:
"""
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)
"""

'\nstart = time.time()\nM1 = ttnn.matmul(\n        A,\n        B,\n        memory_config=ttnn.DRAM_MEMORY_CONFIG,\n        dtype=ttnn.bfloat16, \n        core_grid=(b, n),\n    )\nR1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)\nend = time.time()\nduration = end - start\nprint("Took: " +  str(duration) + " seconds!")\nR1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)\nprint(R1.shape)\n'

### Use L1 memory

In [11]:
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
# put on L1 moemory 
A = ttnn.to_device(A, device, memory_config=ttnn.L1_MEMORY_CONFIG)


B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.L1_MEMORY_CONFIG)



start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.L1_MEMORY_CONFIG, # place on L1 memory
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
#R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
#print(R1.shape)

Took: 0.6838667392730713 seconds!


# Matmul 2 : data type bfloat8_b

In [12]:
start = time.time()
M2 = ttnn.matmul(
        R1,
        C,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R2 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

Took: 1.1955561637878418 seconds!
[8, 384, 1024]


In [13]:
start = time.time()
M2 = ttnn.matmul(
        R1,
        C,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R2 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

Took: 0.0015702247619628906 seconds!
[8, 384, 1024]


# Matmul3 followed by softmax

In [14]:
M3 = ttnn.matmul(
        M2,
        D,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )

In [17]:
#ttnn.softmax
start = time.time()
R3 = ttnn.softmax(M3, -1)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R3 = ttnn.to_layout(R3, ttnn.ROW_MAJOR_LAYOUT)
print(R3[:, 1:10, 0])

Took: 1.2932393550872803 seconds!
Tensor([ [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399],
    [0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399, 0.00259399]], dtype=bfloat16 )



### # Visualize results

In [16]:
# print parts of M1 and M2; for slicing we need to change the layout to row_major
M1 = ttnn.to_layout(M1, ttnn.ROW_MAJOR_LAYOUT)
print("M1: ", M1[1, :4,:4])

M2 = ttnn.to_layout(M2, ttnn.ROW_MAJOR_LAYOUT)
print("M2: ", M2[1, :4,:4])

M3 = ttnn.to_layout(M3, ttnn.ROW_MAJOR_LAYOUT)
print("M3: ", M3[1, :4,:4])

M1:  Tensor([ [8.9375, 3.65625, 19.25, -44.5],
    [13.625, -25, 0.851562, 71.5],
    [-2.78125, -2.73438, -0.554688, -9.0625],
    [48.25, 21, -6.0625, 40.25]], dtype=bfloat16 )

M2:  Tensor([ [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0]], dtype=bfloat16 )

M3:  Tensor([ [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0]], dtype=bfloat16 )



# Matmul optim

# Defragment L1 memory Space

In [None]:
ttnn.close(device)