# Imports & open Device

In [1]:
import time
import torch
import ttnn

torch.manual_seed(0)
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening user mode device driver
[32m2024-01-05 00:01:10.723[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 1 PCI device
[32m2024-01-05 00:01:10.738[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 0 device_id: 0xfaca revision: 0)
[32m2024-01-05 00:01:10.814[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Disable PCIE DMA
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | AI CLK for device 0 is:   1202 MHz


# Enable program cache

In [2]:
ttnn.enable_program_cache()

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | Program Cache: enabled.


# Matrix Multiplications 

# Constants

In [3]:
b = 8
n = 12
s = 384
h = 1024

# Define matrix A and B and place them on DRAM

In [4]:
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
A = ttnn.to_device(A, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

In [5]:
B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

# Define matrix C and D and place them on L1 

In [6]:
C = torch.randn((h, s), dtype=torch.bfloat16)
C = ttnn.from_torch(C)
C = ttnn.to_layout(C, ttnn.TILE_LAYOUT)
C = ttnn.to_device(C, device, memory_config=ttnn.L1_MEMORY_CONFIG)

In [7]:
D = torch.randn((s, s), dtype=torch.bfloat16)
D = ttnn.from_torch(D)
D = ttnn.to_layout(D, ttnn.TILE_LAYOUT)
D = ttnn.to_device(D, device, memory_config=ttnn.L1_MEMORY_CONFIG)

# Matmul 1

In [8]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
        #core_grid=(6,6),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 1.2370569705963135 seconds!
[8, 384, 1024]


# Rerun matmul to take advantage of program cache speedup

In [9]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
        #core_grid=(6,6),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 0.002558469772338867 seconds!
[8, 384, 1024]


### Use L1 memory

In [10]:
# Re-dfine matrices A and B and place them on L1 memory 
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
# put on L1 moemory 
A = ttnn.to_device(A, device, memory_config=ttnn.L1_MEMORY_CONFIG)


B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.L1_MEMORY_CONFIG)



start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.L1_MEMORY_CONFIG, # place on L1 memory
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
#R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
#print(R1.shape)

Took: 0.567232608795166 seconds!


# Rerun matmul to take advantage of program cache speedup

In [11]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.L1_MEMORY_CONFIG, # place on L1 memory
        dtype=ttnn.bfloat16, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
#R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
#print(R1.shape)

Took: 0.0025701522827148438 seconds!


# Matmul 2 : data type bfloat8_b

In [12]:
start = time.time()
M2 = ttnn.matmul(
        R1,
        C,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R2 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

Took: 1.2832131385803223 seconds!
[8, 384, 1024]


# Rerun matmul to take advantage of program cache speedup

In [13]:
start = time.time()
M2 = ttnn.matmul(
        R1,
        C,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG) # place on L1 memory
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R2 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

Took: 0.0024156570434570312 seconds!
[8, 384, 1024]


# the dtype did not speed up the matmul, is this due to L1 fragmentation? 

In [14]:
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
A = ttnn.to_device(A, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)


start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 0.701779842376709 seconds!
[8, 384, 1024]


In [15]:
start = time.time()
M1 = ttnn.matmul(
        A,
        B,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, 
        core_grid=(b, n),
    )
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG,)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

Took: 0.0016560554504394531 seconds!
[8, 384, 1024]


# Matmul3 followed by softmax

In [None]:
start = time.time()
M3 = ttnn.matmul(
        M2,
        D,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )

#ttnn.softmax
R3 = ttnn.softmax(M3, -1)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R3 = ttnn.to_layout(R3, ttnn.ROW_MAJOR_LAYOUT)
print(R3[:, 1:10, 0])

In [None]:
# Rerun matmul to take advantage of program cache speedup

In [None]:
start = time.time()
M3 = ttnn.matmul(
        M2,
        D,
        memory_config=ttnn.L1_MEMORY_CONFIG,
        dtype=ttnn.bfloat8_b, # use float8 data type
        core_grid=(b, n), # specify grid cores to run matmul on
    )

#ttnn.softmax
R3 = ttnn.softmax(M3, -1)
end = time.time()
duration = end - start
print("Took: " +  str(duration) + " seconds!")
R3 = ttnn.to_layout(R3, ttnn.ROW_MAJOR_LAYOUT)
print(R3[:, 1:10, 0])

In [None]:
ttnn.close(device)

# Defragment L1 memory Space