# Imports & open Device

In [1]:
import time
import torch
import ttnn

# Seed torch's RNG so the torch.randn inputs created in later cells are repeatable.
torch.manual_seed(0)
# Index of the device to open; the handle `device` is passed to ttnn.to_device below.
device_id = 0
device = ttnn.open(device_id)

[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0
[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening user mode device driver
[32m2024-01-05 02:04:45.117[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected 1 PCI device
[32m2024-01-05 02:04:45.139[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using 1 Hugepages/NumHostMemChannels for TTDevice (pci_interface_id: 0 device_id: 0xfaca revision: 0)
[32m2024-01-05 02:04:45.221[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Disable PCIE DMA
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | AI CLK for device 0 is:   1202 MHz


# Enable program cache

In [2]:
# Turn on ttnn's program cache; the "Rerun matmul" cells below repeat each op to show its effect on run time.
ttnn.enable_program_cache()

[38;2;000;128;000m                     Op[0m | [1m[38;2;100;149;237mINFO    [0m | Program Cache: enabled.


# Matrix Multiplications 

# Constants

In [3]:
# Problem dimensions shared by the cells below:
#   b    - leading dimension of A, which has shape (b, s, h)
#   s, h - A's remaining two dimensions; B is (h, h), C is (h, s), D is (s, s)
#   n    - paired with b as the matmul `core_grid=(b, n)` argument
b, n = 8, 12
s, h = 384, 1024

# Define matrices A and B and place them in DRAM

In [4]:
# A, shape (b, s, h): random bfloat16 host tensor -> ttnn tensor ->
# tile layout (tilize before matmul) -> device DRAM.
A = ttnn.to_device(
    ttnn.to_layout(
        ttnn.from_torch(torch.randn((b, s, h), dtype=torch.bfloat16)),
        ttnn.TILE_LAYOUT,
    ),
    device,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
)

In [5]:
# B, shape (h, h): random bfloat16 host tensor -> ttnn tensor ->
# tile layout -> device DRAM.
B = ttnn.to_device(
    ttnn.to_layout(
        ttnn.from_torch(torch.randn((h, h), dtype=torch.bfloat16)),
        ttnn.TILE_LAYOUT,
    ),
    device,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
)

# Define matrices C and D and place them in L1

In [6]:
# C, shape (h, s): random bfloat16 host tensor -> ttnn tensor ->
# tile layout -> device L1 memory.
C = ttnn.to_device(
    ttnn.to_layout(
        ttnn.from_torch(torch.randn((h, s), dtype=torch.bfloat16)),
        ttnn.TILE_LAYOUT,
    ),
    device,
    memory_config=ttnn.L1_MEMORY_CONFIG,
)

In [7]:
# D, shape (s, s): random bfloat16 host tensor -> ttnn tensor ->
# tile layout -> device L1 memory.
D = ttnn.to_device(
    ttnn.to_layout(
        ttnn.from_torch(torch.randn((s, s), dtype=torch.bfloat16)),
        ttnn.TILE_LAYOUT,
    ),
    device,
    memory_config=ttnn.L1_MEMORY_CONFIG,
)

# Matmul 1

In [None]:
# Matmul 1: time A @ B followed by an elementwise add, keeping both results in DRAM.
# This is the first run of the op; the next cell repeats it unchanged.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
# NOTE(review): only host-side time is measured; if ttnn dispatches ops
# asynchronously the device may still be busy when the clock stops - confirm.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
    dtype=ttnn.bfloat16,
    core_grid=(b, n),
    # alternative grid tried: core_grid=(6, 6)
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert the result to row-major layout (outside the timed region) and show its shape.
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

# Rerun matmul to take advantage of program cache speedup

In [None]:
# Second, identical run of Matmul 1 (DRAM in / DRAM out) so its time can be
# compared with the first run now that the program cache is enabled.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
    dtype=ttnn.bfloat16,
    core_grid=(b, n),
    # alternative grid tried: core_grid=(6, 6)
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert the result to row-major layout (outside the timed region) and show its shape.
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

### Use L1 memory

In [None]:
# Re-define matrices A and B and place them in L1 memory
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
# put in L1 memory
A = ttnn.to_device(A, device, memory_config=ttnn.L1_MEMORY_CONFIG)

B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.L1_MEMORY_CONFIG)

# Time matmul + add with inputs and outputs all in L1.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.L1_MEMORY_CONFIG,  # place on L1 memory
    dtype=ttnn.bfloat16,
    core_grid=(b, n),
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.L1_MEMORY_CONFIG)  # place on L1 memory
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# R1 is deliberately NOT converted to row-major here (no to_layout / shape print):
# the "Matmul 2" cell feeds R1 straight into ttnn.matmul.
# Presumably it has to stay tiled for that - TODO confirm.

# Rerun matmul to take advantage of program cache speedup

In [None]:
# Second, identical run of the L1 matmul + add for comparison with the first run.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.L1_MEMORY_CONFIG,  # place on L1 memory
    dtype=ttnn.bfloat16,
    core_grid=(b, n),
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.L1_MEMORY_CONFIG)  # place on L1 memory
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# R1 is deliberately NOT converted to row-major here (no to_layout / shape print):
# the "Matmul 2" cell feeds R1 straight into ttnn.matmul.
# Presumably it has to stay tiled for that - TODO confirm.

# Matmul 2 : data type bfloat8_b

In [None]:
# Matmul 2: multiply the previous result R1 by C, requesting bfloat8_b output,
# then add the product to itself. Everything is kept in L1.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M2 = ttnn.matmul(
    R1,
    C,
    memory_config=ttnn.L1_MEMORY_CONFIG,
    dtype=ttnn.bfloat8_b,  # request bfloat8_b output
    core_grid=(b, n),  # specify grid cores to run matmul on
)
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG)  # place on L1 memory
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert *R2* (not R1) to row-major: the original converted R1 here, which
# threw away the Matmul 2 result and printed R1's shape under the name R2.
# NOTE(review): confirm ttnn.to_layout accepts a bfloat8_b tensor; the DRAM
# bfloat8_b cell further down makes the same kind of call.
R2 = ttnn.to_layout(R2, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

# Rerun matmul to take advantage of program cache speedup

In [None]:
# Second, identical run of Matmul 2 (R1 @ C with bfloat8_b output, all in L1)
# for comparison with the first run.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M2 = ttnn.matmul(
    R1,
    C,
    memory_config=ttnn.L1_MEMORY_CONFIG,
    dtype=ttnn.bfloat8_b,  # request bfloat8_b output
    core_grid=(b, n),  # specify grid cores to run matmul on
)
R2 = ttnn.add(M2, M2, memory_config=ttnn.L1_MEMORY_CONFIG)  # place on L1 memory
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert *R2* (not R1) to row-major: the original converted R1 here, which
# threw away the Matmul 2 result and printed R1's shape under the name R2.
# NOTE(review): confirm ttnn.to_layout accepts a bfloat8_b tensor; the DRAM
# bfloat8_b cell further down makes the same kind of call.
R2 = ttnn.to_layout(R2, ttnn.ROW_MAJOR_LAYOUT)
print(R2.shape)

# The bfloat8_b dtype did not speed up the matmul. Is this due to L1 fragmentation? Repeat the bfloat8_b matmul with inputs and outputs in DRAM to check.

In [None]:
# Re-create A and B in DRAM, then repeat the matmul + add with bfloat8_b output
# so the dtype can be evaluated without L1 placement being involved.
A = torch.randn((b, s, h), dtype=torch.bfloat16)
A = ttnn.from_torch(A)
# tilize before matmul
A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
A = ttnn.to_device(A, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

B = torch.randn((h, h), dtype=torch.bfloat16)
B = ttnn.from_torch(B)
B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
B = ttnn.to_device(B, device, memory_config=ttnn.DRAM_MEMORY_CONFIG)

# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
    dtype=ttnn.bfloat8_b,
    core_grid=(b, n),
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert the result to row-major layout (outside the timed region) and show its shape.
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

In [None]:
# Second, identical run of the DRAM bfloat8_b matmul + add for comparison
# with the first run.
# perf_counter() is used rather than time(): it is monotonic and has the
# resolution needed for sub-millisecond intervals.
start = time.perf_counter()
M1 = ttnn.matmul(
    A,
    B,
    memory_config=ttnn.DRAM_MEMORY_CONFIG,
    dtype=ttnn.bfloat8_b,
    core_grid=(b, n),
)
R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
end = time.perf_counter()
duration = end - start
print(f"Took: {duration} seconds!")

# Convert the result to row-major layout (outside the timed region) and show its shape.
R1 = ttnn.to_layout(R1, ttnn.ROW_MAJOR_LAYOUT)
print(R1.shape)

# Matmul3 followed by softmax

In [None]:
# NOTE(review): disabled draft of "Matmul 3 followed by softmax" (M2 @ D, then
# softmax over the last axis). Every line in this cell is commented out, so
# nothing runs here. TODO: re-enable or delete.
#start = time.time()
#M3 = ttnn.matmul(
#        M2,
#        D,
#        memory_config=ttnn.L1_MEMORY_CONFIG,
#        dtype=ttnn.bfloat8_b, # use float8 data type
#        core_grid=(b, n), # specify grid cores to run matmul on
#    )

#ttnn.softmax
#R3 = ttnn.softmax(M3, -1)
#end = time.time()
#duration = end - start
#print("Took: " +  str(duration) + " seconds!")
#R3 = ttnn.to_layout(R3, ttnn.ROW_MAJOR_LAYOUT)
#print(R3[:, 1:10, 0])

In [None]:
# Rerun matmul to take advantage of program cache speedup

In [None]:
# NOTE(review): disabled rerun of the "Matmul 3 followed by softmax" draft.
# Every line in this cell is commented out, so nothing runs here.
# TODO: re-enable or delete.
#start = time.time()
#M3 = ttnn.matmul(
#        M2,
#        D,
#        memory_config=ttnn.L1_MEMORY_CONFIG,
#        dtype=ttnn.bfloat8_b, # use float8 data type
#        core_grid=(b, n), # specify grid cores to run matmul on
#    )

#ttnn.softmax
#R3 = ttnn.softmax(M3, -1)
#end = time.time()
#duration = end - start
#print("Took: " +  str(duration) + " seconds!")
#R3 = ttnn.to_layout(R3, ttnn.ROW_MAJOR_LAYOUT)
#print(R3[:, 1:10, 0])

# Time different batch size and grid core size combinations 

In [8]:
# First pass over every (batch size, square grid size) combination: inputs in
# L1, matmul + add results in DRAM. Successful timings are computed but not
# printed; presumably this pass exists to populate the program cache before the
# reporting pass in the next cell - TODO confirm.
# NOTE(review): the loop rebinds the notebook-level `b` (and defines `g`), so
# cells run afterwards see b == 10.
batch = [1, 2, 4, 6, 8, 10]
grid = [1, 2, 4, 6, 8, 10]

for b in batch:
    for g in grid:
        A = torch.randn((b, s, h), dtype=torch.bfloat16)
        A = ttnn.from_torch(A)
        # tilize before matmul
        A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
        A = ttnn.to_device(A, device, memory_config=ttnn.L1_MEMORY_CONFIG)

        B = torch.randn((h, h), dtype=torch.bfloat16)
        B = ttnn.from_torch(B)
        B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
        B = ttnn.to_device(B, device, memory_config=ttnn.L1_MEMORY_CONFIG)

        try:
            # perf_counter(): monotonic, high-resolution clock for short intervals.
            start = time.perf_counter()
            M1 = ttnn.matmul(
                A,
                B,
                memory_config=ttnn.DRAM_MEMORY_CONFIG,
                dtype=ttnn.bfloat16,
                core_grid=(g, g),
            )
            R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
            end = time.perf_counter()
            duration = end - start
        except Exception as E:
            # Some combinations fail (see the FATAL lines in the recorded
            # output). Keep sweeping, but say which one failed and why instead
            # of silently swallowing the error.
            message = str(E)
            reason = message.splitlines()[0] if message else type(E).__name__
            print(f"b is: {b} g is: {g} ERROR: {reason}")
        


[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | Statically allocated circular buffers on core range [(x=0,y=0) - (x=1,y=1)] grow to 1564672 B which is beyond max L1 size of 1048576 B
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | Statically allocated circular buffers on core range [(x=0,y=0) - (x=1,y=1)] grow to 2154496 B which is beyond max L1 size of 1048576 B
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000m

# Rerun matmul to take advantage of program cache speedup

In [9]:
# Reporting pass over every (batch size, square grid size) combination: inputs
# in L1, matmul + add results in DRAM. Builds `table`, whose first row is the
# header and whose following rows hold one batch size each, with either the
# elapsed seconds (rounded to 6 places) or " ERROR " per grid size.
# NOTE(review): the loop rebinds the notebook-level `b` (and defines `g`), so
# cells run afterwards see b == 10.
batch = [1, 2, 4, 6, 8, 10]
grid = [1, 2, 4, 6, 8, 10]
# Header labels are derived from `grid` so they cannot drift out of sync with it.
grids = ["     "] + ["grid" + str(size) for size in grid]
table = [grids]

for b in batch:
    row = ["batch" + str(b)]
    for g in grid:
        A = torch.randn((b, s, h), dtype=torch.bfloat16)
        A = ttnn.from_torch(A)
        # tilize before matmul
        A = ttnn.to_layout(A, ttnn.TILE_LAYOUT)
        A = ttnn.to_device(A, device, memory_config=ttnn.L1_MEMORY_CONFIG)

        B = torch.randn((h, h), dtype=torch.bfloat16)
        B = ttnn.from_torch(B)
        B = ttnn.to_layout(B, ttnn.TILE_LAYOUT)
        B = ttnn.to_device(B, device, memory_config=ttnn.L1_MEMORY_CONFIG)

        try:
            # perf_counter(): monotonic, high-resolution clock for short intervals.
            start = time.perf_counter()
            M1 = ttnn.matmul(
                A,
                B,
                memory_config=ttnn.DRAM_MEMORY_CONFIG,
                dtype=ttnn.bfloat16,
                core_grid=(g, g),
            )
            R1 = ttnn.add(M1, M1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
            end = time.perf_counter()
            duration = end - start
            row.append(round(duration, 6))
            print("\n")
            print("b: " + str(b))
            print("g: " + str(g))
            print(f"Took: {duration} seconds!")
            print("\n")
        except Exception as E:
            # Record the failure in the table and report its first message
            # line, so the cause is visible without a full traceback dump.
            row.append(" ERROR ")
            message = str(E)
            reason = message.splitlines()[0] if message else type(E).__name__
            print("b is: " + str(b) + " g is: " + str(g))
            print("ERROR!")
            print("Exception: " + reason)
    table.append(row)

for row in table:
    print(row)


b is: 1 g is: 1
ERROR!
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.


b: 1
g: 2
Took: 0.0008168220520019531 seconds!




b: 1
g: 4
Took: 0.0007686614990234375 seconds!




b: 1
g: 6
Took: 0.0007557868957519531 seconds!




b: 1
g: 8
Took: 0.0008060932159423828 seconds!




b: 1
g: 10
Took: 0.0007746219635009766 seconds!


b is: 2 g is: 1
ERROR!
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.


b: 2
g: 2
Took: 0.0008080005645751953 seconds!




b: 2
g: 4
Took: 0.0007913112640380859 seconds!




b: 2
g: 6
Took: 0.0007541179656982422 seconds!




b: 2
g: 8
Took: 0.0007760524749755859 seconds!




b: 2
g: 10
Took: 0.0007596015930175781 seconds!


b is: 4 g is: 1
ERROR!
[38;2;000;128;000m                 Always[0m | [1m[38;2;255;000;000mFATAL   [0m | mcast_in1 is not implemented yet.
b is: 4 g is: 2
ERROR!
[38;2;000;128;000m              

In [None]:
# Placeholder 6x6 table of "Row r Col c" labels (1-based), printed one row per line.
table = [
    [f"Row {r} Col {c}" for c in range(1, 7)]
    for r in range(1, 7)
]
for row in table:
    print(row)

In [None]:
# Release the device handle that was opened in the first cell.
ttnn.close(device)

In [None]:
# NOTE(review): second ttnn.close(device) in a row - presumably a leftover
# duplicate of the previous cell. Confirm that closing an already-closed device
# is safe, or drop this cell.
ttnn.close(device)

# Defragment L1 memory Space