In [1]:
import torch
import numpy as np


print("\n--- Softmax Precision Demo ---")
logits_fp32 = torch.tensor([128.0] + [128.5], dtype=torch.float32)
logits_bf16 = logits_fp32.to(torch.bfloat16)

softmax_fp32 = torch.nn.functional.softmax(logits_fp32, dim=0)
softmax_bf16 = torch.nn.functional.softmax(logits_bf16, dim=0)

print("Softmax Output (float32):", softmax_fp32)
print("Softmax Output (bfloat16):", softmax_bf16) # since 1.00000001 x 2^7 , since bf16 has only 7 bits of mantissa it cant get the 1 in the 8th bit so it gets rounded to 128 
print("Difference:", (softmax_fp32 - softmax_bf16))
difference = softmax_fp32 - softmax_bf16
percentage_error = difference / softmax_fp32 * 100

print("Percentage Error (%):", percentage_error)


--- Softmax Precision Demo ---
Softmax Output (float32): tensor([0.3775, 0.6225])
Softmax Output (bfloat16): tensor([0.5000, 0.5000], dtype=torch.bfloat16)
Difference: tensor([-0.1225,  0.1225])
Percentage Error (%): tensor([-32.4361,  19.6735])


In [2]:
print("\n--- Matrix Multiplication Precision Demo ---")
A = torch.randn((5, 5), dtype=torch.float32) * 1000  # Large values
B = torch.randn((5, 5), dtype=torch.float32) * 1000

A_bf16 = A.to(torch.bfloat16)
B_bf16 = B.to(torch.bfloat16)

C_fp32 = A @ B
C_bf16 = A_bf16 @ B_bf16

print("Matrix Multiplication Result (float32):\n", C_fp32)
print("Matrix Multiplication Result (bfloat16):\n", C_bf16.to(torch.float32))
print("Difference:\n", (C_fp32 - C_bf16.to(torch.float32)))


--- Matrix Multiplication Precision Demo ---
Matrix Multiplication Result (float32):
 tensor([[ 4761508.5000,  3016870.7500,  -413075.7188, -4229289.5000,
         -1386271.0000],
        [ 1121975.1250, -2302104.7500, -1298255.6250,   567901.2500,
           277495.8750],
        [ 2676308.7500,  -825978.3750, -1779275.0000, -2562160.0000,
          1479074.2500],
        [-2551562.0000, -2716938.7500,  -620452.7500,  4154820.0000,
          -229077.9688],
        [-1324644.5000, -1441797.3750, -1320998.3750,   544282.6250,
          2094519.1250]])
Matrix Multiplication Result (bfloat16):
 tensor([[ 4751360.,  3014656.,  -409600., -4227072., -1384448.],
        [ 1122304., -2293760., -1294336.,   565248.,   276480.],
        [ 2670592.,  -823296., -1777664., -2572288.,  1482752.],
        [-2539520., -2703360.,  -618496.,  4145152.,  -231424.],
        [-1327104., -1433600., -1318912.,   544768.,  2097152.]])
Difference:
 tensor([[ 10148.5000,   2214.7500,  -3475.7188,  -2217.5000, 

In [3]:
print("\n--- Summation Precision Demo ---")
small_numbers = torch.tensor([1e-8] * 1000000, dtype=torch.float32)
large_number = torch.tensor([1e2], dtype=torch.float32)

sum_fp32 = large_number + small_numbers.sum()
sum_bf16 = large_number + small_numbers.to(torch.bfloat16).sum()

print("Summation Result (float32):", sum_fp32.item())
print("Summation Result (bfloat16):", sum_bf16.item())
print("Difference:", sum_fp32.item() - sum_bf16.item())


--- Summation Precision Demo ---
Summation Result (float32): 100.01000213623047
Summation Result (bfloat16): 100.010009765625
Difference: -7.62939453125e-06


In [4]:
print("\n--- Roundoff Error Calculation ---")
number_ranges = [(2, 4), (32, 64), (1024, 2048), (2**20, 2**21), (2**30, 2**31)]
mantissa_bits_bf16 = 7
mantissa_bits_fp32 = 23

for r in number_ranges:
    step_bf16 = (r[1] - r[0]) / (2 ** mantissa_bits_bf16)
    step_fp32 = (r[1] - r[0]) / (2 ** mantissa_bits_fp32)
    print(f"Range {r}: BFloat16 Roundoff Error = {step_bf16}, Float32 Roundoff Error = {step_fp32}")


--- Roundoff Error Calculation ---
Range (2, 4): BFloat16 Roundoff Error = 0.015625, Float32 Roundoff Error = 2.384185791015625e-07
Range (32, 64): BFloat16 Roundoff Error = 0.25, Float32 Roundoff Error = 3.814697265625e-06
Range (1024, 2048): BFloat16 Roundoff Error = 8.0, Float32 Roundoff Error = 0.0001220703125
Range (1048576, 2097152): BFloat16 Roundoff Error = 8192.0, Float32 Roundoff Error = 0.125
Range (1073741824, 2147483648): BFloat16 Roundoff Error = 8388608.0, Float32 Roundoff Error = 128.0


In [22]:
import torch

# Generate numbers between 2 and 4 with small increments
values_fp32 = torch.arange(2.0, 4.01, 0.01, dtype=torch.float32)
values_bf16 = values_fp32.to(torch.bfloat16)

# Print first few values to see the rounding effect
for i in range(10):  # Display only first 10 for brevity
    print(f"Original FP32: {values_fp32[i].item():.5f}, Rounded BF16: {values_bf16[i].item():.5f}")


Original FP32: 2.00000, Rounded BF16: 2.00000
Original FP32: 2.01000, Rounded BF16: 2.01562
Original FP32: 2.02000, Rounded BF16: 2.01562
Original FP32: 2.03000, Rounded BF16: 2.03125
Original FP32: 2.04000, Rounded BF16: 2.04688
Original FP32: 2.05000, Rounded BF16: 2.04688
Original FP32: 2.06000, Rounded BF16: 2.06250
Original FP32: 2.07000, Rounded BF16: 2.06250
Original FP32: 2.08000, Rounded BF16: 2.07812
Original FP32: 2.09000, Rounded BF16: 2.09375


In [24]:
import struct
import torch

def float_to_fp32_bf16_repr(value):
    # Convert to float32 (IEEE 754 Single Precision)
    fp32_bytes = struct.pack('f', value)  # Convert float to IEEE 754 binary format
    fp32_bits = ''.join(f'{b:08b}' for b in struct.unpack('4B', fp32_bytes))  # Convert bytes to bit string
    
    # Convert to bfloat16 (truncate lower 16 bits)
    bf16_tensor = torch.tensor(value, dtype=torch.bfloat16)
    bf16_bits = f'{bf16_tensor.view(torch.int16).item():016b}'  # Convert bfloat16 to bit string
    
    # Display results
    print(f"Value: {value}\n")
    print(f"Float32 (IEEE 754) Representation: {fp32_bits[:1]} {fp32_bits[1:9]} {fp32_bits[9:]}")
    print(f"BFloat16 Representation: {bf16_bits[:1]} {bf16_bits[1:9]} {bf16_bits[9:]}")

# Test with 195.25
float_to_fp32_bf16_repr(195.25)

Value: 195.25

Float32 (IEEE 754) Representation: 0 00000000 10000000100001101000011
BFloat16 Representation: 0 10000110 1000011


In [29]:
import struct
import torch

def float32_components(value: float):
    """
    Returns the sign, exponent, and mantissa (23 bits) of a float32 number.
    """
    # Pack the float into 32-bit binary (little-endian), then unpack as an unsigned int
    bits = struct.unpack('<I', struct.pack('<f', value))[0]
    sign = (bits >> 31) & 0x1
    exponent = (bits >> 23) & 0xFF
    mantissa = bits & 0x7FFFFF
    return sign, exponent, mantissa

def bf16_components(value: float):
    """
    Returns the sign, exponent, and mantissa (7 bits) of a bfloat16 number.
    """
    # Convert to bfloat16 using PyTorch, then interpret the 16 bits as an integer
    bf16_val = torch.tensor(value, dtype=torch.bfloat16).view(torch.int16).item() & 0xFFFF
    sign = (bf16_val >> 15) & 0x1
    exponent = (bf16_val >> 7) & 0xFF
    mantissa = bf16_val & 0x7F
    return sign, exponent, mantissa

def print_float32_bf16(value: float):
    """
    Prints the binary representation of `value` in float32 (IEEE 754) and bfloat16.
    """
    s32, e32, m32 = float32_components(value)
    s_bf16, e_bf16, m_bf16 = bf16_components(value)
    
    # Build binary strings
    float32_bin = f"{s32} {e32:08b} {m32:023b}"       # 1 + 8 + 23 = 32 bits
    bf16_bin    = f"{s_bf16} {e_bf16:08b} {m_bf16:07b}" # 1 + 8 + 7 = 16 bits
    
    print(f"Value: {value}")
    print(f"  Float32 (IEEE 754): {float32_bin}")
    print(f"  BFloat16:           {bf16_bin}\n")

# Test values
values = [30.0, 31.99999, 32.0, 32.000001,2 , 32.015625]
for v in values:
    print_float32_bf16(v)


Value: 30.0
  Float32 (IEEE 754): 0 10000011 11100000000000000000000
  BFloat16:           0 10000011 1110000

Value: 31.99999
  Float32 (IEEE 754): 0 10000011 11111111111111111111011
  BFloat16:           0 10000100 0000000

Value: 32.0
  Float32 (IEEE 754): 0 10000100 00000000000000000000000
  BFloat16:           0 10000100 0000000

Value: 32.000001
  Float32 (IEEE 754): 0 10000100 00000000000000000000000
  BFloat16:           0 10000100 0000000

Value: 2
  Float32 (IEEE 754): 0 10000000 00000000000000000000000
  BFloat16:           0 10000000 0000000

Value: 32.015625
  Float32 (IEEE 754): 0 10000100 00000000001000000000000
  BFloat16:           0 10000100 0000000

