### MODEL ARCHITECTURE

In [3]:
import torch.nn as nn
import torch

class LSTMModel3L(nn.Module):
    """Three chained ``nn.LSTM`` modules followed by a linear head.

    The recurrent blocks are stored as ``lstm_1``, ``lstm_2`` and ``lstm_3``
    and the head as ``fc``; these attribute names determine the state-dict
    keys. ``forward`` applies the head to the last time step only.
    """

    def __init__(self, in_dim=11, hidden_size=300, num_layers=1, output_size=2):
        super().__init__()
        # Settings shared by all three recurrent blocks; inputs are (batch, seq, feature).
        recurrent_cfg = dict(hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.lstm_1 = nn.LSTM(input_size=in_dim, **recurrent_cfg)
        self.lstm_2 = nn.LSTM(input_size=hidden_size, **recurrent_cfg)
        self.lstm_3 = nn.LSTM(input_size=hidden_size, **recurrent_cfg)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the three LSTMs and return ``fc`` of the final time step."""
        seq = x
        for block in (self.lstm_1, self.lstm_2, self.lstm_3):
            seq, _state = block(seq)  # the (h, c) state is not used
        final_step = seq[:, -1, :]
        return self.fc(final_step)

# Instantiate and load weights
import os

model = LSTMModel3L()
ckpt_path = "best_model.pt"  # <-- change to your checkpoint

# Only write a checkpoint when none exists yet. Saving unconditionally would
# overwrite a trained "best_model.pt" with this freshly initialised (random)
# model. The fallback file just lets the rest of the notebook run as a demo.
if not os.path.exists(ckpt_path):
    torch.save(model, ckpt_path)

# NOTE: weights_only=False unpickles arbitrary Python objects -- only load
# checkpoints that come from a source you trust.
state = torch.load(ckpt_path, map_location="cpu", weights_only=False)

# Some checkpoints save model.state_dict() inside a dict; others pickle the
# whole module.
if isinstance(state, dict) and "model_state_dict" in state:
    state = state["model_state_dict"]
elif isinstance(state, dict) and "state_dict" in state:
    state = state["state_dict"]
elif hasattr(state, "state_dict"):
    state = state.state_dict()

# Remove a leading "module." (added when saving from DDP). Only the prefix is
# stripped; str.replace would also rewrite the substring inside a key.
state = {
    (k[len("module."):] if k.startswith("module.") else k): v
    for k, v in state.items()
}

# Rebuild the model with the sizes stored in the checkpoint and load the
# weights, so `model` really holds the checkpointed parameters.
model = LSTMModel3L(
    in_dim=state["lstm_1.weight_ih_l0"].shape[1],       # weight_ih is (4H, in_dim)
    hidden_size=state["lstm_1.weight_hh_l0"].shape[1],  # weight_hh is (4H, H)
    output_size=state["fc.weight"].shape[0],            # fc.weight is (out, H)
)
model.load_state_dict(state)
model.eval()


### Dumping model weight and saving it in npz

In [4]:
import torch, numpy as np

ckpt_path = "best_model.pt"   # <-- change me

# NOTE: weights_only=False unpickles arbitrary Python objects; only use it on
# checkpoints you trust.
obj = torch.load(ckpt_path, map_location="cpu", weights_only=False)

# 1) Pick the right sub-dict containing tensors
if not isinstance(obj, dict):
    # Not a dict: accept anything exposing state_dict() (e.g. a pickled module).
    if not hasattr(obj, "state_dict"):
        raise ValueError("Unrecognized checkpoint format")
    sd = obj.state_dict()
else:
    # Wrapper keys are tried in this order before treating obj itself as the weights.
    wrapper_key = next(
        (name for name in ("model_state_dict", "state_dict") if name in obj),
        None,
    )
    if wrapper_key is not None:
        sd = obj[wrapper_key]
    elif any(torch.is_tensor(val) for val in obj.values()):
        sd = obj
    else:
        raise ValueError("Could not find model weights in checkpoint dict.")

# 2) Flatten one level if some entries are nested dicts
#    (entries that are neither tensors nor dicts are dropped)
flat = {}
for name, entry in sd.items():
    if torch.is_tensor(entry):
        flat[name] = entry
    elif isinstance(entry, dict):
        for sub_name, sub_entry in entry.items():
            if torch.is_tensor(sub_entry):
                flat[f"{name}.{sub_name}"] = sub_entry

# 3) Strip common wrappers (DDP/Lightning/etc.)
def strip_prefix(k):
    """Return ``k`` without its leading wrapper prefix, if it has one.

    The candidates ``"module."``, ``"model."``, ``"net."`` and ``"student."``
    are tried in that order and only the first match is removed (once).
    Keys without any of these prefixes are returned unchanged.
    """
    wrappers = ("module.", "model.", "net.", "student.")
    matched = next((w for w in wrappers if k.startswith(w)), None)
    return k if matched is None else k[len(matched):]

# Normalise every key by removing a wrapper prefix, keeping the tensors as-is.
flat = {strip_prefix(name): tensor for name, tensor in flat.items()}

# 4) Save to NPZ
npz_path = "state_dict_npz.npz"
# Each tensor is detached and moved to CPU before conversion to a NumPy array.
arrays = {name: tensor.detach().cpu().numpy() for name, tensor in flat.items()}
np.savez(npz_path, **arrays)
print(f"✅ Saved {len(flat)} tensors → {npz_path}")
print("Sample keys:", list(flat))


✅ Saved 14 tensors → state_dict_npz.npz
Sample keys: ['lstm_1.weight_ih_l0', 'lstm_1.weight_hh_l0', 'lstm_1.bias_ih_l0', 'lstm_1.bias_hh_l0', 'lstm_2.weight_ih_l0', 'lstm_2.weight_hh_l0', 'lstm_2.bias_ih_l0', 'lstm_2.bias_hh_l0', 'lstm_3.weight_ih_l0', 'lstm_3.weight_hh_l0', 'lstm_3.bias_ih_l0', 'lstm_3.bias_hh_l0', 'fc.weight', 'fc.bias']


### Checking dict keys

In [5]:
# List the parameter names held in `flat` (the dict that was written to the NPZ file).
print(list(flat.keys()))

['lstm_1.weight_ih_l0', 'lstm_1.weight_hh_l0', 'lstm_1.bias_ih_l0', 'lstm_1.bias_hh_l0', 'lstm_2.weight_ih_l0', 'lstm_2.weight_hh_l0', 'lstm_2.bias_ih_l0', 'lstm_2.bias_hh_l0', 'lstm_3.weight_ih_l0', 'lstm_3.weight_hh_l0', 'lstm_3.bias_ih_l0', 'lstm_3.bias_hh_l0', 'fc.weight', 'fc.bias']


### Creating the TF (Keras) model and exporting it

In [10]:
import os

# TensorFlow picks this flag up while the library is loading, so it has to be
# in the environment before the first `import tensorflow` in the kernel;
# assigning it after the import is too late. If TensorFlow is already imported
# in this kernel, restart the kernel for the setting to apply.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"   # optional for stable numerics

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ---- config (static) ----
BATCH, SEQ = 1, 100
HID, OUT   = 300, 2
NPZ        = "state_dict_npz.npz"
SAVE_DIR   = "tf_export_lstm3_unrolled_static"

sd = np.load(NPZ)

# infer IN_DIM from lstm_1 weights; the stack has 3 LSTM layers
Wi1 = sd["lstm_1.weight_ih_l0"]           # (4H, IN_DIM)
IN_DIM = Wi1.shape[1]
# Compare against 4*HID directly so a first dimension that is not a multiple
# of 4 cannot slip through via integer division.
assert Wi1.shape[0] == 4 * HID, "HID mismatch with NPZ weights"
N_LAYERS = 3

# ---- build 3×LSTM (unrolled) ----
inp = keras.Input(shape=(SEQ, IN_DIM), name="input")
x = inp
for i in range(1, N_LAYERS + 1):
    # Layer names match the "lstm_{i}" prefixes of the NPZ keys so the weights
    # can be looked up by name when they are copied in.
    x = layers.LSTM(
        HID, return_sequences=True, unroll=True,
        unit_forget_bias=False, activation="tanh",
        recurrent_activation="sigmoid", name=f"lstm_{i}"
    )(x)
# Keep only the last time step, mirroring x[:, -1, :] in the PyTorch model.
x = layers.Lambda(lambda t: t[:, -1, :], name="take_last_t")(x)
out = layers.Dense(OUT, name="fc")(x)
model = keras.Model(inp, out, name="LSTMModel3L_unrolled")

# ---- copy weights ----
def copy_lstm(i):
    """Copy the weights of PyTorch layer ``lstm_{i}`` from ``sd`` into the Keras layer of the same name.

    The two weight matrices are transposed and the two PyTorch bias vectors
    are summed into the single bias passed to ``set_weights``.
    """
    kernel = sd[f"lstm_{i}.weight_ih_l0"].T       # stored as (4H, in)
    recurrent = sd[f"lstm_{i}.weight_hh_l0"].T    # stored as (4H, H)
    bias = sd[f"lstm_{i}.bias_ih_l0"] + sd[f"lstm_{i}.bias_hh_l0"]  # both (4H,)
    target = model.get_layer(f"lstm_{i}")
    target.set_weights([kernel, recurrent, bias])

for layer_idx in range(1, N_LAYERS + 1):
    copy_lstm(layer_idx)

# fc.weight is stored as (OUT, HID) and is transposed for the Keras Dense layer;
# fc.bias (OUT,) is passed through unchanged.
model.get_layer("fc").set_weights([sd["fc.weight"].T, sd["fc.bias"]])
print("Weights copied ✅")

# quick run: a random (unseeded) input only checks that the forward pass works
smoke_input = np.random.randn(BATCH, SEQ, IN_DIM).astype(np.float32)
y = model(smoke_input)
print("Forward OK, output shape:", y.shape)

# ---- export SavedModel (static signature) ----
os.makedirs(SAVE_DIR, exist_ok=True)

# Fully static input shape (batch, sequence length and feature size are all fixed).
serving_spec = tf.TensorSpec([BATCH, SEQ, IN_DIM], tf.float32, name="input")


@tf.function(input_signature=[serving_spec])
def serve(x):
    """Run the Keras model in inference mode and return its result under the key "output"."""
    return {"output": model(x, training=False)}


serving_fn = serve.get_concrete_function()
tf.saved_model.save(model, SAVE_DIR, signatures={"serving_default": serving_fn})
print("SavedModel →", SAVE_DIR)

def _write_tflite(converter, filename):
    """Convert with ``converter`` and write the result to ``SAVE_DIR/filename``.

    Returns the path that was written. The file is opened in a ``with`` block
    so the handle is flushed and closed deterministically instead of being
    left to garbage collection.
    """
    path = os.path.join(SAVE_DIR, filename)
    with open(path, "wb") as fh:
        fh.write(converter.convert())
    return path


# NOTE(review): the parity cells further down record NaN outputs from
# model_fp32.tflite while the SavedModel matches PyTorch. The cause is not
# established in this notebook -- investigate before relying on these files.

# ---- TFLite FP32 ----
conv = tf.lite.TFLiteConverter.from_saved_model(SAVE_DIR)
print("TFLite FP32 →", _write_tflite(conv, "model_fp32.tflite"))

# ---- TFLite FP16 ----
# Default optimizations with float16 as the only supported type.
conv = tf.lite.TFLiteConverter.from_saved_model(SAVE_DIR)
conv.optimizations = [tf.lite.Optimize.DEFAULT]
conv.target_spec.supported_types = [tf.float16]
print("TFLite FP16 →", _write_tflite(conv, "model_fp16.tflite"))


2025-11-19 16:12:54.555529: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Weights copied ✅
Forward OK, output shape: (1, 2)
INFO:tensorflow:Assets written to: tf_export_lstm3_unrolled_static/assets


INFO:tensorflow:Assets written to: tf_export_lstm3_unrolled_static/assets


SavedModel → tf_export_lstm3_unrolled_static


W0000 00:00:1763548979.973434  322125 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1763548979.973459  322125 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-11-19 16:12:59.973737: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: tf_export_lstm3_unrolled_static
2025-11-19 16:12:59.980963: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-11-19 16:12:59.980979: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: tf_export_lstm3_unrolled_static
I0000 00:00:1763548980.050670  322125 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled
2025-11-19 16:13:00.060177: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-11-19 16:13:00.184634: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: tf_export_lstm3_unrolled_static
2025-11-19 16:13:00.295498: I tensorf

TFLite FP32 → tf_export_lstm3_unrolled_static/model_fp32.tflite


W0000 00:00:1763548985.183335  322125 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1763548985.183358  322125 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-11-19 16:13:05.183497: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: tf_export_lstm3_unrolled_static
2025-11-19 16:13:05.191144: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-11-19 16:13:05.191159: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: tf_export_lstm3_unrolled_static
2025-11-19 16:13:05.261787: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-11-19 16:13:05.384534: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: tf_export_lstm3_unrolled_static
2025-11-19 16:13:05.494318: I tensorflow/cc/saved_model/loader.cc:471] SavedModel load for tags { serve }; Status: success: OK. Took 310824 microseconds.

TFLite FP16 → tf_export_lstm3_unrolled_static/model_fp16.tflite


### Saving a sample input/output pair from the PyTorch model

In [12]:
# ---- PyTorch parity export ----
import numpy as np, torch, torch.nn as nn

ckpt_path = "best_model.pt"  # <-- change me
SEQ = 100
HID = 300
OUT = 2


class LSTMModelV3(nn.Module):
    """Three chained ``nn.LSTM`` modules plus a linear head that also exposes per-layer outputs.

    ``forward`` returns ``(y, (o1, o2, o3))`` where ``o1..o3`` are the full
    output sequences of ``lstm_1..lstm_3`` and ``y`` is ``fc`` applied to the
    last time step of ``o3``.
    """

    def __init__(self, in_dim=11, hidden_size=HID, num_layers=1, output_size=OUT):
        super().__init__()
        shared = dict(hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.lstm_1 = nn.LSTM(input_size=in_dim, **shared)
        self.lstm_2 = nn.LSTM(input_size=hidden_size, **shared)
        self.lstm_3 = nn.LSTM(input_size=hidden_size, **shared)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the head output and a tuple with each LSTM's output sequence."""
        per_layer = []
        seq = x
        for block in (self.lstm_1, self.lstm_2, self.lstm_3):
            seq, _state = block(seq)
            per_layer.append(seq)
        y = self.fc(seq[:, -1, :])
        return y, tuple(per_layer)

# load checkpoint (handles common formats)
# NOTE: weights_only=False unpickles arbitrary Python objects -- only load
# checkpoints that come from a source you trust.
obj = torch.load(ckpt_path, map_location="cpu", weights_only=False)
if hasattr(obj, "state_dict"):
    sd = obj.state_dict()
elif isinstance(obj, dict) and "model_state_dict" in obj:
    sd = obj["model_state_dict"]
elif isinstance(obj, dict) and "state_dict" in obj:
    sd = obj["state_dict"]
else:
    sd = obj
# Strip only a *leading* "module." (DDP); str.replace would also rewrite the
# substring inside a key.
sd = {
    (k[len("module."):] if k.startswith("module.") else k): v
    for k, v in sd.items()
}

in_dim = sd["lstm_1.weight_ih_l0"].shape[1]   # weight_ih is (4H, in_dim)
m = LSTMModelV3(in_dim=in_dim).eval()
# strict=False tolerates extra checkpoint entries, but a parameter missing from
# the checkpoint would silently stay randomly initialised and make the
# reference outputs meaningless -- so fail loudly in that case.
load_result = m.load_state_dict(sd, strict=False)
if load_result.missing_keys:
    raise RuntimeError(
        f"checkpoint {ckpt_path!r} is missing parameters: {load_result.missing_keys}"
    )
if load_result.unexpected_keys:
    print("warning: ignoring unexpected checkpoint keys:", load_result.unexpected_keys)

# fixed input (seeded so the reference can be regenerated)
rng = np.random.default_rng(123)
x_np = rng.standard_normal((1, SEQ, in_dim), dtype=np.float32)
with torch.no_grad():
    y_pt, (o1,o2,o3) = m(torch.from_numpy(x_np))
y_pt = y_pt.cpu().numpy()
o1,o2,o3 = [t.cpu().numpy() for t in (o1,o2,o3)]

# Store the input, the final output and every intermediate LSTM output
# together with the sizes used to produce them.
np.savez("parity_io_pt.npz",
         x=x_np, y_pt=y_pt, o1=o1, o2=o2, o3=o3,
         in_dim=in_dim, hid=HID, seq=SEQ, out_dim=OUT)
print("wrote parity_io_pt.npz")


wrote parity_io_pt.npz


### Checking Tf model output

In [None]:
# ---- TF parity check that doesn't require keras load_model on SavedModel ----
import os

# TensorFlow picks this flag up while the library is loading, so it has to be
# in the environment before the first `import tensorflow` in the kernel;
# assigning it after the import is too late. If TensorFlow is already imported
# in this kernel, restart the kernel for the setting to apply.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

MODEL_DIR = "tf_export_lstm3_unrolled_static"  # your exported SavedModel
NPZ_WEIGHTS = "state_dict_npz.npz"             # Torch → NPZ weights
PT_REF = "parity_io_pt.npz"                    # PyTorch reference (per-layer + final)

# 1) Load NPZ weights and infer shapes
sd = np.load(NPZ_WEIGHTS)
Wi1 = sd["lstm_1.weight_ih_l0"]           # (4H, IN_DIM)
HID = Wi1.shape[0] // 4
IN_DIM = Wi1.shape[1]
SEQ = 100     # use the same you trained/exported
OUT = sd["fc.weight"].shape[0]
print(f"IN_DIM={IN_DIM}, HID={HID}, OUT={OUT}, SEQ={SEQ}")

# 2) Rebuild the unrolled 3×LSTM Keras model (to expose intermediate layers)
inp = keras.Input(shape=(SEQ, IN_DIM), name="input")
x = inp
for i in range(1, 4):   # layers lstm_1 .. lstm_3
    # Layer names match the "lstm_{i}" prefixes of the NPZ keys so the weights
    # can be looked up by name when they are copied in.
    x = layers.LSTM(
        HID, return_sequences=True, unroll=True,
        unit_forget_bias=False, activation="tanh",
        recurrent_activation="sigmoid", name=f"lstm_{i}"
    )(x)
# Keep only the last time step before the dense head.
x = layers.Lambda(lambda t: t[:, -1, :], name="take_last_t")(x)
out = layers.Dense(OUT, name="fc")(x)
model = keras.Model(inp, out, name="LSTMModel3L_unrolled")

# 3) Copy weights from NPZ → Keras
def copy_lstm(i):
    """Load the NPZ weights of ``lstm_{i}`` into the Keras layer with the same name.

    Both weight matrices are transposed; the input and hidden bias vectors are
    added together to form the single bias handed to ``set_weights``.
    """
    w_ih = sd[f"lstm_{i}.weight_ih_l0"]   # (4H, in)
    w_hh = sd[f"lstm_{i}.weight_hh_l0"]   # (4H, H)
    summed_bias = sd[f"lstm_{i}.bias_ih_l0"] + sd[f"lstm_{i}.bias_hh_l0"]  # (4H,)
    keras_layer = model.get_layer(f"lstm_{i}")
    keras_layer.set_weights([w_ih.T, w_hh.T, summed_bias])

for i in range(1, 4):
    copy_lstm(i)

W = sd["fc.weight"]  # (OUT, HID)
b = sd["fc.bias"]    # (OUT,)
model.get_layer("fc").set_weights([W.T, b])

print("Weights set on rebuilt Keras model ✅")

# 4) Load PyTorch reference I/O
ref = np.load(PT_REF)
x_np  = ref["x"]
y_pt  = ref["y_pt"]
o1_pt = ref["o1"]; o2_pt = ref["o2"]; o3_pt = ref["o3"]
print("Loaded PyTorch reference ✅")

# 5) Get intermediate outputs from rebuilt Keras model
l1 = model.get_layer("lstm_1").output
l2 = model.get_layer("lstm_2").output
l3 = model.get_layer("lstm_3").output
# Build the probe model from `model.input` (the single input tensor) rather
# than `model.inputs` (a one-element list). With the list form, calling the
# probe with a bare array triggers the Keras input-structure warning
# ("Expected: ['input'] Received: inputs=Tensor(...)").
mid = keras.Model(model.input, [l1, l2, l3, model.output])

o1_tf, o2_tf, o3_tf,  y_tf = mid(x_np, training=False)
o1_tf, o2_tf, o3_tf,  y_tf = [np.asarray(t, dtype=np.float32) for t in (o1_tf,o2_tf,o3_tf,y_tf)]

# 6) Also verify the exported SavedModel final output equals the rebuilt Keras output
loaded = tf.saved_model.load(MODEL_DIR)
infer = loaded.signatures["serving_default"]
y_saved = infer(tf.constant(x_np))["output"].numpy().astype(np.float32)

def report(name, a, b):
    """Print the max, mean and RMS percentage error of ``a`` relative to ``b``.

    The error is ``|a - b| / max(|b|, 1e-8) * 100`` element-wise; ``name`` is
    left-aligned in a 7-character column and the shape of ``a`` is shown.
    """
    reference_mag = np.maximum(np.abs(b), 1e-8)  # floor avoids division by zero
    pct_err = (np.abs(a - b) / reference_mag) * 100
    rmse_pct = np.sqrt((pct_err**2).mean())
    fields = [
        f"{name:7s}",
        f"shape={a.shape}",
        f"max%={pct_err.max():.3f}%",
        f"mean%={pct_err.mean():.3f}%",
        f"RMSE%={rmse_pct:.3f}%",
    ]
    print("  ".join(fields))


# (label, TensorFlow value, PyTorch reference) for every stage that is compared.
layer_pairs = [
    ("LSTM1", o1_tf, o1_pt),
    ("LSTM2", o2_tf, o2_pt),
    ("LSTM3", o3_tf, o3_pt),
    ("FC_OUT", y_tf, y_pt),
]

print("\n--- TF vs PyTorch (per-layer + FC) ---")
for label, tf_val, pt_val in layer_pairs:
    report(label, tf_val, pt_val)

print("\n--- SavedModel vs rebuilt Keras (final FC) ---")
report("SM_OUT", y_saved, y_tf)

# The pass/fail gate uses the maximum *absolute* difference, not the
# percentages printed above.
tol = 5e-6
worst_abs = [np.max(np.abs(tf_val - pt_val)) for _, tf_val, pt_val in layer_pairs]
ok_layers = all(err < tol for err in worst_abs)
ok_export = np.max(np.abs(y_saved - y_tf)) < tol
verdict = "PASS ✅" if (ok_layers and ok_export) else "MISMATCH ❌"
print("\nPARITY:", verdict)


IN_DIM=11, HID=300, OUT=2, SEQ=100
Weights set on rebuilt Keras model ✅
Loaded PyTorch reference ✅


Expected: ['input']
Received: inputs=Tensor(shape=(1, 100, 11))



--- TF vs PyTorch (per-layer + FC) ---
LSTM1    shape=(1, 100, 300)  max%=0.098%  mean%=0.000%  RMSE%=0.001%
LSTM2    shape=(1, 100, 300)  max%=0.257%  mean%=0.000%  RMSE%=0.002%
LSTM3    shape=(1, 100, 300)  max%=1.138%  mean%=0.000%  RMSE%=0.007%
FC_OUT   shape=(1, 2)  max%=0.000%  mean%=0.000%  RMSE%=0.000%

--- SavedModel vs rebuilt Keras (final FC) ---
SM_OUT   shape=(1, 2)  max%=0.000%  mean%=0.000%  RMSE%=0.000%

PARITY: PASS ✅


### Checking with tflite fp32 model

In [None]:
import os
import numpy as np
import tensorflow as tf

# ---------- Paths ----------
MODEL_DIR   = "tf_export_lstm3_unrolled_static"
TFLITE_FP32 = os.path.join(MODEL_DIR, "model_fp32.tflite")
PT_REF      = "parity_io_pt.npz"

# ---------- Load reference input & PyTorch output ----------
# NOTE: the name `io` shadows the standard-library module of the same name
# for the rest of the kernel session.
io = np.load(PT_REF)
x_np = io["x"]                  # shape (1, 100, IN_DIM)
y_pt = io["y_pt"].astype(np.float32)

# ---------- Get TensorFlow (SavedModel) output to use as TF reference ----------
saved = tf.saved_model.load(MODEL_DIR)
infer = saved.signatures["serving_default"]
y_tf = infer(tf.constant(x_np))["output"].numpy().astype(np.float32)

# ---------- Run TFLite FP32 ----------
# Prefer tflite_runtime if installed; otherwise fallback to tf.lite.Interpreter.
# Any exception raised while importing tflite_runtime (not only ImportError)
# selects the fallback.
try:
    import tflite_runtime.interpreter as tflite
    Interpreter = tflite.Interpreter
except Exception:
    Interpreter = tf.lite.Interpreter

# NOTE(review): the recorded output of this cell shows NaN for y_tfl while the
# SavedModel output matches PyTorch; the cause cannot be determined from this
# notebook. Uncommenting the override below forces tf.lite.Interpreter, which
# rules out the separately installed tflite_runtime as a factor -- unverified.
# Interpreter = tf.lite.Interpreter

# Single-threaded interpreter; tensors must be allocated before use.
interp = Interpreter(model_path=TFLITE_FP32, num_threads=1)
interp.allocate_tensors()

in_details  = interp.get_input_details()
out_details = interp.get_output_details()

# If the TFLite input isn't the shape we need, resize it
need_shape = x_np.shape
if tuple(in_details[0]["shape"]) != need_shape:
    interp.resize_tensor_input(in_details[0]["index"], need_shape, strict=False)
    # Resizing requires a fresh allocation; the tensor details are re-read afterwards.
    interp.allocate_tensors()
    in_details  = interp.get_input_details()
    out_details = interp.get_output_details()

# Ensure dtype is float32
x_feed = x_np.astype(np.float32)

# Feed the input, run the graph once, and read back the first output tensor.
interp.set_tensor(in_details[0]["index"], x_feed)
interp.invoke()
y_tfl = interp.get_tensor(out_details[0]["index"]).astype(np.float32)

# ---------- % error reporting ----------
def report_pct(name, pred, ref):
    """Print the max, mean and RMS percentage error of ``pred`` relative to ``ref``.

    The error is ``|pred - ref| / max(|ref|, 1e-8) * 100`` element-wise;
    ``name`` is left-aligned in a 10-character column.
    """
    safe_ref = np.maximum(np.abs(ref), 1e-8)    # floor avoids division by zero
    pct = (np.abs(pred - ref) / safe_ref) * 100.0
    rmse_pct = np.sqrt((pct**2).mean())
    header = f"{name:10s} shape={pred.shape}"
    stats = f"max%={pct.max():.6f}%  mean%={pct.mean():.6f}%  RMSE%={rmse_pct:.6f}%"
    print(header + "  " + stats)

def _worst_pct(pred, ref):
    """Largest element-wise percentage error of ``pred`` relative to ``ref`` (|ref| floored at 1e-8)."""
    return np.max(np.abs((pred - ref) / np.maximum(np.abs(ref), 1e-8))) * 100


print("\n--- TFLite FP32 vs TF (SavedModel) ---")
report_pct("TFLite→TF", y_tfl, y_tf)

print("\n--- TFLite FP32 vs PyTorch ---")
report_pct("TFLite→PT", y_tfl, y_pt)

# Optional gate (tight for FP32): pass if both comparisons are extremely close.
# A NaN error compares as "not below tolerance", so NaN outputs fail the gate.
tol_pct_max = 0.001  # 0.001% max abs % error
ok_tf = _worst_pct(y_tfl, y_tf) < tol_pct_max
ok_pt = _worst_pct(y_tfl, y_pt) < tol_pct_max
verdict = "PASS ✅" if (ok_tf and ok_pt) else "MISMATCH ❌"
print("\nPARITY (TFLite FP32):", verdict)



--- TFLite FP32 vs TF (SavedModel) ---
TFLite→TF  shape=(1, 2)  max%=nan%  mean%=nan%  RMSE%=nan%

--- TFLite FP32 vs PyTorch ---
TFLite→PT  shape=(1, 2)  max%=nan%  mean%=nan%  RMSE%=nan%

PARITY (TFLite FP32): MISMATCH ❌


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


### Parity check over 100 random input samples

In [8]:
# make_parity_pack.py  (run in a PyTorch env)
import numpy as np, torch, torch.nn as nn

ckpt_path = "best_model.pt"   # <-- set this

SEQ = 100
# Hidden size of the checkpointed model. The model written to "best_model.pt"
# earlier in this notebook uses hidden_size=300; a 400-unit model would not
# load it (parameter shape mismatch).
HID = 300
OUT = 2
N   = 100
SEED = 123

class LSTMModelV3(nn.Module):
    """Three chained ``nn.LSTM`` modules followed by a linear head.

    ``in_dim`` is required; ``hidden_size`` and ``output_size`` default to the
    notebook constants ``HID`` and ``OUT``. ``forward`` returns ``fc`` applied
    to the last time step of the third LSTM's output.
    """

    def __init__(self, in_dim, hidden_size=HID, num_layers=1, output_size=OUT):
        super().__init__()
        self.lstm_1 = nn.LSTM(in_dim,      hidden_size, num_layers, batch_first=True)
        self.lstm_2 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.lstm_3 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc     = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through the three LSTMs and return the head output for the final time step."""
        x, _ = self.lstm_1(x)
        x, _ = self.lstm_2(x)
        x, _ = self.lstm_3(x)
        return self.fc(x[:, -1, :])

# load checkpoint
# weights_only=False is passed explicitly, like the other checkpoint-loading
# cells: a checkpoint pickled as a whole nn.Module cannot be read with
# weights_only=True (the default in recent PyTorch releases).
# NOTE: this unpickles arbitrary Python objects -- only load checkpoints that
# come from a source you trust.
obj = torch.load(ckpt_path, map_location="cpu", weights_only=False)
if hasattr(obj, "state_dict"):
    sd = obj.state_dict()
elif isinstance(obj, dict) and "model_state_dict" in obj:
    sd = obj["model_state_dict"]
elif isinstance(obj, dict) and "state_dict" in obj:
    sd = obj["state_dict"]
else:
    sd = obj
# Strip only a *leading* "module." (DDP); str.replace would also rewrite the
# substring inside a key.
sd = {
    (k[len("module."):] if k.startswith("module.") else k): v
    for k, v in sd.items()
}

# Take the layer sizes from the checkpoint so the model always matches it.
in_dim = sd["lstm_1.weight_ih_l0"].shape[1]   # weight_ih is (4H, in_dim)
hid = sd["lstm_1.weight_hh_l0"].shape[1]      # weight_hh is (4H, H)
m = LSTMModelV3(in_dim, hidden_size=hid).eval()
# strict=False tolerates extra checkpoint entries, but a parameter missing from
# the checkpoint would silently stay randomly initialised and make the
# reference outputs meaningless -- so fail loudly in that case.
load_result = m.load_state_dict(sd, strict=False)
if load_result.missing_keys:
    raise RuntimeError(
        f"checkpoint {ckpt_path!r} is missing parameters: {load_result.missing_keys}"
    )
if load_result.unexpected_keys:
    print("warning: ignoring unexpected checkpoint keys:", load_result.unexpected_keys)

rng = np.random.default_rng(SEED)
X = rng.standard_normal((N, SEQ, in_dim), dtype=np.float32)  # (N, SEQ, IN_DIM): N independent samples
Y_pt = np.zeros((N, OUT), dtype=np.float32)

# Samples are evaluated one at a time so every forward pass uses batch size 1.
with torch.no_grad():
    for i in range(N):
        xi = torch.from_numpy(X[i:i+1])          # (1,SEQ,IN_DIM) keep batch=1
        yi = m(xi).cpu().numpy()                 # (1,OUT)
        Y_pt[i] = yi[0]

np.savez("parity_pack_pt.npz",
         x=X, y_pt=Y_pt, seq=SEQ, in_dim=in_dim, out_dim=OUT, n=N)
print("wrote parity_pack_pt.npz with", N, "samples")


wrote parity_pack_pt.npz with 100 samples


In [None]:
import os

# TensorFlow picks this flag up while the library is loading, so it has to be
# in the environment before the first `import tensorflow` in the kernel;
# assigning it after the import is too late. If TensorFlow is already imported
# in this kernel, restart the kernel for the setting to apply.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import numpy as np
import tensorflow as tf

MODEL_DIR   = "tf_export_lstm3_unrolled_static"
TFLITE_FP32 = os.path.join(MODEL_DIR, "model_fp32.tflite")
PACK        = "parity_pack_pt.npz"

# ---- load pack ----
pack = np.load(PACK)
X    = pack["x"]      # (N, SEQ, IN_DIM)
Y_pt = pack["y_pt"]   # (N, OUT)
N, SEQ, IN_DIM = X.shape
OUT = Y_pt.shape[-1]

# ---- TF SavedModel infer ----
saved = tf.saved_model.load(MODEL_DIR)
infer = saved.signatures["serving_default"]

# The samples are fed one at a time, each as a batch of size 1.
Y_tf = np.zeros((N, OUT), dtype=np.float32)
for i in range(N):
    xi = X[i:i+1]                                # (1,SEQ,IN_DIM)
    yi = infer(tf.constant(xi))["output"].numpy().astype(np.float32)  # (1,OUT)
    Y_tf[i] = yi[0]

# ---- TFLite FP32 infer ----
# Prefer the standalone tflite_runtime package when it can be imported; any
# exception during that import (not only ImportError) selects the interpreter
# bundled with TensorFlow instead.
try:
    import tflite_runtime.interpreter as tflite
    Interpreter = tflite.Interpreter
except Exception:
    Interpreter = tf.lite.Interpreter

# NOTE(review): the recorded output of this cell shows NaN for every TFLite
# result; the cause cannot be determined from this notebook. Uncommenting the
# override below forces tf.lite.Interpreter, which rules out the separately
# installed tflite_runtime as a factor -- unverified.
# Interpreter = tf.lite.Interpreter

# Single-threaded interpreter; tensors must be allocated before use.
interp = Interpreter(model_path=TFLITE_FP32, num_threads=1)
interp.allocate_tensors()
# Only the first input and first output tensor are used.
in_det  = interp.get_input_details()[0]
out_det = interp.get_output_details()[0]

def run_tfl(xi):  # xi: (1,SEQ,IN_DIM)
    """Run one sample through the TFLite interpreter and return its output as float32.

    If the shape recorded in ``in_det`` differs from ``xi.shape`` the input
    tensor is resized (non-strict) and tensors are re-allocated before the
    input is set. The input is cast to float32 before being fed.
    """
    input_index = in_det["index"]
    shape_matches = tuple(in_det["shape"]) == xi.shape
    if not shape_matches:
        interp.resize_tensor_input(input_index, xi.shape, strict=False)
        interp.allocate_tensors()
    interp.set_tensor(input_index, xi.astype(np.float32))
    interp.invoke()
    raw_output = interp.get_tensor(out_det["index"])
    return raw_output.astype(np.float32)  # (1,OUT)

# Collect the TFLite output for every sample, one batch-of-1 call at a time.
Y_tfl = np.zeros(shape=(N, OUT), dtype=np.float32)
for i in range(N):
    single_sample = X[i:i + 1]            # keeps the leading batch axis
    Y_tfl[i] = run_tfl(single_sample)[0]

# ---- % error helpers ----
def pct_err(pred, ref):
    """Element-wise ``|pred - ref|`` as a percentage of ``|ref|`` (with ``|ref|`` floored at 1e-8)."""
    abs_diff = np.abs(pred - ref)
    safe_ref = np.maximum(np.abs(ref), 1e-8)  # floor avoids division by zero
    return abs_diff / safe_ref * 100.0

def summarize_pct(name, pred, ref):
    """Print overall and per-sample percentage-error statistics for ``pred`` against ``ref``.

    Errors come from ``pct_err``; per-sample statistics are taken along
    axis 1. Returns ``(max_each, mean_each, rmse_each)``, one value per sample.
    """
    pe = pct_err(pred, ref)                          # (N, OUT)
    per_sample_max = np.max(pe, axis=1)              # per-sample max%
    per_sample_mean = np.mean(pe, axis=1)            # per-sample mean%
    per_sample_rmse = np.sqrt(np.mean(pe**2, axis=1))

    overall_rmse = np.sqrt((pe**2).mean())
    worst = int(np.argmax(per_sample_max))           # sample with the largest max%
    p95 = np.percentile(per_sample_max, 95)

    print(f"{name}:")
    print(f"  overall   -> max%={pe.max():.6f}%  mean%={pe.mean():.6f}%  RMSE%={overall_rmse:.6f}%")
    print(f"  per-sample-> max%: mean={per_sample_max.mean():.6f}%  p95={p95:.6f}%  max={per_sample_max.max():.6f}% @ idx={worst}")
    return per_sample_max, per_sample_mean, per_sample_rmse

print(f"Comparing N={N} samples | X={X.shape}, Y_pt={Y_pt.shape}, Y_tf={Y_tf.shape}, Y_tfl={Y_tfl.shape}\n")

# Only the per-sample max% array (first return value) is needed for the gates.
m_tfl_tf = summarize_pct("TFLite FP32 vs TF",  Y_tfl, Y_tf)[0]
m_tfl_pt = summarize_pct("TFLite FP32 vs PT",  Y_tfl, Y_pt)[0]
m_tf_pt  = summarize_pct("TF vs PT",           Y_tf,  Y_pt)[0]

# ---- pass gates ----
# A NaN error compares as "not below tolerance", so NaN outputs fail the gate.
tol_max_pct = 0.01  # 0.01% max abs % error
pass_tf = m_tf_pt.max() < tol_max_pct                                          # TF vs PT
pass_tfl = all(worst.max() < tol_max_pct for worst in (m_tfl_tf, m_tfl_pt))    # TFLite vs both
tf_mark = "✅" if pass_tf else "❌"
tfl_mark = "✅" if pass_tfl else "❌"
print("\nPASS (TF vs PT):", tf_mark)
print("PASS (TFLite FP32 vs TF & PT):", tfl_mark)


Comparing N=100 samples | X=(100, 100, 11), Y_pt=(100, 2), Y_tf=(100, 2), Y_tfl=(100, 2)

TFLite FP32 vs TF:
  overall   -> max%=nan%  mean%=nan%  RMSE%=nan%
  per-sample-> max%: mean=nan%  p95=nan%  max=nan% @ idx=0
TFLite FP32 vs PT:
  overall   -> max%=nan%  mean%=nan%  RMSE%=nan%
  per-sample-> max%: mean=nan%  p95=nan%  max=nan% @ idx=0
TF vs PT:
  overall   -> max%=0.002269%  mean%=0.000058%  RMSE%=0.000187%
  per-sample-> max%: mean=0.000094%  p95=0.000217%  max=0.002269% @ idx=79

PASS (TF vs PT): ✅
PASS (TFLite FP32 vs TF & PT): ❌


In [10]:
# First ten rows of the PyTorch reference outputs loaded from the parity pack.
print(Y_pt[:10])

[[ 0.2203637   1.2037135 ]
 [ 1.1469762   0.6399808 ]
 [ 1.7424899   0.52624094]
 [ 0.3726699   0.8699015 ]
 [-1.0564709   1.9162472 ]
 [ 1.2260323   0.27355912]
 [-0.5207463   1.3430268 ]
 [-0.01862868  0.28096434]
 [-0.8239329   0.8985657 ]
 [-0.49413708  1.8043662 ]]


In [11]:
# First ten rows of the SavedModel ("serving_default") outputs for the same samples.
print(Y_tf[:10])

[[ 0.22036368  1.2037143 ]
 [ 1.1469762   0.63998055]
 [ 1.7424885   0.5262406 ]
 [ 0.3726697   0.869901  ]
 [-1.0564705   1.916247  ]
 [ 1.2260323   0.27355888]
 [-0.52074635  1.3430262 ]
 [-0.01862887  0.28096434]
 [-0.8239332   0.8985659 ]
 [-0.49413693  1.8043659 ]]


In [12]:
# First ten rows of the TFLite FP32 interpreter outputs for the same samples.
print(Y_tfl[:10])

[[nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]]
