diff --git a/README.md b/README.md
index aeb146598..ef97cdc17 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,6 @@ To install the remaining basic dependencies, run:
 pip install -r requirements/requirements.txt
 pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB
 pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard
-python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels
 ```
 
 from the repository root.
@@ -106,6 +105,16 @@ from the repository root.
 
+### Fused Kernels
+We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels are built and loaded as needed. To avoid waiting at job launch, you can also pre-build them manually from a Python interpreter:
+
+```python
+from megatron.fused_kernels import load
+load()
+```
+
+This will automatically adapt the build process to the GPU vendor in use (AMD or NVIDIA) without platform-specific code changes. To test the fused kernels, run `pytest tests/model/test_fused_kernels.py`.
+
 ### Flash Attention
 
 To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
 
@@ -581,7 +590,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher,
 
 # Profiling
 
-We support profiling with Nsight Systems and PyTorch Memory Profiling.
+We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
 
 ## Nsight Systems Profiling
 
@@ -597,6 +606,15 @@ The generated output file can then be viewed with the Nsight Systems GUI:
 
 ![Alt text](images/nsight_profiling.png)
 
+## PyTorch Profiling
+
+To use the built-in PyTorch profiler, set the config options `profile`, `profile_step_start`, and `profile_step_stop`.
+
+The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within
+TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+
+![Alt text](images/pytorch_profiling.png)
+
 ## PyTorch Memory Profiling
 
 To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`.
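The new "PyTorch Profiling" README section is driven entirely by the three config options named above. As a minimal standalone sketch of how those options plug into `torch.profiler` (mirroring the `megatron/training.py` change later in this diff; the step values and `logdir` below are hypothetical stand-ins for whatever your config sets):

```python
# Sketch only: maps `profile_step_start` / `profile_step_stop` onto a
# torch.profiler schedule, as the megatron/training.py hunk below does.
# The concrete values and `logdir` are hypothetical stand-ins for config values.
import torch

profile_step_start = 10   # hypothetical: first step of the traced window
profile_step_stop = 12    # hypothetical: step at which tracing ends
logdir = "./tensorboard"  # hypothetical tensorboard log directory

prof = torch.profiler.profile(
    schedule=torch.profiler.schedule(
        wait=profile_step_start,  # idle steps before warm-up
        warmup=1,                 # one warm-up step before recording
        active=profile_step_stop - profile_step_start,  # steps recorded
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(logdir),
    record_shapes=True,
    profile_memory=True,
)
prof.start()
for step in range(profile_step_stop + 2):  # stand-in for the training loop
    prof.step()  # advance the profiler schedule once per training step
    torch.matmul(torch.randn(64, 64), torch.randn(64, 64))  # dummy work
prof.stop()
```

The resulting trace lands in `logdir` and can be viewed via the TensorBoard steps linked in the README section above.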
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 7e1438675..744008287 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 3ef5b66
+  Default = 1b85a2f
 
   current git hash of repository
 
diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
new file mode 100644
index 000000000..e85324dc6
Binary files /dev/null and b/images/pytorch_profiling.png differ
diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py
index c1ffe57a4..b100f0ca7 100644
--- a/megatron/fused_kernels/__init__.py
+++ b/megatron/fused_kernels/__init__.py
@@ -135,8 +135,8 @@ def _cpp_extention_load_helper(
         srcpath / "fused_rotary_positional_embedding.cpp",
         srcpath / "fused_rotary_positional_embedding_cuda.cu",
     ]
-    fused_rotary_positional_embedding_cuda = _cpp_extention_load_helper(
-        "fused_rotary_positional_embedding_cuda",
+    fused_rotary_positional_embedding = _cpp_extention_load_helper(
+        "fused_rotary_positional_embedding",
         sources,
         extra_cuda_flags,
         extra_include_paths,
@@ -174,7 +174,7 @@ def load_fused_kernels():
         print(e)
         print("=" * 100)
         print(
-            f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them"
+            "ERROR: Fused kernels configured but not properly installed. Please run `from megatron.fused_kernels import load` and then `load()` to load them correctly"
         )
         print("=" * 100)
         exit()
diff --git a/megatron/model/norms.py b/megatron/model/norms.py
index 8b06b177c..dda44659f 100644
--- a/megatron/model/norms.py
+++ b/megatron/model/norms.py
@@ -14,7 +14,6 @@
 import torch
 from torch.nn import LayerNorm as LayerNorm
-from .fused_layer_norm import MixedFusedLayerNorm
 
 
 def get_norm(neox_args):
@@ -23,7 +22,11 @@ def get_norm(neox_args):
         eps = neox_args.rms_norm_epsilon
     elif neox_args.norm == "layernorm":
         eps = neox_args.layernorm_epsilon
-        norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm
+        if neox_args.layernorm_fusion:
+            from .fused_layer_norm import MixedFusedLayerNorm
+            norm = MixedFusedLayerNorm
+        else:
+            norm = LayerNorm
     elif neox_args.norm == "scalenorm":
         eps = neox_args.scalenorm_epsilon
         norm = ScaleNorm
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 53ba28703..a41874971 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1063,8 +1063,8 @@ def calculate_derived(self):
             ), "Mamba does not yet have dropout implemented"
         if "rwkv" in self.attention_config:
             assert (
-                not self.is_pipe_parallel and self.model_parallel_size == 1
-            ), "RWKV not currently compatible with parallelism"
+                self.model_parallel_size == 1
+            ), "RWKV not currently compatible with model parallelism"
             if isinstance(self.zero_stage, int):
                 assert self.zero_stage <= 2, "Zero stage 3 not compatible with RWKV"
             assert (
diff --git a/megatron/training.py b/megatron/training.py
index 0d6b55236..6a67d36f8 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -891,7 +891,28 @@ def train(
 
     # to monitor if we've skipped many iterations in a row and trigger an early exit
     overflow_monitor = OverflowMonitor(optimizer)
+
+    if neox_args.profile:
+        schedule = torch.profiler.schedule(
+            wait=neox_args.profile_step_start,
+            warmup=1,
+            active=neox_args.profile_step_stop - neox_args.profile_step_start,
+        )
+        prof = torch.profiler.profile(
+            schedule=schedule,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                neox_args.tensorboard_dir
+            ),
+            record_shapes=True,
+            profile_memory=True,
+            with_flops=True,
+            with_modules=True,
+            with_stack=True,
+        )
+        prof.start()
     while iteration < neox_args.train_iters:
+        if neox_args.profile:
+            prof.step()
         if neox_args.profile and iteration == neox_args.profile_step_start:
             torch.cuda.cudart().cudaProfilerStart()
         loss_dict, skipped_iter = train_step(
@@ -904,6 +925,7 @@
         )
         if neox_args.profile and iteration == neox_args.profile_step_stop:
             torch.cuda.cudart().cudaProfilerStop()
+            prof.stop()
         iteration += 1
         neox_args.iteration = iteration
         if neox_args.precision == "fp16":
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 501edf345..3ac92598a 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,6 @@
-git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
+deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
 ftfy>=6.0.1
-git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
+lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
 huggingface_hub>=0.11.0
 jinja2==3.1.4
 lm_eval>=0.4.0,<=0.4.1
diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py
index cc458bf4a..125eb6c52 100644
--- a/tests/model/test_fused_kernels.py
+++ b/tests/model/test_fused_kernels.py
@@ -30,9 +30,7 @@
 )
 
 
-@pytest.mark.xfail(
-    reason="ModuleNotFoundError: No module named 'scaled_masked_softmax_cuda'"
-)
+@pytest.mark.xfail(reason="SystemExit: None")
 def test_load_fused_kernels():
     load()
     try:
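The `megatron/model/norms.py` hunk above replaces a module-level import with a branch-local one, so `MixedFusedLayerNorm` (and any extension build it triggers under the new JIT loading) is only imported when `layernorm_fusion` is enabled. A minimal sketch of that deferred-import pattern follows; the `try/except` fallback here is an illustrative addition, not part of the actual change, which simply lets an import failure propagate:

```python
# Deferred-import sketch: the fused implementation is imported only when the
# feature flag asks for it, so the default path never touches the extension.
# The try/except fallback is an illustrative addition (the real get_norm in
# norms.py has no fallback).
import torch
from torch.nn import LayerNorm


def get_layernorm_cls(layernorm_fusion: bool):
    if layernorm_fusion:
        try:
            # Only now is the fused module imported (and, with JIT loading,
            # potentially compiled).
            from megatron.model.fused_layer_norm import MixedFusedLayerNorm
            return MixedFusedLayerNorm
        except ImportError:
            return LayerNorm  # fall back if the extension is unavailable
    return LayerNorm


norm_cls = get_layernorm_cls(layernorm_fusion=False)
layer = norm_cls(512, eps=1e-5)
print(layer(torch.randn(2, 512)).shape)  # torch.Size([2, 512])
```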