From 9111a2c247a609aaf74b6c0ae852e8ef0ae31223 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Sun, 31 Jan 2021 18:56:03 +0200
Subject: [PATCH 01/55] habanalabs: support hint addresses range reservation

Add support for pre-determined driver-reserved device VA address ranges.
This is needed for future ASIC support where some contents must be
mapped into these pre-determined ranges because the H/W will be
configured using these ranges.

In case the user asks to map a VA without a hint address, avoid
allocating the device VA from the reserved ranges.

Make sure the validation checks of the hint address take into account
the situation where the DRAM page size is not a power of 2.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 22 ++++++++
 drivers/misc/habanalabs/common/memory.c     | 62 ++++++++++++++++++---
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6b3cdd7e068a36..0edc72e678c9da 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -381,6 +381,16 @@ struct hl_mmu_properties {
 	u8	host_resident;
 };
 
+/**
+ * struct hl_hints_range - hint addresses reserved va range.
+ * @start_addr: start address of the va range.
+ * @end_addr: end address of the va range.
+ */
+struct hl_hints_range {
+	u64 start_addr;
+	u64 end_addr;
+};
+
 /**
  * struct asic_fixed_properties - ASIC specific immutable properties.
  * @hw_queues_props: H/W queues properties.
@@ -392,6 +402,10 @@ struct hl_mmu_properties {
  * @pmmu: PCI (host) MMU address translation properties.
  * @pmmu_huge: PCI (host) MMU address translation properties for memory
  *              allocated with huge pages.
+ * @hints_dram_reserved_va_range: dram hint addresses reserved range.
+ * @hints_host_reserved_va_range: host hint addresses reserved range.
+ * @hints_host_hpage_reserved_va_range: host huge page hint addresses reserved
+ *                                      range.
  * @sram_base_address: SRAM physical start address.
  * @sram_end_address: SRAM physical end address.
  * @sram_user_base_address - SRAM physical start address for user access.
@@ -412,6 +426,8 @@ struct hl_mmu_properties {
  *                    to the device's MMU.
  * @cb_va_end_addr: virtual end address of command buffers which are mapped to
  *                  the device's MMU.
+ * @dram_hints_align_mask: dram va hint addresses alignment mask which is used
+ *                  for hints validity check.
  * @mmu_pgt_size: MMU page tables total size.
  * @mmu_pte_size: PTE size in MMU page tables.
  * @mmu_hop_table_size: MMU hop table size.
@@ -470,6 +486,7 @@ struct hl_mmu_properties {
  * @dram_supports_virtual_memory: is there an MMU towards the DRAM
  * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
  * @num_functional_hbms: number of functional HBMs in each DCORE.
+ * @hints_range_reservation: device support hint addresses range reservation.
  * @iatu_done_by_fw: true if iATU configuration is being done by FW.
  * @dynamic_fw_load: is dynamic FW load is supported.
  * @gic_interrupts_enable: true if FW is not blocking GIC controller,
@@ -483,6 +500,9 @@ struct asic_fixed_properties {
 	struct hl_mmu_properties	dmmu;
 	struct hl_mmu_properties	pmmu;
 	struct hl_mmu_properties	pmmu_huge;
+	struct hl_hints_range		hints_dram_reserved_va_range;
+	struct hl_hints_range		hints_host_reserved_va_range;
+	struct hl_hints_range		hints_host_hpage_reserved_va_range;
 	u64				sram_base_address;
 	u64				sram_end_address;
 	u64				sram_user_base_address;
@@ -500,6 +520,7 @@ struct asic_fixed_properties {
 	u64				mmu_dram_default_page_addr;
 	u64				cb_va_start_addr;
 	u64				cb_va_end_addr;
+	u64				dram_hints_align_mask;
 	u32				mmu_pgt_size;
 	u32				mmu_pte_size;
 	u32				mmu_hop_table_size;
@@ -542,6 +563,7 @@ struct asic_fixed_properties {
 	u8				dram_supports_virtual_memory;
 	u8				hard_reset_done_by_fw;
 	u8				num_functional_hbms;
+	u8				hints_range_reservation;
 	u8				iatu_done_by_fw;
 	u8				dynamic_fw_load;
 	u8				gic_interrupts_enable;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index af339ce1ab4f2b..d9429b98313af2 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -528,6 +528,33 @@ static inline int add_va_block(struct hl_device *hdev,
 	return rc;
 }
 
+/**
+ * is_hint_crossing_range() - check if hint address crossing specified reserved
+ * range.
+ */
+static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
+		u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
+	bool range_cross;
+
+	if (range_type == HL_VA_RANGE_TYPE_DRAM)
+		range_cross =
+			hl_mem_area_crosses_range(start_addr, size,
+			prop->hints_dram_reserved_va_range.start_addr,
+			prop->hints_dram_reserved_va_range.end_addr);
+	else if (range_type == HL_VA_RANGE_TYPE_HOST)
+		range_cross =
+			hl_mem_area_crosses_range(start_addr,	size,
+			prop->hints_host_reserved_va_range.start_addr,
+			prop->hints_host_reserved_va_range.end_addr);
+	else
+		range_cross =
+			hl_mem_area_crosses_range(start_addr, size,
+			prop->hints_host_hpage_reserved_va_range.start_addr,
+			prop->hints_host_hpage_reserved_va_range.end_addr);
+
+	return range_cross;
+}
+
 /**
  * get_va_block() - get a virtual block for the given size and alignment.
  *
@@ -536,6 +563,7 @@ static inline int add_va_block(struct hl_device *hdev,
  * @size: requested block size.
  * @hint_addr: hint for requested address by the user.
  * @va_block_align: required alignment of the virtual block start address.
+ * @range_type: va range type (host, dram)
  *
  * This function does the following:
  * - Iterate on the virtual block list to find a suitable virtual block for the
@@ -545,13 +573,17 @@ static inline int add_va_block(struct hl_device *hdev,
  */
 static u64 get_va_block(struct hl_device *hdev,
 				struct hl_va_range *va_range,
-				u64 size, u64 hint_addr, u32 va_block_align)
+				u64 size, u64 hint_addr, u32 va_block_align,
+				enum hl_va_range_type range_type)
 {
 	struct hl_vm_va_block *va_block, *new_va_block = NULL;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
-		align_mask, reserved_valid_start = 0, reserved_valid_size = 0;
+		align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
+		dram_hint_mask = prop->dram_hints_align_mask;
 	bool add_prev = false;
 	bool is_align_pow_2  = is_power_of_2(va_range->page_size);
+	bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
 
 	if (is_align_pow_2)
 		align_mask = ~((u64)va_block_align - 1);
@@ -564,12 +596,12 @@ static u64 get_va_block(struct hl_device *hdev,
 		size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
 							va_range->page_size;
 
-	tmp_hint_addr = hint_addr;
+	tmp_hint_addr = hint_addr & ~dram_hint_mask;
 
 	/* Check if we need to ignore hint address */
 	if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
-			(!is_align_pow_2 &&
-				do_div(tmp_hint_addr, va_range->page_size))) {
+		(!is_align_pow_2 && is_hint_dram_addr &&
+			do_div(tmp_hint_addr, va_range->page_size))) {
 
 		dev_dbg(hdev->dev,
 			"Hint address 0x%llx will be ignored because it is not aligned\n",
@@ -596,6 +628,16 @@ static u64 get_va_block(struct hl_device *hdev,
 		if (valid_size < size)
 			continue;
 
+		/*
+		 * In case hint address is 0, and arc_hints_range_reservation
+		 * property enabled, then avoid allocating va blocks from the
+		 * range reserved for hint addresses
+		 */
+		if (prop->hints_range_reservation && !hint_addr)
+			if (is_hint_crossing_range(range_type, valid_start,
+					size, prop))
+				continue;
+
 		/* Pick the minimal length block which has the required size */
 		if (!new_va_block || (valid_size < reserved_valid_size)) {
 			new_va_block = va_block;
@@ -670,7 +712,7 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 		enum hl_va_range_type type, u32 size, u32 alignment)
 {
 	return get_va_block(hdev, ctx->va_range[type], size, 0,
-			max(alignment, ctx->va_range[type]->page_size));
+			max(alignment, ctx->va_range[type]->page_size), type);
 }
 
 /**
@@ -1006,6 +1048,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	u32 handle = 0, va_block_align;
 	int rc;
 	bool is_userptr = args->flags & HL_MEM_USERPTR;
+	enum hl_va_range_type va_range_type = 0;
 
 	/* Assume failure */
 	*device_addr = 0;
@@ -1038,7 +1081,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		/* get required alignment */
 		if (phys_pg_pack->page_size == page_size) {
 			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
-
+			va_range_type = HL_VA_RANGE_TYPE_HOST;
 			/*
 			 * huge page alignment may be needed in case of regular
 			 * page mapping, depending on the host VA alignment
@@ -1053,6 +1096,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 			 * mapping
 			 */
 			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
+			va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
 			va_block_align = huge_page_size;
 		}
 	} else {
@@ -1078,6 +1122,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 		/* DRAM VA alignment is the same as the MMU page size */
 		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
+		va_range_type = HL_VA_RANGE_TYPE_DRAM;
 		va_block_align = hdev->asic_prop.dmmu.page_size;
 	}
 
@@ -1101,7 +1146,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	}
 
 	ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
-					hint_addr, va_block_align);
+					hint_addr, va_block_align,
+					va_range_type);
 	if (!ret_vaddr) {
 		dev_err(hdev->dev, "no available va block for handle %u\n",
 				handle);

From 3c475019240872693b87cfde74e50d85bd39a116 Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Thu, 3 Jun 2021 17:51:58 +0300
Subject: [PATCH 02/55] habanalabs: allow fail on inability to respect hint

A new user flag is required to make the memory map hint mandatory, in
contrast to the current situation where it is best effort.
This is needed because certain data must be mapped to specific
pre-determined device virtual address ranges.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/memory.c | 45 +++++++++++++++++++++++--
 include/uapi/misc/habanalabs.h          |  1 +
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index d9429b98313af2..d54fdd04be5b4d 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -564,6 +564,7 @@ static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
  * @hint_addr: hint for requested address by the user.
  * @va_block_align: required alignment of the virtual block start address.
  * @range_type: va range type (host, dram)
+ * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
  *
  * This function does the following:
  * - Iterate on the virtual block list to find a suitable virtual block for the
@@ -574,7 +575,8 @@ static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
 static u64 get_va_block(struct hl_device *hdev,
 				struct hl_va_range *va_range,
 				u64 size, u64 hint_addr, u32 va_block_align,
-				enum hl_va_range_type range_type)
+				enum hl_va_range_type range_type,
+				u32 flags)
 {
 	struct hl_vm_va_block *va_block, *new_va_block = NULL;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -584,6 +586,7 @@ static u64 get_va_block(struct hl_device *hdev,
 	bool add_prev = false;
 	bool is_align_pow_2  = is_power_of_2(va_range->page_size);
 	bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
+	bool force_hint = flags & HL_MEM_FORCE_HINT;
 
 	if (is_align_pow_2)
 		align_mask = ~((u64)va_block_align - 1);
@@ -603,6 +606,15 @@ static u64 get_va_block(struct hl_device *hdev,
 		(!is_align_pow_2 && is_hint_dram_addr &&
 			do_div(tmp_hint_addr, va_range->page_size))) {
 
+		if (force_hint) {
+			/* Hint must be repected, so here we just fail.
+			 */
+			dev_err(hdev->dev,
+				"Hint address 0x%llx is not page aligned - cannot be respected\n",
+				hint_addr);
+			return 0;
+		}
+
 		dev_dbg(hdev->dev,
 			"Hint address 0x%llx will be ignored because it is not aligned\n",
 			hint_addr);
@@ -660,6 +672,17 @@ static u64 get_va_block(struct hl_device *hdev,
 		goto out;
 	}
 
+	if (force_hint && reserved_valid_start != hint_addr) {
+		/* Hint address must be respected. If we are here - this means
+		 * we could not respect it.
+		 */
+		dev_err(hdev->dev,
+			"Hint address 0x%llx could not be respected\n",
+			hint_addr);
+		reserved_valid_start = 0;
+		goto out;
+	}
+
 	/*
 	 * Check if there is some leftover range due to reserving the new
 	 * va block, then return it to the main virtual addresses list.
@@ -712,7 +735,8 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 		enum hl_va_range_type type, u32 size, u32 alignment)
 {
 	return get_va_block(hdev, ctx->va_range[type], size, 0,
-			max(alignment, ctx->va_range[type]->page_size), type);
+			max(alignment, ctx->va_range[type]->page_size),
+			type, 0);
 }
 
 /**
@@ -1145,9 +1169,24 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto hnode_err;
 	}
 
+	if (hint_addr && phys_pg_pack->offset) {
+		if (args->flags & HL_MEM_FORCE_HINT) {
+			/* If hint must be repected, since we can't - just fail.
+			 */
+			dev_err(hdev->dev,
+				"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
+				hint_addr, phys_pg_pack->offset);
+			rc = -EINVAL;
+			goto va_block_err;
+		}
+		dev_dbg(hdev->dev,
+			"Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
+			hint_addr, phys_pg_pack->offset);
+	}
+
 	ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
 					hint_addr, va_block_align,
-					va_range_type);
+					va_range_type, args->flags);
 	if (!ret_vaddr) {
 		dev_err(hdev->dev, "no available va block for handle %u\n",
 				handle);
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a47a731e452778..18765eb75b6590 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -813,6 +813,7 @@ union hl_wait_cs_args {
 #define HL_MEM_CONTIGUOUS	0x1
 #define HL_MEM_SHARED		0x2
 #define HL_MEM_USERPTR		0x4
+#define HL_MEM_FORCE_HINT	0x8
 
 struct hl_mem_in {
 	union {

From 2235ef41f0c4f5b9c96d0ed7be52f9963a849023 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Mon, 28 Jun 2021 11:21:56 +0300
Subject: [PATCH 03/55] habanalabs: update firmware header files

Sync the driver with recent changes made to the firmware header files,
which contain a minor COMMS protocol change and new error status
definitions.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  |  4 +--
 .../habanalabs/include/common/hl_boot_if.h    | 35 ++++++++++++++++---
 .../habanalabs/include/gaudi/gaudi_reg_map.h  |  2 --
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 2e4d04ec6b5396..bac25a60650f6e 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1953,8 +1953,8 @@ static void hl_fw_dynamic_update_linux_interrupt_if(struct hl_device *hdev)
 	if (!hdev->asic_prop.gic_interrupts_enable &&
 			!(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) {
-		dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_irq_ctrl;
-		dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_irq_ctrl;
+		dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_pi_upd_irq;
+		dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_pi_upd_irq;
 
 		dev_warn(hdev->dev,
 			"Using a single interrupt interface towards cpucp");
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index fa8a5ad2d438c7..d762bb2f12042f 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -78,6 +78,26 @@
  * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	Device is unusable and customer support
  *					should be contacted.
  *
+ * CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD	HALT ACK from ARC0 is not received
+ *					within specified retries after issuing
+ *					HALT request. ARC0 appears to be in bad
+ *					reset.
+ *
+ * CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD	HALT ACK from ARC1 is not received
+ *					within specified retries after issuing
+ *					HALT request. ARC1 appears to be in bad
+ *					reset.
+ *
+ * CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD	RUN ACK from ARC0 is not received
+ *					within specified timeout after issuing
+ *					RUN request. ARC0 appears to be in bad
+ *					reset.
+ *
+ * CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD	RUN ACK from ARC1 is not received
+ *					within specified timeout after issuing
+ *					RUN request. ARC1 appears to be in bad
+ *					reset.
+ *
  * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
  *					This is a main indication that the
  *					running FW populates the error
@@ -98,6 +118,10 @@
 #define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << 11)
 #define CPU_BOOT_ERR0_PLL_FAIL			(1 << 12)
 #define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << 13)
+#define CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD	(1 << 14)
+#define CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD	(1 << 15)
+#define CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD	(1 << 16)
+#define CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD	(1 << 17)
 #define CPU_BOOT_ERR0_ENABLED			(1 << 31)
 #define CPU_BOOT_ERR1_ENABLED			(1 << 31)
 
@@ -313,10 +337,7 @@ struct cpu_dyn_regs {
 	__le32 hw_state;
 	__le32 kmd_msg_to_cpu;
 	__le32 cpu_cmd_status_to_host;
-	union {
-		__le32 gic_host_irq_ctrl;
-		__le32 gic_host_pi_upd_irq;
-	};
+	__le32 gic_host_pi_upd_irq;
 	__le32 gic_tpc_qm_irq_ctrl;
 	__le32 gic_mme_qm_irq_ctrl;
 	__le32 gic_dma_qm_irq_ctrl;
@@ -462,6 +483,11 @@ struct lkd_fw_comms_msg {
  *				Do not wait for BMC response.
  *
  * COMMS_LOW_PLL_OPP		Initialize PLLs for low OPP.
+ *
+ * COMMS_PREP_DESC_ELBI		Same as COMMS_PREP_DESC only that the memory
+ *				space is allocated in a ELBI access only
+ *				address range.
+ *
  */
 enum comms_cmd {
 	COMMS_NOOP = 0,
@@ -474,6 +500,7 @@ enum comms_cmd {
 	COMMS_GOTO_WFE = 7,
 	COMMS_SKIP_BMC = 8,
 	COMMS_LOW_PLL_OPP = 9,
+	COMMS_PREP_DESC_ELBI = 10,
 	COMMS_INVLD_LAST
 };
 
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
index d95d4162ae2cfd..b9bd5a7f71eb62 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
@@ -12,8 +12,6 @@
  * PSOC scratch-pad registers
  */
 #define mmHW_STATE			mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
-/* TODO: remove mmGIC_HOST_IRQ_CTRL_POLL_REG */
-#define mmGIC_HOST_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
 #define mmGIC_HOST_PI_UPD_IRQ_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
 #define mmGIC_TPC_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_2
 #define mmGIC_MME_QM_IRQ_CTRL_POLL_REG	mmPSOC_GLOBAL_CONF_SCRATCHPAD_3

From 51f1eb619221b3a311bb19d68ca4ceb4cc34467a Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 29 Jun 2021 18:08:05 +0300
Subject: [PATCH 04/55] habanalabs: rename enum vm_type_t to vm_type

We don't use typedefs so the enum name shouldn't end with _t

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c    |  4 ++--
 drivers/misc/habanalabs/common/habanalabs.h |  8 ++++----
 drivers/misc/habanalabs/common/memory.c     | 14 ++++++--------
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 703d79fb6f3f5d..fd44c3b66d3bc5 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -235,7 +235,7 @@ static int vm_show(struct seq_file *s, void *data)
 	struct hl_vm_hash_node *hnode;
 	struct hl_userptr *userptr;
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
-	enum vm_type_t *vm_type;
+	enum vm_type *vm_type;
 	bool once = true;
 	u64 j;
 	int i;
@@ -492,7 +492,7 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
 	struct hl_ctx *ctx = hdev->compute_ctx;
 	struct hl_vm_hash_node *hnode;
 	struct hl_userptr *userptr;
-	enum vm_type_t *vm_type;
+	enum vm_type *vm_type;
 	bool valid = false;
 	u64 end_address;
 	u32 range_size;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 0edc72e678c9da..31fda9e41fe711 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -317,11 +317,11 @@ struct hw_queue_properties {
 };
 
 /**
- * enum vm_type_t - virtual memory mapping request information.
+ * enum vm_type - virtual memory mapping request information.
  * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
  * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address.
  */
-enum vm_type_t {
+enum vm_type {
 	VM_TYPE_USERPTR = 0x1,
 	VM_TYPE_PHYS_PACK = 0x2
 };
@@ -1421,7 +1421,7 @@ struct hl_ctx_mgr {
  * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
  */
 struct hl_userptr {
-	enum vm_type_t		vm_type; /* must be first */
+	enum vm_type		vm_type; /* must be first */
 	struct list_head	job_node;
 	struct page		**pages;
 	unsigned int		npages;
@@ -1635,7 +1635,7 @@ struct hl_vm_hw_block_list_node {
  * @created_from_userptr: is product of host virtual address.
  */
 struct hl_vm_phys_pg_pack {
-	enum vm_type_t		vm_type; /* must be first */
+	enum vm_type		vm_type; /* must be first */
 	u64			*pages;
 	u64			npages;
 	u64			total_size;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index d54fdd04be5b4d..57e980d1a84bcb 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -607,8 +607,7 @@ static u64 get_va_block(struct hl_device *hdev,
 			do_div(tmp_hint_addr, va_range->page_size))) {
 
 		if (force_hint) {
-			/* Hint must be repected, so here we just fail.
-			 */
+			/* Hint must be respected, so here we just fail */
 			dev_err(hdev->dev,
 				"Hint address 0x%llx is not page aligned - cannot be respected\n",
 				hint_addr);
@@ -1067,7 +1066,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	struct hl_userptr *userptr = NULL;
 	struct hl_vm_hash_node *hnode;
 	struct hl_va_range *va_range;
-	enum vm_type_t *vm_type;
+	enum vm_type *vm_type;
 	u64 ret_vaddr, hint_addr;
 	u32 handle = 0, va_block_align;
 	int rc;
@@ -1098,7 +1097,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 			goto init_page_pack_err;
 		}
 
-		vm_type = (enum vm_type_t *) userptr;
+		vm_type = (enum vm_type *) userptr;
 		hint_addr = args->map_host.hint_addr;
 		handle = phys_pg_pack->handle;
 
@@ -1140,7 +1139,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 		spin_unlock(&vm->idr_lock);
 
-		vm_type = (enum vm_type_t *) phys_pg_pack;
+		vm_type = (enum vm_type *) phys_pg_pack;
 
 		hint_addr = args->map_device.hint_addr;
 
@@ -1171,8 +1170,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 	if (hint_addr && phys_pg_pack->offset) {
 		if (args->flags & HL_MEM_FORCE_HINT) {
-			/* If hint must be repected, since we can't - just fail.
-			 */
+			/* Fail if hint must be respected but it can't be */
 			dev_err(hdev->dev,
 				"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
 				hint_addr, phys_pg_pack->offset);
@@ -1273,7 +1271,7 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	struct hl_userptr *userptr = NULL;
 	struct hl_va_range *va_range;
 	u64 vaddr = args->unmap.device_virt_addr;
-	enum vm_type_t *vm_type;
+	enum vm_type *vm_type;
 	bool is_userptr;
 	int rc = 0;
 

From f70e6d0b8f9eebd51d3fd1b91d4127b843d4004f Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 1 Jul 2021 11:42:53 +0300
Subject: [PATCH 05/55] habanalabs: re-init completion object upon retry

In case a user interrupt arrived but the completion value is less than
the target value, we want to retry the wait.

However, before the retry we must reinitialize the completion object,
under spin-lock, so the wait function won't exit immediately because
the completion object is already completed (from the previous
interrupt).

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 80c60fb41bbca5..12f20446e99a1f 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2046,7 +2046,8 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 		goto unlock_and_free_fence;
 	}
 
-	if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) {
+	if (copy_from_user(&completion_value, u64_to_user_ptr(user_address),
+									4)) {
 		dev_err(hdev->dev,
 			"Failed to copy completion value from user\n");
 		rc = -EFAULT;
@@ -2077,18 +2078,28 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	 * If comparison fails, keep waiting until timeout expires
 	 */
 	if (completion_rc > 0) {
+		spin_lock(&interrupt->wait_list_lock);
+
 		if (copy_from_user(&completion_value,
 				u64_to_user_ptr(user_address), 4)) {
+
+			spin_unlock(&interrupt->wait_list_lock);
+
 			dev_err(hdev->dev,
 				"Failed to copy completion value from user\n");
 			rc = -EFAULT;
+
 			goto remove_pending_user_interrupt;
 		}
 
 		if (completion_value >= target_value) {
+			spin_unlock(&interrupt->wait_list_lock);
 			*status = CS_WAIT_STATUS_COMPLETED;
 		} else {
+			reinit_completion(&pend->fence.completion);
 			timeout = completion_rc;
+
+			spin_unlock(&interrupt->wait_list_lock);
 			goto wait_again;
 		}
 	} else {

From cd2f6cb2d357c5a2225a8c2f64aa0405e95a2bc8 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 1 Jul 2021 10:09:28 +0300
Subject: [PATCH 06/55] habanalabs: release pending user interrupts on device
 fini

Device fini was missing a call to release all pending user
interrupts. That can cause a process to get stuck inside the driver's
wait-for-interrupt IOCTL in case the device is removed or the
simulator is killed at the same time.

In addition, the call to remove the inactive codec job was also missing.

Moreover, to prevent such errors in the future (where code is added
to the reset path but not to device fini), we moved some common parts
to two dedicated functions:
cleanup_resources
take_release_locks

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 100 +++++++++++-------------
 1 file changed, 46 insertions(+), 54 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index ff4cbde289c0b6..43d0dcbd20e313 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -866,6 +866,48 @@ static void device_disable_open_processes(struct hl_device *hdev)
 	mutex_unlock(&hdev->fpriv_list_lock);
 }
 
+static void take_release_locks(struct hl_device *hdev)
+{
+	/* Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	/* Flush anyone that is inside device open */
+	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_unlock(&hdev->fpriv_list_lock);
+}
+
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
+{
+	if (hard_reset) {
+		device_late_fini(hdev);
+
+		/*
+		 * Now that the heartbeat thread is closed, flush processes
+		 * which are sending messages to CPU
+		 */
+		mutex_lock(&hdev->send_cpu_message_lock);
+		mutex_unlock(&hdev->send_cpu_message_lock);
+	}
+
+	/*
+	 * Halt the engines and disable interrupts so we won't get any more
+	 * completions from H/W and we won't have any accesses from the
+	 * H/W to the host machine
+	 */
+	hdev->asic_funcs->halt_engines(hdev, hard_reset);
+
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
+	/* Release all pending user interrupts, each pending user interrupt
+	 * holds a reference to user context
+	 */
+	hl_release_pending_user_interrupts(hdev);
+}
+
 /*
  * hl_device_reset - reset the device
  *
@@ -970,15 +1012,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
 
-		/* Flush anyone that is inside the critical section of enqueue
-		 * jobs to the H/W
-		 */
-		hdev->asic_funcs->hw_queues_lock(hdev);
-		hdev->asic_funcs->hw_queues_unlock(hdev);
-
-		/* Flush anyone that is inside device open */
-		mutex_lock(&hdev->fpriv_list_lock);
-		mutex_unlock(&hdev->fpriv_list_lock);
+		take_release_locks(hdev);
 
 		dev_err(hdev->dev, "Going to RESET device!\n");
 	}
@@ -999,31 +1033,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		return 0;
 	}
 
-	if (hard_reset) {
-		device_late_fini(hdev);
-
-		/*
-		 * Now that the heartbeat thread is closed, flush processes
-		 * which are sending messages to CPU
-		 */
-		mutex_lock(&hdev->send_cpu_message_lock);
-		mutex_unlock(&hdev->send_cpu_message_lock);
-	}
-
-	/*
-	 * Halt the engines and disable interrupts so we won't get any more
-	 * completions from H/W and we won't have any accesses from the
-	 * H/W to the host machine
-	 */
-	hdev->asic_funcs->halt_engines(hdev, hard_reset);
-
-	/* Go over all the queues, release all CS and their jobs */
-	hl_cs_rollback_all(hdev);
-
-	/* Release all pending user interrupts, each pending user interrupt
-	 * holds a reference to user context
-	 */
-	hl_release_pending_user_interrupts(hdev);
+	cleanup_resources(hdev, hard_reset);
 
 kill_processes:
 	if (hard_reset) {
@@ -1567,31 +1577,13 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
-	/* Flush anyone that is inside the critical section of enqueue
-	 * jobs to the H/W
-	 */
-	hdev->asic_funcs->hw_queues_lock(hdev);
-	hdev->asic_funcs->hw_queues_unlock(hdev);
-
-	/* Flush anyone that is inside device open */
-	mutex_lock(&hdev->fpriv_list_lock);
-	mutex_unlock(&hdev->fpriv_list_lock);
+	take_release_locks(hdev);
 
 	hdev->hard_reset_pending = true;
 
 	hl_hwmon_fini(hdev);
 
-	device_late_fini(hdev);
-
-	/*
-	 * Halt the engines and disable interrupts so we won't get any more
-	 * completions from H/W and we won't have any accesses from the
-	 * H/W to the host machine
-	 */
-	hdev->asic_funcs->halt_engines(hdev, true);
-
-	/* Go over all the queues, release all CS and their jobs */
-	hl_cs_rollback_all(hdev);
+	cleanup_resources(hdev, true);
 
 	/* Kill processes here after CS rollback. This is because the process
 	 * can't really exit until all its CSs are done, which is what we

From 1e421d7ac8d5345e59d844c544a881ccfbc350d7 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 1 Jul 2021 10:36:16 +0300
Subject: [PATCH 07/55] habanalabs: handle case of interruptable wait

As in the regular wait for CS, we need to handle the case where the
wait for a user interrupt was interrupted by a signal. In that case,
we need to return the correct error code to the user.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_submission.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 12f20446e99a1f..997a37e782413e 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2102,6 +2102,12 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 			spin_unlock(&interrupt->wait_list_lock);
 			goto wait_again;
 		}
+	} else if (completion_rc == -ERESTARTSYS) {
+		dev_err_ratelimited(hdev->dev,
+			"user process got signal while waiting for interrupt ID %d\n",
+			interrupt->interrupt_id);
+		*status = HL_WAIT_CS_STATUS_INTERRUPTED;
+		rc = -EINTR;
 	} else {
 		*status = CS_WAIT_STATUS_BUSY;
 	}
@@ -2159,8 +2165,9 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	memset(args, 0, sizeof(*args));
 
 	if (rc) {
-		dev_err_ratelimited(hdev->dev,
-			"interrupt_wait_ioctl failed (%d)\n", rc);
+		if (rc != -EINTR)
+			dev_err_ratelimited(hdev->dev,
+				"interrupt_wait_ioctl failed (%d)\n", rc);
 
 		return rc;
 	}

From a48e84deb20568064b1ee3ff499e71b45ffbf53d Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 29 Jun 2021 18:12:46 +0300
Subject: [PATCH 08/55] habanalabs: user mappings can be 64-bit

Increase the size variable in the userptr structure to 64-bit. That
variable describes the size of the memory allocation of the user that
is now being mapped into the device. The mapping can be larger than
4GB, so we need to support it.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c    | 4 ++--
 drivers/misc/habanalabs/common/habanalabs.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index fd44c3b66d3bc5..77f7c2aa571da8 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -213,7 +213,7 @@ static int userptr_show(struct seq_file *s, void *data)
 			seq_puts(s, "----------------------------------------------------------\n");
 		}
 		seq_printf(s,
-			"    0x%-14llx      %-10u    %-30s\n",
+			"    0x%-14llx      %-10llu    %-30s\n",
 			userptr->addr, userptr->size, dma_dir[userptr->dir]);
 	}
 
@@ -261,7 +261,7 @@ static int vm_show(struct seq_file *s, void *data)
 			if (*vm_type == VM_TYPE_USERPTR) {
 				userptr = hnode->ptr;
 				seq_printf(s,
-					"    0x%-14llx      %-10u\n",
+					"    0x%-14llx      %-10llu\n",
 					hnode->vaddr, userptr->size);
 			} else {
 				phys_pg_pack = hnode->ptr;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 31fda9e41fe711..5ae95d2abaa6ad 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1429,7 +1429,7 @@ struct hl_userptr {
 	enum dma_data_direction dir;
 	struct list_head	debugfs_list;
 	u64			addr;
-	u32			size;
+	u64			size;
 	u8			dma_mapped;
 };
 

From d35c8793e6161188739c5187bb09147ae0611dc2 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 29 Jun 2021 18:23:41 +0300
Subject: [PATCH 09/55] habanalabs: allow disabling huge page use

Sometimes we may need to disable optimization of using huge pages
in our memory management code. Add such a flag to the function that
creates the list of physical pages that would be programmed into the
device MMU.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/memory.c | 36 ++++++++++++++++---------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 57e980d1a84bcb..a05d98db485762 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -819,6 +819,10 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
  * @ctx: pointer to the context structure.
  * @userptr: userptr to initialize from.
  * @pphys_pg_pack: result pointer.
+ * @force_regular_page: tell the function to ignore huge page optimization,
+ *                      even if possible. Needed for cases where the device VA
+ *                      is allocated before we know the composition of the
+ *                      physical pages
  *
  * This function does the following:
  * - Pin the physical pages related to the given virtual block.
@@ -827,17 +831,18 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
  */
 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 				struct hl_userptr *userptr,
-				struct hl_vm_phys_pg_pack **pphys_pg_pack)
+				struct hl_vm_phys_pg_pack **pphys_pg_pack,
+				bool force_regular_page)
 {
+	u32 npages, page_size = PAGE_SIZE,
+		huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
+	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	bool first = true, is_huge_page_opt;
+	u64 page_mask, total_npages;
 	struct scatterlist *sg;
 	dma_addr_t dma_addr;
-	u64 page_mask, total_npages;
-	u32 npages, page_size = PAGE_SIZE,
-		huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
-	bool first = true, is_huge_page_opt = true;
 	int rc, i, j;
-	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
 
 	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
 	if (!phys_pg_pack)
@@ -848,6 +853,8 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 	phys_pg_pack->asid = ctx->asid;
 	atomic_set(&phys_pg_pack->mapping_cnt, 1);
 
+	is_huge_page_opt = (force_regular_page ? false : true);
+
 	/* Only if all dma_addrs are aligned to 2MB and their
 	 * sizes is at least 2MB, we can use huge page mapping.
 	 * We limit the 2MB optimization to this condition,
@@ -1089,7 +1096,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		}
 
 		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
-				&phys_pg_pack);
+				&phys_pg_pack, false);
 		if (rc) {
 			dev_err(hdev->dev,
 				"unable to init page pack for vaddr 0x%llx\n",
@@ -1264,17 +1271,19 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 				bool ctx_free)
 {
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
+	u64 vaddr = args->unmap.device_virt_addr;
 	struct hl_vm_hash_node *hnode = NULL;
+	struct asic_fixed_properties *prop;
+	struct hl_device *hdev = ctx->hdev;
 	struct hl_userptr *userptr = NULL;
 	struct hl_va_range *va_range;
-	u64 vaddr = args->unmap.device_virt_addr;
 	enum vm_type *vm_type;
 	bool is_userptr;
 	int rc = 0;
 
+	prop = &hdev->asic_prop;
+
 	/* protect from double entrance */
 	mutex_lock(&ctx->mem_hash_lock);
 	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
@@ -1297,8 +1306,9 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	if (*vm_type == VM_TYPE_USERPTR) {
 		is_userptr = true;
 		userptr = hnode->ptr;
-		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
-							&phys_pg_pack);
+
+		rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
+							false);
 		if (rc) {
 			dev_err(hdev->dev,
 				"unable to init page pack for vaddr 0x%llx\n",
@@ -1382,7 +1392,7 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	kfree(hnode);
 
 	if (is_userptr) {
-		rc = free_phys_pg_pack(hdev, phys_pg_pack);
+		free_phys_pg_pack(hdev, phys_pg_pack);
 		dma_unmap_host_va(hdev, userptr);
 	}
 

From 363573837e3c27db3bca9019c54835a495b9bb2b Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sat, 3 Jul 2021 11:50:32 +0300
Subject: [PATCH 10/55] habanalabs: use get_task_pid() to take PID

The previous function we used, find_get_pid(), wasn't good in case
the user process was run inside docker.

As a result, we didn't have the PID and we couldn't kill the user
process in case the device got stuck and we needed to reset the
device.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c         | 6 +++++-
 drivers/misc/habanalabs/common/habanalabs_drv.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 43d0dcbd20e313..45070e891544d9 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -7,11 +7,11 @@
 
 #define pr_fmt(fmt)			"habanalabs: " fmt
 
+#include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 
 #include <linux/pci.h>
 #include <linux/hwmon.h>
-#include <uapi/misc/habanalabs.h>
 
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
@@ -819,6 +819,10 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 			usleep_range(1000, 10000);
 
 			put_task_struct(task);
+		} else {
+			dev_warn(hdev->dev,
+				"Can't get task struct for PID so giving up on killing process\n");
+			return -ETIME;
 		}
 	}
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 4194cda2d04c39..bffca119946b33 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -141,7 +141,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hl_cb_mgr_init(&hpriv->cb_mgr);
 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
 
-	hpriv->taskpid = find_get_pid(current->pid);
+	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
 
 	mutex_lock(&hdev->fpriv_list_lock);
 

From ab8e4180b2ce7e527b62e6cad3e4e5cc014fcd20 Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Sun, 6 Jun 2021 10:28:51 +0300
Subject: [PATCH 11/55] habanalabs: expose state dump

To improve the user's ability to debug the case where a workload that
is part of executing training/inference of a topology is getting stuck,
we need to add a 'core dump' each time a CS times-out. The 'core dump'
shall contain all relevant Sync Manager information and corresponding
fence values.

The most recent dumps shall be accessible via debugfs, under
'state_dump' node. Reading from the node will provide the newest dump
available. Writing an integer value X will discard X dumps, starting
with the newest one, i.e. subsequent read will now return older
dumps.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../ABI/testing/debugfs-driver-habanalabs     |  11 +
 drivers/misc/habanalabs/common/Makefile       |   3 +-
 .../habanalabs/common/command_submission.c    |   4 +
 drivers/misc/habanalabs/common/debugfs.c      |  93 ++++
 drivers/misc/habanalabs/common/device.c       |   2 +
 drivers/misc/habanalabs/common/habanalabs.h   | 132 +++++
 drivers/misc/habanalabs/common/state_dump.c   | 452 ++++++++++++++++++
 drivers/misc/habanalabs/gaudi/gaudi.c         |  24 +-
 drivers/misc/habanalabs/goya/goya.c           |  24 +-
 9 files changed, 742 insertions(+), 3 deletions(-)
 create mode 100644 drivers/misc/habanalabs/common/state_dump.c

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index a5c28f60686557..e29156511388e2 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -215,6 +215,17 @@ Description:    Sets the skip reset on timeout option for the device. Value of
                 "0" means device will be reset in case some CS has timed out,
                 otherwise it will not be reset.
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/state_dump
+Date:           Oct 2021
+KernelVersion:  5.15
+Contact:        ynudelman@habana.ai
+Description:    Gets the state dump occurring on a CS timeout or failure.
+                State dump is used for debug and is created each time in case of
+                a problem in a CS execution, before reset.
+                Reading from the node returns the newest state dump available.
+                Writing an integer X discards X state dumps, so that the
+                next read would return X+1-st newest state dump.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile
index 5d8b48288cf4d6..6ebe3c7001ffe3 100644
--- a/drivers/misc/habanalabs/common/Makefile
+++ b/drivers/misc/habanalabs/common/Makefile
@@ -10,4 +10,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
 		common/asid.o common/habanalabs_ioctl.o \
 		common/command_buffer.o common/hw_queue.o common/irq.o \
 		common/sysfs.o common/hwmon.o common/memory.o \
-		common/command_submission.o common/firmware_if.o
+		common/command_submission.o common/firmware_if.o \
+		common/state_dump.o
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 997a37e782413e..a0846880400cea 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -621,6 +621,10 @@ static void cs_timedout(struct work_struct *work)
 		break;
 	}
 
+	rc = hl_state_dump(hdev);
+	if (rc)
+		dev_err(hdev->dev, "Error during system state dump %d\n", rc);
+
 	cs_put(cs);
 
 	if (likely(!skip_reset_on_timeout)) {
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 77f7c2aa571da8..51744e42b808fb 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1043,6 +1043,60 @@ static ssize_t hl_security_violations_read(struct file *f, char __user *buf,
 	return 0;
 }
 
+static ssize_t hl_state_dump_read(struct file *f, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	ssize_t rc;
+
+	down_read(&entry->state_dump_sem);
+	if (!entry->state_dump[entry->state_dump_head])
+		rc = 0;
+	else
+		rc = simple_read_from_buffer(
+			buf, count, ppos,
+			entry->state_dump[entry->state_dump_head],
+			strlen(entry->state_dump[entry->state_dump_head]));
+	up_read(&entry->state_dump_sem);
+
+	return rc;
+}
+
+static ssize_t hl_state_dump_write(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	ssize_t rc;
+	u32 size, i;
+
+	rc = kstrtouint_from_user(buf, count, 10, &size);
+	if (rc)
+		return rc;
+
+	if (!size || size >= ARRAY_SIZE(entry->state_dump)) {
+		dev_err(hdev->dev, "Invalid number of dumps to skip\n");
+		return -EINVAL;
+	}
+
+	/* Check the head slot under the lock to avoid racing a new dump */
+	down_write(&entry->state_dump_sem);
+	if (entry->state_dump[entry->state_dump_head]) {
+		for (i = 0; i < size; ++i) {
+			vfree(entry->state_dump[entry->state_dump_head]);
+			entry->state_dump[entry->state_dump_head] = NULL;
+			if (entry->state_dump_head > 0)
+				entry->state_dump_head--;
+			else
+				entry->state_dump_head =
+					ARRAY_SIZE(entry->state_dump) - 1;
+		}
+	}
+	up_write(&entry->state_dump_sem);
+
+	return count;
+}
+
 static const struct file_operations hl_data32b_fops = {
 	.owner = THIS_MODULE,
 	.read = hl_data_read32,
@@ -1110,6 +1164,12 @@ static const struct file_operations hl_security_violations_fops = {
 	.read = hl_security_violations_read
 };
 
+static const struct file_operations hl_state_dump_fops = {
+	.owner = THIS_MODULE,
+	.read = hl_state_dump_read,
+	.write = hl_state_dump_write
+};
+
 static const struct hl_info_list hl_debugfs_list[] = {
 	{"command_buffers", command_buffers_show, NULL},
 	{"command_submission", command_submission_show, NULL},
@@ -1172,6 +1232,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 	INIT_LIST_HEAD(&dev_entry->userptr_list);
 	INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list);
 	mutex_init(&dev_entry->file_mutex);
+	init_rwsem(&dev_entry->state_dump_sem);
 	spin_lock_init(&dev_entry->cb_spinlock);
 	spin_lock_init(&dev_entry->cs_spinlock);
 	spin_lock_init(&dev_entry->cs_job_spinlock);
@@ -1283,6 +1344,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry->root,
 				&hdev->skip_reset_on_timeout);
 
+	debugfs_create_file("state_dump",
+				0600,
+				dev_entry->root,
+				dev_entry,
+				&hl_state_dump_fops);
+
 	for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
 		debugfs_create_file(hl_debugfs_list[i].name,
 					0444,
@@ -1297,6 +1364,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 void hl_debugfs_remove_device(struct hl_device *hdev)
 {
 	struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
+	int i;
 
 	debugfs_remove_recursive(entry->root);
 
@@ -1304,6 +1372,9 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
 
 	vfree(entry->blob_desc.data);
 
+	for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i)
+		vfree(entry->state_dump[i]);
+
 	kfree(entry->entry_arr);
 }
 
@@ -1416,6 +1487,28 @@ void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
 	spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
 }
 
+/**
+ * hl_debugfs_set_state_dump - register state dump making it accessible via
+ *                             debugfs
+ * @hdev: pointer to the device structure
+ * @data: the actual dump data
+ * @length: the length of the data
+ */
+void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
+					unsigned long length)
+{
+	struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
+
+	down_write(&dev_entry->state_dump_sem);
+
+	dev_entry->state_dump_head = (dev_entry->state_dump_head + 1) %
+					ARRAY_SIZE(dev_entry->state_dump);
+	vfree(dev_entry->state_dump[dev_entry->state_dump_head]);
+	dev_entry->state_dump[dev_entry->state_dump_head] = data;
+
+	up_write(&dev_entry->state_dump_sem);
+}
+
 void __init hl_debugfs_init(void)
 {
 	hl_debug_root = debugfs_create_dir("habanalabs", NULL);
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 45070e891544d9..86426052a191d1 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1375,6 +1375,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	hdev->compute_ctx = NULL;
 
+	hdev->asic_funcs->state_dump_init(hdev);
+
 	hl_debugfs_add_device(hdev);
 
 	/* debugfs nodes are created in hl_ctx_init so it must be called after
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 5ae95d2abaa6ad..c1bb175d004b03 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -20,6 +20,7 @@
 #include <linux/scatterlist.h>
 #include <linux/hashtable.h>
 #include <linux/debugfs.h>
+#include <linux/rwsem.h>
 #include <linux/bitfield.h>
 #include <linux/genalloc.h>
 #include <linux/sched/signal.h>
@@ -65,6 +66,11 @@
 
 #define HL_COMMON_USER_INTERRUPT_ID	0xFFF
 
+#define HL_STATE_DUMP_HIST_LEN		5
+
+#define OBJ_NAMES_HASH_TABLE_BITS	7 /* 1 << 7 buckets */
+#define SYNC_TO_ENGINE_HASH_TABLE_BITS	7 /* 1 << 7 buckets */
+
 /* Memory */
 #define MEM_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
 
@@ -1123,6 +1129,7 @@ struct fw_load_mgr {
  *                         generic f/w compatible PLL Indexes
  * @init_firmware_loader: initialize data for FW loader.
  * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
+ * @state_dump_init: initialize constants required for state dump
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1248,6 +1255,7 @@ struct hl_asic_funcs {
 	int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
 	void (*init_firmware_loader)(struct hl_device *hdev);
 	void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
+	void (*state_dump_init)(struct hl_device *hdev);
 };
 
 
@@ -1781,9 +1789,12 @@ struct hl_debugfs_entry {
  * @ctx_mem_hash_list: list of available contexts with MMU mappings.
  * @ctx_mem_hash_spinlock: protects cb_list.
  * @blob_desc: descriptor of blob
+ * @state_dump: data of the system states in case of a bad cs.
+ * @state_dump_sem: protects state_dump.
  * @addr: next address to read/write from/to in read/write32.
  * @mmu_addr: next virtual address to translate to physical address in mmu_show.
  * @mmu_asid: ASID to use while translating in mmu_show.
+ * @state_dump_head: index of the latest state dump
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
  * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read.
  * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read.
@@ -1805,14 +1816,117 @@ struct hl_dbg_device_entry {
 	struct list_head		ctx_mem_hash_list;
 	spinlock_t			ctx_mem_hash_spinlock;
 	struct debugfs_blob_wrapper	blob_desc;
+	char				*state_dump[HL_STATE_DUMP_HIST_LEN];
+	struct rw_semaphore		state_dump_sem;
 	u64				addr;
 	u64				mmu_addr;
 	u32				mmu_asid;
+	u32				state_dump_head;
 	u8				i2c_bus;
 	u8				i2c_addr;
 	u8				i2c_reg;
 };
 
+/**
+ * struct hl_hw_obj_name_entry - single hw object name, member of
+ * hl_state_dump_specs
+ * @node: link to the containing hash table
+ * @name: hw object name
+ * @id: object identifier
+ */
+struct hl_hw_obj_name_entry {
+	struct hlist_node	node;
+	const char		*name;
+	u32			id;
+};
+
+enum hl_state_dump_specs_props {
+	SP_SYNC_OBJ_BASE_ADDR,
+	SP_NEXT_SYNC_OBJ_ADDR,
+	SP_SYNC_OBJ_AMOUNT,
+	SP_MON_OBJ_WR_ADDR_LOW,
+	SP_MON_OBJ_WR_ADDR_HIGH,
+	SP_MON_OBJ_WR_DATA,
+	SP_MON_OBJ_ARM_DATA,
+	SP_MON_OBJ_STATUS,
+	SP_MONITORS_AMOUNT,
+	SP_TPC0_CMDQ,
+	SP_TPC0_CFG_SO,
+	SP_NEXT_TPC,
+	SP_MME_CMDQ,
+	SP_MME_CFG_SO,
+	SP_NEXT_MME,
+	SP_DMA_CMDQ,
+	SP_DMA_CFG_SO,
+	SP_DMA_QUEUES_OFFSET,
+	SP_NUM_OF_MME_ENGINES,
+	SP_SUB_MME_ENG_NUM,
+	SP_NUM_OF_DMA_ENGINES,
+	SP_NUM_OF_TPC_ENGINES,
+	SP_ENGINE_NUM_OF_QUEUES,
+	SP_ENGINE_NUM_OF_STREAMS,
+	SP_ENGINE_NUM_OF_FENCES,
+	SP_FENCE0_CNT_OFFSET,
+	SP_FENCE0_RDATA_OFFSET,
+	SP_CP_STS_OFFSET,
+	SP_NUM_CORES,
+
+	SP_MAX
+};
+
+enum hl_sync_engine_type {
+	ENGINE_TPC,
+	ENGINE_DMA,
+	ENGINE_MME,
+};
+
+/**
+ * struct hl_sync_to_engine_map_entry - sync object id to engine mapping entry
+ * @engine_type: type of the engine
+ * @engine_id: id of the engine
+ * @sync_id: id of the sync object
+ */
+struct hl_sync_to_engine_map_entry {
+	struct hlist_node		node;
+	enum hl_sync_engine_type	engine_type;
+	u32				engine_id;
+	u32				sync_id;
+};
+
+/**
+ * struct hl_sync_to_engine_map - maps sync object id to associated engine id
+ * @tb: hash table containing the mapping, each element is of type
+ *      struct hl_sync_to_engine_map_entry
+ */
+struct hl_sync_to_engine_map {
+	DECLARE_HASHTABLE(tb, SYNC_TO_ENGINE_HASH_TABLE_BITS);
+};
+
+/**
+ * struct hl_state_dump_specs_funcs - virtual functions used by the state dump
+ * @gen_sync_to_engine_map: generate a hash map from sync obj id to its engine
+ */
+struct hl_state_dump_specs_funcs {
+	int (*gen_sync_to_engine_map)(struct hl_device *hdev,
+				struct hl_sync_to_engine_map *map);
+};
+
+/**
+ * struct hl_state_dump_specs - defines ASIC known hw objects names
+ * @so_id_to_str_tb: sync objects names index table
+ * @monitor_id_to_str_tb: monitors names index table
+ * @funcs: virtual functions used for state dump
+ * @sync_namager_names: readable names for sync manager if available (ex: N_E)
+ * @props: pointer to a per asic const props array required for state dump
+ */
+struct hl_state_dump_specs {
+	DECLARE_HASHTABLE(so_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS);
+	DECLARE_HASHTABLE(monitor_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS);
+	struct hl_state_dump_specs_funcs	funcs;
+	const char * const			*sync_namager_names;
+	s64					*props;
+};
+
 
 /*
  * DEVICES
@@ -2151,6 +2265,7 @@ struct hl_mmu_funcs {
  * @mmu_func: device-related MMU functions.
  * @fw_loader: FW loader manager.
  * @pci_mem_region: array of memory regions in the PCI
+ * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2295,6 +2410,8 @@ struct hl_device {
 
 	struct pci_mem_region		pci_mem_region[PCI_REGION_NUMBER];
 
+	struct hl_state_dump_specs	state_dump_specs;
+
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
@@ -2676,6 +2793,14 @@ void hl_release_pending_user_interrupts(struct hl_device *hdev);
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 			struct hl_hw_sob **hw_sob, u32 count);
 
+int hl_state_dump(struct hl_device *hdev);
+const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id);
+void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map);
+__printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
+					const char *format, ...);
+char *hl_format_as_binary(char *buf, size_t buf_len, u32 n);
+const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type);
+
 #ifdef CONFIG_DEBUG_FS
 
 void hl_debugfs_init(void);
@@ -2695,6 +2820,8 @@ void hl_debugfs_remove_userptr(struct hl_device *hdev,
 				struct hl_userptr *userptr);
 void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
+void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
+					unsigned long length);
 
 #else
 
@@ -2768,6 +2895,11 @@ static inline void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev,
 {
 }
 
+static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
+					char *data, unsigned long length)
+{
+}
+
 #endif
 
 /* IOCTLs */
diff --git a/drivers/misc/habanalabs/common/state_dump.c b/drivers/misc/habanalabs/common/state_dump.c
new file mode 100644
index 00000000000000..a546ea613536f1
--- /dev/null
+++ b/drivers/misc/habanalabs/common/state_dump.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2021 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+/**
+ * hl_format_as_binary - helper function, format an integer as binary
+ *                       using supplied scratch buffer
+ * @buf: the buffer to use
+ * @buf_len: buffer capacity, including room for the '\0' terminator
+ * @n: number to format
+ *
+ * Returns pointer to buffer; the output is truncated to fit in @buf_len
+ */
+char *hl_format_as_binary(char *buf, size_t buf_len, u32 n)
+{
+	int i;
+	u32 bit;
+	bool leading0 = true;
+	char *wrptr = buf;
+
+	if (buf_len < 3) {
+		if (buf_len)
+			*wrptr = '\0';
+		return buf;
+	}
+
+	wrptr[0] = '0';
+	wrptr[1] = 'b';
+	wrptr += 2;
+	/* Reserve 3 chars for '0b' and '\0', the rest limits the digits */
+	buf_len -= 3;
+
+	for (i = 0; i < sizeof(n) * BITS_PER_BYTE && buf_len; ++i, n <<= 1) {
+		/* Writing bit calculation in one line would cause a false
+		 * positive static code analysis error, so splitting.
+		 */
+		bit = n & (1U << (sizeof(n) * BITS_PER_BYTE - 1));
+		bit = !!bit;
+		leading0 &= !bit;
+		if (!leading0) {
+			*wrptr++ = '0' + bit;
+			--buf_len;
+		}
+	}
+
+	*wrptr = '\0';
+	return buf;
+}
+
+/**
+ * resize_to_fit - helper function, resize buffer to fit given amount of data
+ * @buf: destination buffer double pointer
+ * @size: pointer to the size container
+ * @desired_size: size the buffer must contain
+ *
+ * Returns 0 on success or error code on failure.
+ * On success, the size of buffer is at least desired_size. Buffer is allocated
+ * via vmalloc and must be freed with vfree.
+ */
+static int resize_to_fit(char **buf, size_t *size, size_t desired_size)
+{
+	char *resized_buf;
+	size_t new_size;
+
+	if (*size >= desired_size)
+		return 0;
+
+	/* Not enough space to print all, have to resize */
+	new_size = max_t(size_t, PAGE_SIZE, round_up(desired_size, PAGE_SIZE));
+	resized_buf = vmalloc(new_size);
+	if (!resized_buf)
+		return -ENOMEM;
+	memcpy(resized_buf, *buf, *size);
+	vfree(*buf);
+	*buf = resized_buf;
+	*size = new_size;
+
+	return 1;
+}
+
+/**
+ * hl_snprintf_resize() - print formatted data to buffer, resize as needed
+ * @buf: buffer double pointer, to be written to and resized, must be either
+ *       NULL or allocated with vmalloc.
+ * @size: current size of the buffer
+ * @offset: current offset to write to
+ * @format: format of the data
+ *
+ * This function will write formatted data into the buffer. If buffer is not
+ * large enough, it will be resized using vmalloc. Size may be modified if the
+ * buffer was resized, offset will be advanced by the number of bytes written
+ * not including the terminating character
+ *
+ * Returns 0 on success or error code on failure
+ *
+ * Note that the buffer has to be manually released using vfree.
+ */
+int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
+			   const char *format, ...)
+{
+	va_list args;
+	size_t length;
+	int rc;
+
+	if (*buf == NULL && (*size != 0 || *offset != 0))
+		return -EINVAL;
+
+	va_start(args, format);
+	length = vsnprintf(*buf + *offset, *size - *offset, format, args);
+	va_end(args);
+
+	rc = resize_to_fit(buf, size, *offset + length + 1);
+	if (rc < 0)
+		return rc;
+	else if (rc > 0) {
+		/* Resize was needed, write again */
+		va_start(args, format);
+		length = vsnprintf(*buf + *offset, *size - *offset, format,
+				   args);
+		va_end(args);
+	}
+
+	*offset += length;
+
+	return 0;
+}
+
+/**
+ * hl_sync_engine_to_string - convert engine type enum to string literal
+ * @engine_type: engine type (TPC/MME/DMA)
+ *
+ * Return the resolved string literal
+ */
+const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type)
+{
+	switch (engine_type) {
+	case ENGINE_DMA:
+		return "DMA";
+	case ENGINE_MME:
+		return "MME";
+	case ENGINE_TPC:
+		return "TPC";
+	}
+	return "Invalid Engine Type";
+}
+
+/**
+ * hl_print_resize_sync_engine - helper function, format engine name and ID
+ * using hl_snprintf_resize
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ * @engine_type: engine type (TPC/MME/DMA)
+ * @engine_id: engine numerical id
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_print_resize_sync_engine(char **buf, size_t *size, size_t *offset,
+				enum hl_sync_engine_type engine_type,
+				u32 engine_id)
+{
+	return hl_snprintf_resize(buf, size, offset, "%s%u",
+			hl_sync_engine_to_string(engine_type), engine_id);
+}
+
+/**
+ * hl_state_dump_get_sync_name - transform sync object id to name if available
+ * @hdev: pointer to the device
+ * @sync_id: sync object id
+ *
+ * Returns a name literal or NULL if not resolved.
+ * Note: returning NULL shall not be considered as a failure, as not all
+ * sync objects are named.
+ */
+const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct hl_hw_obj_name_entry *entry;
+
+	hash_for_each_possible(sds->so_id_to_str_tb, entry,
+				node, sync_id)
+		if (sync_id == entry->id)
+			return entry->name;
+
+	return NULL;
+}
+
+/**
+ * hl_state_dump_free_sync_to_engine_map - free sync object to engine map
+ * @map: sync object to engine map
+ *
+ * Note: generic free implementation, the allocation is implemented per ASIC.
+ */
+void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map)
+{
+	struct hl_sync_to_engine_map_entry *entry;
+	struct hlist_node *tmp_node;
+	int i;
+
+	hash_for_each_safe(map->tb, i, tmp_node, entry, node) {
+		hash_del(&entry->node);
+		kfree(entry);
+	}
+}
+
+/**
+ * hl_state_dump_get_sync_to_engine - transform sync_id to
+ * hl_sync_to_engine_map_entry if available for current id
+ * @map: sync object to engine map
+ * @sync_id: sync object id
+ *
+ * Returns the translation entry if found or NULL if not.
+ * Note, returned NULL shall not be considered as a failure as the map
+ * is best effort and does not cover all possible sync ids.
+ */
+static struct hl_sync_to_engine_map_entry *
+hl_state_dump_get_sync_to_engine(struct hl_sync_to_engine_map *map, u32 sync_id)
+{
+	struct hl_sync_to_engine_map_entry *entry;
+
+	hash_for_each_possible(map->tb, entry, node, sync_id)
+		if (entry->sync_id == sync_id)
+			return entry;
+	return NULL;
+}
+
+/**
+ * hl_state_dump_read_sync_objects - read sync objects array
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ *
+ * Returns array of size SP_SYNC_OBJ_AMOUNT on success or NULL on failure
+ */
+static u32 *hl_state_dump_read_sync_objects(struct hl_device *hdev, u32 index)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	u32 *sync_objects;
+	s64 base_addr; /* Base addr can be negative */
+	int i;
+
+	base_addr =
+		sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+		sds->props[SP_NEXT_SYNC_OBJ_ADDR] *
+		index;
+
+	sync_objects = vmalloc(
+		sds->props[SP_SYNC_OBJ_AMOUNT] *
+		sizeof(u32));
+	if (!sync_objects)
+		return NULL;
+
+	for (i = 0;
+		i < sds->props[SP_SYNC_OBJ_AMOUNT];
+		++i) {
+		sync_objects[i] =
+		RREG32(base_addr + i * sizeof(u32));
+	}
+
+	return sync_objects;
+}
+
+/**
+ * hl_state_dump_free_sync_objects - free sync objects array allocated by
+ * hl_state_dump_read_sync_objects
+ * @sync_objects: sync objects array
+ */
+static void hl_state_dump_free_sync_objects(u32 *sync_objects)
+{
+	vfree(sync_objects);
+}
+
+
+/**
+ * hl_state_dump_print_syncs_single_block - print active sync objects on a
+ * single block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ * @map: sync engines names map
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int
+hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index,
+				char **buf, size_t *size, size_t *offset,
+				struct hl_sync_to_engine_map *map)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	const char *sync_name;
+	u32 *sync_objects = NULL;
+	int rc = 0, i;
+
+	if (sds->sync_namager_names) {
+		rc = hl_snprintf_resize(
+			buf, size, offset, "%s\n",
+			sds->sync_namager_names[index]);
+		if (rc)
+			goto out;
+	}
+
+	sync_objects = hl_state_dump_read_sync_objects(hdev, index);
+	if (!sync_objects) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0;
+		i < sds->props[SP_SYNC_OBJ_AMOUNT];
+		++i) {
+		struct hl_sync_to_engine_map_entry *entry;
+		u64 sync_object_addr;
+
+		if (!sync_objects[i])
+			continue;
+
+		sync_object_addr =
+			sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+			sds->props[SP_NEXT_SYNC_OBJ_ADDR] *
+			index + i * sizeof(u32);
+
+		rc = hl_snprintf_resize(buf, size, offset, "sync id: %u", i);
+		if (rc)
+			goto free_sync_objects;
+		sync_name = hl_state_dump_get_sync_name(hdev, i);
+		if (sync_name) {
+			rc = hl_snprintf_resize(buf, size, offset, " %s",
+						sync_name);
+			if (rc)
+				goto free_sync_objects;
+		}
+		rc = hl_snprintf_resize(buf, size, offset, ", value: %u",
+					sync_objects[i]);
+		if (rc)
+			goto free_sync_objects;
+
+		/* Append engine string */
+		entry = hl_state_dump_get_sync_to_engine(map,
+			(u32)sync_object_addr);
+		if (entry) {
+			rc = hl_snprintf_resize(buf, size, offset, ", Engine: ");
+			if (rc)
+				goto free_sync_objects;
+			rc = hl_print_resize_sync_engine(buf, size, offset,
+						entry->engine_type,
+						entry->engine_id);
+			if (rc)
+				goto free_sync_objects;
+		}
+
+		rc = hl_snprintf_resize(buf, size, offset, "\n");
+		if (rc)
+			goto free_sync_objects;
+	}
+
+free_sync_objects:
+	hl_state_dump_free_sync_objects(sync_objects);
+out:
+	return rc;
+}
+
+/**
+ * hl_state_dump_print_syncs - print active sync objects
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_syncs(struct hl_device *hdev,
+					char **buf, size_t *size,
+					size_t *offset)
+
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct hl_sync_to_engine_map *map;
+	u32 index;
+	int rc = 0;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	rc = sds->funcs.gen_sync_to_engine_map(hdev, map);
+	if (rc)
+		goto free_map_mem;
+
+	rc = hl_snprintf_resize(buf, size, offset, "Non zero sync objects:\n");
+	if (rc)
+		goto out;
+
+	if (sds->sync_namager_names) {
+		for (index = 0; sds->sync_namager_names[index]; ++index) {
+			rc = hl_state_dump_print_syncs_single_block(
+				hdev, index, buf, size, offset, map);
+			if (rc)
+				goto out;
+		}
+	} else {
+		for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) {
+			rc = hl_state_dump_print_syncs_single_block(
+				hdev, index, buf, size, offset, map);
+			if (rc)
+				goto out;
+		}
+	}
+
+out:
+	hl_state_dump_free_sync_to_engine_map(map);
+free_map_mem:
+	kfree(map);
+
+	return rc;
+}
+
+/**
+ * hl_state_dump() - dump system state
+ * @hdev: pointer to device structure
+ */
+int hl_state_dump(struct hl_device *hdev)
+{
+	char *buf = NULL;
+	size_t offset = 0, size = 0;
+	int rc;
+
+	rc = hl_snprintf_resize(&buf, &size, &offset,
+				"Timestamp taken on: %llu\n\n",
+				ktime_to_ns(ktime_get()));
+	if (rc)
+		goto err;
+
+	rc = hl_state_dump_print_syncs(hdev, &buf, &size, &offset);
+	if (rc)
+		goto err;
+
+	hl_snprintf_resize(&buf, &size, &offset, "\n");
+
+	hl_debugfs_set_state_dump(hdev, buf, size);
+
+	return 0;
+err:
+	vfree(buf);
+	return rc;
+}
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index aa8a0ca5aca240..7f90f637d7f42e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -348,6 +348,8 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
 };
 
+static s64 gaudi_state_dump_specs_props[SP_MAX] = {0};
+
 struct ecc_info_extract_params {
 	u64 block_address;
 	u32 num_memories;
@@ -8977,6 +8979,25 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
 	}
 }
 
+static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
+				struct hl_sync_to_engine_map *map)
+{
+	/* Not implemented */
+	return 0;
+}
+
+
+static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
+	.gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map,
+};
+
+static void gaudi_state_dump_init(struct hl_device *hdev)
+{
+	/* Not implemented */
+	hdev->state_dump_specs.props = gaudi_state_dump_specs_props;
+	hdev->state_dump_specs.funcs = gaudi_state_dump_funcs;
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
 	.early_init = gaudi_early_init,
 	.early_fini = gaudi_early_fini,
@@ -9062,7 +9083,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.enable_events_from_fw = gaudi_enable_events_from_fw,
 	.map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
 	.init_firmware_loader = gaudi_init_firmware_loader,
-	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm
+	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
+	.state_dump_init = gaudi_state_dump_init
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 755e08cf2ecc82..2c3d642d31aba6 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -350,6 +350,8 @@ static u32 goya_all_events[] = {
 	GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E
 };
 
+static s64 goya_state_dump_specs_props[SP_MAX] = {0};
+
 static int goya_mmu_clear_pgt_range(struct hl_device *hdev);
 static int goya_mmu_set_dram_default_page(struct hl_device *hdev);
 static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
@@ -5524,6 +5526,25 @@ static int goya_map_pll_idx_to_fw_idx(u32 pll_idx)
 	}
 }
 
+static int goya_gen_sync_to_engine_map(struct hl_device *hdev,
+				struct hl_sync_to_engine_map *map)
+{
+	/* Not implemented */
+	return 0;
+}
+
+
+static struct hl_state_dump_specs_funcs goya_state_dump_funcs = {
+	.gen_sync_to_engine_map = goya_gen_sync_to_engine_map,
+};
+
+static void goya_state_dump_init(struct hl_device *hdev)
+{
+	/* Not implemented */
+	hdev->state_dump_specs.props = goya_state_dump_specs_props;
+	hdev->state_dump_specs.funcs = goya_state_dump_funcs;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5609,7 +5630,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.enable_events_from_fw = goya_enable_events_from_fw,
 	.map_pll_idx_to_fw_idx = goya_map_pll_idx_to_fw_idx,
 	.init_firmware_loader = goya_init_firmware_loader,
-	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram
+	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram,
+	.state_dump_init = goya_state_dump_init,
 };
 
 /*

From c9573bf437d95390e07a17d149a9b109d47da791 Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Wed, 9 Jun 2021 14:04:26 +0300
Subject: [PATCH 12/55] habanalabs: state dump monitors and fences
 infrastructure

With the state dump infrastructure in place, add the common code for
dumping monitors and fences, with stub callbacks for Gaudi and Goya.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h |  37 ++-
 drivers/misc/habanalabs/common/state_dump.c | 308 ++++++++++++++++++--
 drivers/misc/habanalabs/gaudi/gaudi.c       |  27 ++
 drivers/misc/habanalabs/goya/goya.c         |  27 ++
 4 files changed, 376 insertions(+), 23 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index c1bb175d004b03..9aedea471ebe76 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1880,6 +1880,24 @@ enum hl_sync_engine_type {
 	ENGINE_MME,
 };
 
+/**
+ * struct hl_mon_state_dump - represents a state dump of a single monitor
+ * @id: monitor id
+ * @wr_addr_low: address monitor will write to, low bits
+ * @wr_addr_high: address monitor will write to, high bits
+ * @wr_data: data monitor will write
+ * @arm_data: register value containing monitor configuration
+ * @status: monitor status
+ */
+struct hl_mon_state_dump {
+	u32		id;
+	u32		wr_addr_low;
+	u32		wr_addr_high;
+	u32		wr_data;
+	u32		arm_data;
+	u32		status;
+};
+
 /**
  * struct hl_sync_to_engine_map_entry - sync object id to engine mapping entry
  * @engine_type: type of the engine
@@ -1905,10 +1923,23 @@ struct hl_sync_to_engine_map {
 /**
  * struct hl_state_dump_specs_funcs - virtual functions used by the state dump
  * @gen_sync_to_engine_map: generate a hash map from sync obj id to its engine
+ * @print_single_monitor: format monitor data as string
+ * @monitor_valid: return true if given monitor dump is valid
+ * @print_fences_single_engine: format fences data as string
  */
 struct hl_state_dump_specs_funcs {
 	int (*gen_sync_to_engine_map)(struct hl_device *hdev,
 				struct hl_sync_to_engine_map *map);
+	int (*print_single_monitor)(char **buf, size_t *size, size_t *offset,
+				    struct hl_device *hdev,
+				    struct hl_mon_state_dump *mon);
+	int (*monitor_valid)(struct hl_mon_state_dump *mon);
+	int (*print_fences_single_engine)(struct hl_device *hdev,
+					u64 base_offset,
+					u64 status_base_offset,
+					enum hl_sync_engine_type engine_type,
+					u32 engine_id, char **buf,
+					size_t *size, size_t *offset);
 };
 
 /**
@@ -2795,6 +2826,8 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 
 int hl_state_dump(struct hl_device *hdev);
 const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id);
+const char *hl_state_dump_get_monitor_name(struct hl_device *hdev,
+					struct hl_mon_state_dump *mon);
 void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map);
 __printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
 					const char *format, ...);
@@ -2895,8 +2928,8 @@ static inline void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev,
 {
 }
 
-void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
-					unsigned long length)
+static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
+					char *data, unsigned long length)
 {
 }
 
diff --git a/drivers/misc/habanalabs/common/state_dump.c b/drivers/misc/habanalabs/common/state_dump.c
index a546ea613536f1..74726907c95edf 100644
--- a/drivers/misc/habanalabs/common/state_dump.c
+++ b/drivers/misc/habanalabs/common/state_dump.c
@@ -5,6 +5,7 @@
  * All Rights Reserved.
  */
 
+#include <linux/vmalloc.h>
 #include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 
@@ -191,6 +192,30 @@ const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id)
 	return NULL;
 }
 
+/**
+ * hl_state_dump_get_monitor_name - transform monitor object dump to monitor
+ * name if available
+ * @hdev: pointer to the device
+ * @mon: monitor state dump
+ *
+ * Returns a name literal or NULL if not resolved.
+ * Note: returning NULL shall not be considered as a failure, as not all
+ * monitors are named.
+ */
+const char *hl_state_dump_get_monitor_name(struct hl_device *hdev,
+					struct hl_mon_state_dump *mon)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct hl_hw_obj_name_entry *entry;
+
+	hash_for_each_possible(sds->monitor_id_to_str_tb,
+				entry, node, mon->id)
+		if (mon->id == entry->id)
+			return entry->name;
+
+	return NULL;
+}
+
 /**
  * hl_state_dump_free_sync_to_engine_map - free sync object to engine map
  * @map: sync object to engine map
@@ -244,23 +269,15 @@ static u32 *hl_state_dump_read_sync_objects(struct hl_device *hdev, u32 index)
 	s64 base_addr; /* Base addr can be negative */
 	int i;
 
-	base_addr =
-		sds->props[SP_SYNC_OBJ_BASE_ADDR] +
-		sds->props[SP_NEXT_SYNC_OBJ_ADDR] *
-		index;
+	base_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+			sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index;
 
-	sync_objects = vmalloc(
-		sds->props[SP_SYNC_OBJ_AMOUNT] *
-		sizeof(u32));
+	sync_objects = vmalloc(sds->props[SP_SYNC_OBJ_AMOUNT] * sizeof(u32));
 	if (!sync_objects)
 		return NULL;
 
-	for (i = 0;
-		i < sds->props[SP_SYNC_OBJ_AMOUNT];
-		++i) {
-		sync_objects[i] =
-		RREG32(base_addr + i * sizeof(u32));
-	}
+	for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i)
+		sync_objects[i] = RREG32(base_addr + i * sizeof(u32));
 
 	return sync_objects;
 }
@@ -312,19 +329,16 @@ hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index,
 		goto out;
 	}
 
-	for (i = 0;
-		i < sds->props[SP_SYNC_OBJ_AMOUNT];
-		++i) {
+	for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) {
 		struct hl_sync_to_engine_map_entry *entry;
 		u64 sync_object_addr;
 
 		if (!sync_objects[i])
 			continue;
 
-		sync_object_addr =
-			sds->props[SP_SYNC_OBJ_BASE_ADDR] +
-			sds->props[SP_NEXT_SYNC_OBJ_ADDR] *
-			index + i * sizeof(u32);
+		sync_object_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+				sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index +
+				i * sizeof(u32);
 
 		rc = hl_snprintf_resize(buf, size, offset, "sync id: %u", i);
 		if (rc)
@@ -345,7 +359,8 @@ hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index,
 		entry = hl_state_dump_get_sync_to_engine(map,
 			(u32)sync_object_addr);
 		if (entry) {
-			rc = hl_snprintf_resize(buf, size, offset, ", Engine: ");
+			rc = hl_snprintf_resize(buf, size, offset,
+						", Engine: ");
 			if (rc)
 				goto free_sync_objects;
 			rc = hl_print_resize_sync_engine(buf, size, offset,
@@ -421,6 +436,245 @@ static int hl_state_dump_print_syncs(struct hl_device *hdev,
 	return rc;
 }
 
+/**
+ * hl_state_dump_alloc_read_sm_block_monitors - read monitors for a specific
+ * block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ *
+ * Returns an array of monitor data of size SP_MONITORS_AMOUNT or NULL
+ * on error
+ */
+static struct hl_mon_state_dump *
+hl_state_dump_alloc_read_sm_block_monitors(struct hl_device *hdev, u32 index)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct hl_mon_state_dump *monitors;
+	s64 base_addr; /* Base addr can be negative */
+	int i;
+
+	monitors = vmalloc(sds->props[SP_MONITORS_AMOUNT] *
+			   sizeof(struct hl_mon_state_dump));
+	if (!monitors)
+		return NULL;
+
+	base_addr = sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index;
+
+	for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) {
+		monitors[i].id = i;
+		monitors[i].wr_addr_low =
+			RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_LOW] +
+				i * sizeof(u32));
+
+		monitors[i].wr_addr_high =
+			RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_HIGH] +
+				i * sizeof(u32));
+
+		monitors[i].wr_data =
+			RREG32(base_addr + sds->props[SP_MON_OBJ_WR_DATA] +
+				i * sizeof(u32));
+
+		monitors[i].arm_data =
+			RREG32(base_addr + sds->props[SP_MON_OBJ_ARM_DATA] +
+				i * sizeof(u32));
+
+		monitors[i].status =
+			RREG32(base_addr + sds->props[SP_MON_OBJ_STATUS] +
+				i * sizeof(u32));
+	}
+
+	return monitors;
+}
+
+/**
+ * hl_state_dump_free_monitors - free the monitors structure
+ * @monitors: monitors array created with
+ *            hl_state_dump_alloc_read_sm_block_monitors
+ */
+static void hl_state_dump_free_monitors(struct hl_mon_state_dump *monitors)
+{
+	vfree(monitors);
+}
+
+/**
+ * hl_state_dump_print_monitors_single_block - print active monitors on a
+ * single block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_monitors_single_block(struct hl_device *hdev,
+						u32 index,
+						char **buf, size_t *size,
+						size_t *offset)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct hl_mon_state_dump *monitors = NULL;
+	int rc = 0, i;
+
+	if (sds->sync_namager_names) {
+		rc = hl_snprintf_resize(
+			buf, size, offset, "%s\n",
+			sds->sync_namager_names[index]);
+		if (rc)
+			goto out;
+	}
+
+	monitors = hl_state_dump_alloc_read_sm_block_monitors(hdev, index);
+	if (!monitors) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) {
+		if (!(sds->funcs.monitor_valid(&monitors[i])))
+			continue;
+
+		/* Monitor is valid: dump it, then terminate its line */
+		rc = sds->funcs.print_single_monitor(buf, size, offset, hdev,
+							&monitors[i]);
+		if (!rc)
+			rc = hl_snprintf_resize(buf, size, offset, "\n");
+		if (rc)
+			goto free_monitors;
+	}
+
+free_monitors:
+	hl_state_dump_free_monitors(monitors);
+out:
+	return rc;
+}
+
+/**
+ * hl_state_dump_print_monitors - print active monitors
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_monitors(struct hl_device *hdev,
+					char **buf, size_t *size,
+					size_t *offset)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	u32 index;
+	int rc = 0;
+
+	rc = hl_snprintf_resize(buf, size, offset,
+		"Valid (armed) monitor objects:\n");
+	if (rc)
+		goto out;
+
+	if (sds->sync_namager_names) {
+		for (index = 0; sds->sync_namager_names[index]; ++index) {
+			rc = hl_state_dump_print_monitors_single_block(
+				hdev, index, buf, size, offset);
+			if (rc)
+				goto out;
+		}
+	} else {
+		for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) {
+			rc = hl_state_dump_print_monitors_single_block(
+				hdev, index, buf, size, offset);
+			if (rc)
+				goto out;
+		}
+	}
+
+out:
+	return rc;
+}
+
+/**
+ * hl_state_dump_print_engine_fences - print active fences for a specific
+ * engine
+ * @hdev: pointer to the device
+ * @engine_type: engine type to use
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ */
+static int
+hl_state_dump_print_engine_fences(struct hl_device *hdev,
+				  enum hl_sync_engine_type engine_type,
+				  char **buf, size_t *size, size_t *offset)
+{
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	int rc = 0, i, n_fences;
+	u64 base_addr, next_fence;
+
+	switch (engine_type) {
+	case ENGINE_TPC:
+		n_fences = sds->props[SP_NUM_OF_TPC_ENGINES];
+		base_addr = sds->props[SP_TPC0_CMDQ];
+		next_fence = sds->props[SP_NEXT_TPC];
+		break;
+	case ENGINE_MME:
+		n_fences = sds->props[SP_NUM_OF_MME_ENGINES];
+		base_addr = sds->props[SP_MME_CMDQ];
+		next_fence = sds->props[SP_NEXT_MME];
+		break;
+	case ENGINE_DMA:
+		n_fences = sds->props[SP_NUM_OF_DMA_ENGINES];
+		base_addr = sds->props[SP_DMA_CMDQ];
+		next_fence = sds->props[SP_DMA_QUEUES_OFFSET];
+		break;
+	default:
+		return -EINVAL;
+	}
+	for (i = 0; i < n_fences; ++i) {
+		rc = sds->funcs.print_fences_single_engine(
+			hdev,
+			base_addr + next_fence * i +
+				sds->props[SP_FENCE0_CNT_OFFSET],
+			base_addr + next_fence * i +
+				sds->props[SP_CP_STS_OFFSET],
+			engine_type, i, buf, size, offset);
+		if (rc)
+			goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * hl_state_dump_print_fences - print active fences
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ */
+static int hl_state_dump_print_fences(struct hl_device *hdev, char **buf,
+				      size_t *size, size_t *offset)
+{
+	int rc = 0;
+
+	rc = hl_snprintf_resize(buf, size, offset, "Valid (armed) fences:\n");
+	if (rc)
+		goto out;
+
+	rc = hl_state_dump_print_engine_fences(hdev, ENGINE_TPC, buf, size, offset);
+	if (rc)
+		goto out;
+
+	rc = hl_state_dump_print_engine_fences(hdev, ENGINE_MME, buf, size, offset);
+	if (rc)
+		goto out;
+
+	rc = hl_state_dump_print_engine_fences(hdev, ENGINE_DMA, buf, size, offset);
+	if (rc)
+		goto out;
+
+out:
+	return rc;
+}
+
 /**
  * hl_state_dump() - dump system state
  * @hdev: pointer to device structure
@@ -443,6 +697,18 @@ int hl_state_dump(struct hl_device *hdev)
 
 	hl_snprintf_resize(&buf, &size, &offset, "\n");
 
+	rc = hl_state_dump_print_monitors(hdev, &buf, &size, &offset);
+	if (rc)
+		goto err;
+
+	hl_snprintf_resize(&buf, &size, &offset, "\n");
+
+	rc = hl_state_dump_print_fences(hdev, &buf, &size, &offset);
+	if (rc)
+		goto err;
+
+	hl_snprintf_resize(&buf, &size, &offset, "\n");
+
 	hl_debugfs_set_state_dump(hdev, buf, size);
 
 	return 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 7f90f637d7f42e..29ada339b66acf 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8986,9 +8986,36 @@ static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
 	return 0;
 }
 
+static int gaudi_monitor_valid(struct hl_mon_state_dump *mon)
+{
+	/* Not implemented */
+	return 0;
+}
+
+static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
+				struct hl_device *hdev,
+				struct hl_mon_state_dump *mon)
+{
+	/* Not implemented */
+	return 0;
+}
+
+
+static int gaudi_print_fences_single_engine(
+	struct hl_device *hdev, u64 base_offset, u64 status_base_offset,
+	enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
+	size_t *size, size_t *offset)
+{
+	/* Not implemented */
+	return 0;
+}
+
 
 static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
+	.monitor_valid = gaudi_monitor_valid,
+	.print_single_monitor = gaudi_print_single_monitor,
 	.gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map,
+	.print_fences_single_engine = gaudi_print_fences_single_engine,
 };
 
 static void gaudi_state_dump_init(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 2c3d642d31aba6..4144a8445eef5f 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5533,9 +5533,36 @@ static int goya_gen_sync_to_engine_map(struct hl_device *hdev,
 	return 0;
 }
 
+static int goya_monitor_valid(struct hl_mon_state_dump *mon)
+{
+	/* Not implemented */
+	return 0;
+}
+
+static int goya_print_single_monitor(char **buf, size_t *size, size_t *offset,
+				struct hl_device *hdev,
+				struct hl_mon_state_dump *mon)
+{
+	/* Not implemented */
+	return 0;
+}
+
+
+static int goya_print_fences_single_engine(
+	struct hl_device *hdev, u64 base_offset, u64 status_base_offset,
+	enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
+	size_t *size, size_t *offset)
+{
+	/* Not implemented */
+	return 0;
+}
+
 
 static struct hl_state_dump_specs_funcs goya_state_dump_funcs = {
+	.monitor_valid = goya_monitor_valid,
+	.print_single_monitor = goya_print_single_monitor,
 	.gen_sync_to_engine_map = goya_gen_sync_to_engine_map,
+	.print_fences_single_engine = goya_print_fences_single_engine,
 };
 
 static void goya_state_dump_init(struct hl_device *hdev)

From 116768dbeb53d5e5d8288a92ca1dbd98a8cfba8e Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Sun, 6 Jun 2021 10:30:41 +0300
Subject: [PATCH 13/55] habanalabs/gaudi: implement state dump

At the first stage, only gaudi core dump shall be implemented, not
including the status registers.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c         | 298 +++++++++++++++++-
 drivers/misc/habanalabs/gaudi/gaudiP.h        |  12 +
 .../include/gaudi/asic_reg/gaudi_regs.h       |   3 +
 .../habanalabs/include/gaudi/gaudi_masks.h    |  17 +
 4 files changed, 319 insertions(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 29ada339b66acf..7dd36d1cb39e3b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -106,6 +106,8 @@
 
 #define GAUDI_PLL_MAX 10
 
+#define BIN_REG_STRING_SIZE	sizeof("0b10101010101010101010101010101010")
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
 		"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
 		"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -348,7 +350,90 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
 	QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
 };
 
-static s64 gaudi_state_dump_specs_props[SP_MAX] = {0};
+static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = {
+	{ .id = 0,  .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" },
+	{ .id = 1,  .name = "SYNC_OBJ_DMA_UP_FEEDBACK" },
+	{ .id = 2,  .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" },
+	{ .id = 3,  .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" },
+	{ .id = 4,  .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" },
+	{ .id = 5,  .name = "SYNC_OBJ_HOST_DRAM_DONE" },
+	{ .id = 6,  .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" },
+	{ .id = 7,  .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" },
+	{ .id = 8,  .name = "SYNC_OBJ_ENGINE_SEM_MME_0" },
+	{ .id = 9,  .name = "SYNC_OBJ_ENGINE_SEM_MME_1" },
+	{ .id = 10, .name = "SYNC_OBJ_ENGINE_SEM_TPC_0" },
+	{ .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" },
+	{ .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" },
+	{ .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" },
+	{ .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" },
+	{ .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" },
+	{ .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" },
+	{ .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" },
+	{ .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" },
+	{ .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" },
+	{ .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" },
+	{ .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" },
+	{ .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" },
+	{ .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" },
+	{ .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" },
+	{ .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" },
+	{ .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" },
+};
+
+static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = {
+	{ .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" },
+	{ .id = 201, .name = "MON_OBJ_DMA_UP_FEADBACK_RESET" },
+	{ .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" },
+	{ .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" },
+	{ .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" },
+	{ .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" },
+	{ .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" },
+	{ .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" },
+	{ .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" },
+	{ .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" },
+	{ .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" },
+};
+
+static s64 gaudi_state_dump_specs_props[] = {
+	[SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0,
+	[SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL,
+	[SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK,
+	[SP_MON_OBJ_WR_ADDR_LOW] =
+		mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0,
+	[SP_MON_OBJ_WR_ADDR_HIGH] =
+		mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0,
+	[SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0,
+	[SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0,
+	[SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0,
+	[SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK,
+	[SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0,
+	[SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR,
+	[SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0,
+	[SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0,
+	[SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL,
+	[SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0,
+	[SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0,
+	[SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO,
+	[SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0,
+	[SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES,
+	[SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES,
+	[SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES,
+	[SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES,
+	[SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES,
+	[SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS,
+	[SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES,
+	[SP_FENCE0_CNT_OFFSET] =
+		mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0,
+	[SP_FENCE0_RDATA_OFFSET] =
+		mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0,
+	[SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0,
+	[SP_NUM_CORES] = 1,
+};
+
+static const char * const gaudi_sync_manager_names[] = {
+	"SYNC_MGR_E_N", "SYNC_MGR_W_N", "SYNC_MGR_E_S", "SYNC_MGR_W_S",
+	NULL
+};
 
 struct ecc_info_extract_params {
 	u64 block_address;
@@ -8979,25 +9064,141 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
 	}
 }
 
+static int gaudi_add_sync_to_engine_map_entry(
+	struct hl_sync_to_engine_map *map, u32 reg_value,
+	enum hl_sync_engine_type engine_type, u32 engine_id)
+{
+	struct hl_sync_to_engine_map_entry *entry;
+
+	/* Reg value represents a partial address of sync object,
+	 * it is used as unique identifier. For this we need to
+	 * clear the cutoff cfg base bits from the value.
+	 */
+	if (reg_value == 0 || reg_value == 0xffffffff)
+		return 0;
+	reg_value -= (u32)CFG_BASE;
+
+	/* create a new hash entry */
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->engine_type = engine_type;
+	entry->engine_id = engine_id;
+	entry->sync_id = reg_value;
+	hash_add(map->tb, &entry->node, reg_value);
+
+	return 0;
+}
+
 static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
 				struct hl_sync_to_engine_map *map)
 {
-	/* Not implemented */
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	int i, j, rc;
+	u32 reg_value;
+
+	/* Iterate over TPC engines */
+	for (i = 0; i < sds->props[SP_NUM_OF_TPC_ENGINES]; ++i) {
+		/* TPC registers must be accessed with clock gating disabled */
+		mutex_lock(&gaudi->clk_gate_mutex);
+		hdev->asic_funcs->disable_clock_gating(hdev);
+
+		reg_value = RREG32(sds->props[SP_TPC0_CFG_SO] +
+					sds->props[SP_NEXT_TPC] * i);
+
+		/* We can reenable clock_gating */
+		hdev->asic_funcs->set_clock_gating(hdev);
+		mutex_unlock(&gaudi->clk_gate_mutex);
+
+		rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+							ENGINE_TPC, i);
+		if (rc)
+			goto free_sync_to_engine_map;
+	}
+
+	/* Iterate over MME engines */
+	for (i = 0; i < sds->props[SP_NUM_OF_MME_ENGINES]; ++i) {
+		for (j = 0; j < sds->props[SP_SUB_MME_ENG_NUM]; ++j) {
+			/* MME registers must be accessed with clock gating
+			 * disabled
+			 */
+			mutex_lock(&gaudi->clk_gate_mutex);
+			hdev->asic_funcs->disable_clock_gating(hdev);
+
+			reg_value = RREG32(sds->props[SP_MME_CFG_SO] +
+						sds->props[SP_NEXT_MME] * i +
+						j * sizeof(u32));
+
+			/* We can reenable clock_gating */
+			hdev->asic_funcs->set_clock_gating(hdev);
+			mutex_unlock(&gaudi->clk_gate_mutex);
+
+			rc = gaudi_add_sync_to_engine_map_entry(
+				map, reg_value, ENGINE_MME,
+				i * sds->props[SP_SUB_MME_ENG_NUM] + j);
+			if (rc)
+				goto free_sync_to_engine_map;
+		}
+	}
+
+	/* Iterate over DMA engines */
+	for (i = 0; i < sds->props[SP_NUM_OF_DMA_ENGINES]; ++i) {
+		reg_value = RREG32(sds->props[SP_DMA_CFG_SO] +
+					sds->props[SP_DMA_QUEUES_OFFSET] * i);
+		rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+							ENGINE_DMA, i);
+		if (rc)
+			goto free_sync_to_engine_map;
+	}
+
 	return 0;
+
+free_sync_to_engine_map:
+	hl_state_dump_free_sync_to_engine_map(map);
+
+	return rc;
 }
 
 static int gaudi_monitor_valid(struct hl_mon_state_dump *mon)
 {
-	/* Not implemented */
-	return 0;
+	return FIELD_GET(
+		SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK,
+		mon->status);
 }
 
 static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
 				struct hl_device *hdev,
 				struct hl_mon_state_dump *mon)
 {
-	/* Not implemented */
-	return 0;
+	const char *name;
+	char scratch_buf1[BIN_REG_STRING_SIZE],
+		scratch_buf2[BIN_REG_STRING_SIZE];
+
+	name = hl_state_dump_get_monitor_name(hdev, mon);
+	if (!name)
+		name = "";
+
+	return hl_snprintf_resize(
+		buf, size, offset,
+		"Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s",
+		mon->id, name,
+		FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+				mon->arm_data),
+		hl_format_as_binary(
+			scratch_buf1, sizeof(scratch_buf1),
+			FIELD_GET(
+				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+				mon->arm_data)),
+		FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK,
+				mon->arm_data),
+		mon->wr_data,
+		(((u64)mon->wr_addr_high) << 32) | mon->wr_addr_low,
+		hl_format_as_binary(
+			scratch_buf2, sizeof(scratch_buf2),
+			FIELD_GET(
+				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK,
+				mon->status)));
 }
 
 
@@ -9006,8 +9207,68 @@ static int gaudi_print_fences_single_engine(
 	enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
 	size_t *size, size_t *offset)
 {
-	/* Not implemented */
-	return 0;
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	int rc = -ENOMEM, i;
+	u32 *statuses, *fences;
+
+	statuses = kcalloc(sds->props[SP_ENGINE_NUM_OF_QUEUES],
+			sizeof(*statuses), GFP_KERNEL);
+	if (!statuses)
+		goto out;
+
+	fences = kcalloc(sds->props[SP_ENGINE_NUM_OF_FENCES] *
+				sds->props[SP_ENGINE_NUM_OF_QUEUES],
+			 sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		goto free_status;
+
+	/* Read one status register per queue, as statuses[] is sized */
+	for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i)
+		statuses[i] = RREG32(status_base_offset + i * sizeof(u32));
+
+	for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES] *
+				sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i)
+		fences[i] = RREG32(base_offset + i * sizeof(u32));
+
+	/* Print only the queues that report a fence in progress */
+	for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) {
+		u32 fence_id, fence_idx;
+		u64 fence_cnt, fence_rdata;
+		const char *engine_name;
+
+		if (!FIELD_GET(TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK,
+			statuses[i]))
+			continue;
+
+		fence_id =
+			FIELD_GET(TPC0_QM_CP_STS_0_FENCE_ID_MASK, statuses[i]);
+		/* Same index is used for the address and for fences[] */
+		fence_idx = i + fence_id * sds->props[SP_ENGINE_NUM_OF_QUEUES];
+		fence_cnt = base_offset + CFG_BASE + sizeof(u32) * fence_idx;
+		fence_rdata = fence_cnt - sds->props[SP_FENCE0_CNT_OFFSET] +
+				sds->props[SP_FENCE0_RDATA_OFFSET];
+		engine_name = hl_sync_engine_to_string(engine_type);
+
+		rc = hl_snprintf_resize(
+			buf, size, offset,
+			"%s%u, stream %u: fence id %u cnt = 0x%llx (%s%u_QM.CP_FENCE%u_CNT_%u) rdata = 0x%llx (%s%u_QM.CP_FENCE%u_RDATA_%u) value = %u, cp_status = %u\n",
+			engine_name, engine_id, i, fence_id,
+			fence_cnt, engine_name, engine_id, fence_id, i,
+			fence_rdata, engine_name, engine_id, fence_id, i,
+			fences[fence_idx],
+			statuses[i]);
+		if (rc)
+			goto free_fences;
+	}
+
+	rc = 0;
+
+free_fences:
+	kfree(fences);
+free_status:
+	kfree(statuses);
+out:
+	return rc;
 }
 
 
@@ -9020,9 +9281,24 @@ static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
 
 static void gaudi_state_dump_init(struct hl_device *hdev)
 {
-	/* Not implemented */
-	hdev->state_dump_specs.props = gaudi_state_dump_specs_props;
-	hdev->state_dump_specs.funcs = gaudi_state_dump_funcs;
+	struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(gaudi_so_id_to_str); ++i)
+		hash_add(sds->so_id_to_str_tb,
+			&gaudi_so_id_to_str[i].node,
+			gaudi_so_id_to_str[i].id);
+
+	for (i = 0; i < ARRAY_SIZE(gaudi_monitor_id_to_str); ++i)
+		hash_add(sds->monitor_id_to_str_tb,
+			&gaudi_monitor_id_to_str[i].node,
+			gaudi_monitor_id_to_str[i].id);
+
+	sds->props = gaudi_state_dump_specs_props;
+
+	sds->sync_namager_names = gaudi_sync_manager_names;
+
+	sds->funcs = gaudi_state_dump_funcs;
 }
 
 static const struct hl_asic_funcs gaudi_funcs = {
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 957bf3720f7014..838e98b0d43d01 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -200,6 +200,18 @@
 #define HW_CAP_TPC_MASK		GENMASK(31, 24)
 #define HW_CAP_TPC_SHIFT	24
 
+#define NEXT_SYNC_OBJ_ADDR_INTERVAL \
+	(mmSYNC_MNGR_W_N_SYNC_MNGR_OBJS_SOB_OBJ_0 - \
+	 mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0)
+#define NUM_OF_MME_ENGINES			2
+#define NUM_OF_MME_SUB_ENGINES		2
+#define NUM_OF_TPC_ENGINES			8
+#define NUM_OF_DMA_ENGINES			8
+#define NUM_OF_QUEUES				5
+#define NUM_OF_STREAMS				4
+#define NUM_OF_FENCES				4
+
+
 #define GAUDI_CPU_PCI_MSB_ADDR(addr)	(((addr) & GENMASK_ULL(49, 39)) >> 39)
 #define GAUDI_PCI_TO_CPU_ADDR(addr)			\
 	do {						\
diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
index 5bb54b34a8aebe..ffdfbd9b32201f 100644
--- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
+++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
@@ -126,6 +126,9 @@
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_1                     0x4F2004
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_2047                  0x4F3FFC
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0               0x4F4000
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0               0x4F4800
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0                0x4F5000
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0                     0x4F5800
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0                  0x4F6000
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511                0x4F67FC
 
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
index 9aea7e99665468..acc85d3ed98b4b 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
@@ -449,4 +449,21 @@ enum axi_id {
 #define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK                               0x1
 #define PCIE_AUX_FLR_CTRL_INT_MASK_MASK                              0x2
 
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_SHIFT        0
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK         0x1
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_SHIFT      1
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK       0x1FE
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_SHIFT             0
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK              0xFF
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_SHIFT            8
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK             0xFF00
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_SHIFT             16
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_MASK              0x10000
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_SHIFT             17
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK              0xFFFE0000
+#define TPC0_QM_CP_STS_0_FENCE_ID_SHIFT                              20
+#define TPC0_QM_CP_STS_0_FENCE_ID_MASK                               0x300000
+#define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_SHIFT                     22
+#define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK                      0x400000
+
 #endif /* GAUDI_MASKS_H_ */

From fe0d236fa7b51bfb8cd76df6f63f4c0c91f49759 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 6 Jul 2021 12:06:15 +0300
Subject: [PATCH 14/55] habanalabs: missing mutex_unlock in process kill
 procedure

Add the missing mutex unlock on the path where the driver gives up on
killing user processes.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 86426052a191d1..28fe15a283613e 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -822,6 +822,7 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 		} else {
 			dev_warn(hdev->dev,
 				"Can't get task struct for PID so giving up on killing process\n");
+			mutex_unlock(&hdev->fpriv_list_lock);
 			return -ETIME;
 		}
 	}

From 4fc9e2c6658aa008f3192e320a2a46f3469e2afc Mon Sep 17 00:00:00 2001
From: Zvika Yehudai <zyehudai@habana.ai>
Date: Tue, 6 Jul 2021 13:50:32 +0300
Subject: [PATCH 15/55] habanalabs: rename cb_mmap to mmap

This function will be used for more mmap operations than just
mmaping CBs.

Signed-off-by: Zvika Yehudai <zyehudai@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/command_buffer.c | 2 +-
 drivers/misc/habanalabs/common/habanalabs.h     | 4 ++--
 drivers/misc/habanalabs/gaudi/gaudi.c           | 4 ++--
 drivers/misc/habanalabs/goya/goya.c             | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 719168c980a45b..58afefcd74f312 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -552,7 +552,7 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 
 	vma->vm_private_data = cb;
 
-	rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
+	rc = hdev->asic_funcs->mmap(hdev, vma, cb->kernel_address,
 					cb->bus_address, cb->size);
 	if (rc) {
 		spin_lock(&cb->lock);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9aedea471ebe76..6affad6bc1b9f5 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1024,7 +1024,7 @@ struct fw_load_mgr {
  *                hw_fini and before CS rollback.
  * @suspend: handles IP specific H/W or SW changes for suspend.
  * @resume: handles IP specific H/W or SW changes for resume.
- * @cb_mmap: maps a CB.
+ * @mmap: maps a memory.
  * @ring_doorbell: increment PI on a given QMAN.
  * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific
  *             function because the PQs are located in different memory areas
@@ -1143,7 +1143,7 @@ struct hl_asic_funcs {
 	void (*halt_engines)(struct hl_device *hdev, bool hard_reset);
 	int (*suspend)(struct hl_device *hdev);
 	int (*resume)(struct hl_device *hdev);
-	int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+	int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
 			void *cpu_addr, dma_addr_t dma_addr, size_t size);
 	void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
 	void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 7dd36d1cb39e3b..14d0f6d9a38334 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -4299,7 +4299,7 @@ static int gaudi_resume(struct hl_device *hdev)
 	return gaudi_init_iatu(hdev);
 }
 
-static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
 			void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
 	int rc;
@@ -9313,7 +9313,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.halt_engines = gaudi_halt_engines,
 	.suspend = gaudi_suspend,
 	.resume = gaudi_resume,
-	.cb_mmap = gaudi_cb_mmap,
+	.mmap = gaudi_mmap,
 	.ring_doorbell = gaudi_ring_doorbell,
 	.pqe_write = gaudi_pqe_write,
 	.asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 4144a8445eef5f..017c913f7b5ab0 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2798,7 +2798,7 @@ int goya_resume(struct hl_device *hdev)
 	return goya_init_iatu(hdev);
 }
 
-static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
 			void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
 	int rc;
@@ -5584,7 +5584,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.halt_engines = goya_halt_engines,
 	.suspend = goya_suspend,
 	.resume = goya_resume,
-	.cb_mmap = goya_cb_mmap,
+	.mmap = goya_mmap,
 	.ring_doorbell = goya_ring_doorbell,
 	.pqe_write = goya_pqe_write,
 	.asic_dma_alloc_coherent = goya_dma_alloc_coherent,

From 0af280b96c52cbb72ee95249a7fe2278e5d4f968 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Fri, 9 Jul 2021 00:06:47 +0300
Subject: [PATCH 16/55] habanalabs: fix nullifying of destroyed mmu pgt pool

In case of host-resident MMU, when the page tables pool is destroyed,
its pointer is not nullified correctly.
As a result, on a device fini which happens after a failing reset, the
already destroyed pool is accessed, which leads to a kernel panic.
The patch fixes the setting of the pool pointer to NULL.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/mmu/mmu_v1.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
index c5e93ff3258662..0f536f79dd9c9a 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
@@ -470,13 +470,13 @@ static void hl_mmu_v1_fini(struct hl_device *hdev)
 	if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.hr.mmu_shadow_hop0)) {
 		kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
 		gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
-	}
 
-	/* Make sure that if we arrive here again without init was called we
-	 * won't cause kernel panic. This can happen for example if we fail
-	 * during hard reset code at certain points
-	 */
-	hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+		/* Make sure that if we arrive here again without init was
+		 * called we won't cause kernel panic. This can happen for
+		 * example if we fail during hard reset code at certain points
+		 */
+		hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+	}
 }
 
 /**

From 099254da050091dc730d045b2f78ba0d2346dfa3 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Fri, 9 Jul 2021 17:34:45 +0300
Subject: [PATCH 17/55] habanalabs: mark linux image as not loaded after
 hw_fini

If hard reset fails after the call to hw_fini and before loading the
linux image to the device, a subsequent call to hw_fini should
communicate via COMMS (or MSG_TO_CPU regs for old FW versions).
However, the driver still tries in this case to communicate via the GIC,
and thus no hard reset is actually done.
To avoid that, the patch clears the linux_loaded flag after every call
to hw_fini.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 28fe15a283613e..4fcd24e5a6092e 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1075,9 +1075,12 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->asic_funcs->hw_fini(hdev, hard_reset);
 
 	if (hard_reset) {
+		hdev->fw_loader.linux_loaded = false;
+
 		/* Release kernel context */
 		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
 			hdev->kernel_ctx = NULL;
+
 		hl_vm_fini(hdev);
 		hl_mmu_fini(hdev);
 		hl_eq_reset(hdev, &hdev->event_queue);
@@ -1611,6 +1614,8 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, true);
 
+	hdev->fw_loader.linux_loaded = false;
+
 	/* Release kernel context */
 	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
 		dev_err(hdev->dev, "kernel ctx is still alive\n");

From ebb2eef55518c6915f0b595572ce0db7fc9c4d86 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sat, 10 Jul 2021 20:00:35 +0300
Subject: [PATCH 18/55] habanalabs: fix type of variable

Recently, the size parameter in userptr structure was changed to u64.
As a result, we need to change the type of the local range_size
in device_va_to_pa() to u64 to avoid overflow.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 51744e42b808fb..2c587af28f9bba 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -491,11 +491,10 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
 	struct hl_ctx *ctx = hdev->compute_ctx;
 	struct hl_vm_hash_node *hnode;
+	u64 end_address, range_size;
 	struct hl_userptr *userptr;
 	enum vm_type *vm_type;
 	bool valid = false;
-	u64 end_address;
-	u32 range_size;
 	int i, rc = 0;
 
 	if (!ctx) {

From 71301b89223c9237de3947b0eea2c5e653ee828b Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sat, 10 Jul 2021 21:12:45 +0300
Subject: [PATCH 19/55] habanalabs: add asic property of host dma offset

Each ASIC can have a different offset to add to a host dma address,
to enable the ASIC to access that host memory.

The usage for this can be common code so add this to the asic
property structure.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 3 +++
 drivers/misc/habanalabs/gaudi/gaudi.c       | 1 +
 drivers/misc/habanalabs/goya/goya.c         | 1 +
 3 files changed, 5 insertions(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6affad6bc1b9f5..9782bb50931ab2 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -434,6 +434,8 @@ struct hl_hints_range {
  *                  the device's MMU.
  * @dram_hints_align_mask: dram va hint addresses alignment mask which is used
  *                  for hints validity check.
+ * @device_dma_offset_for_host_access: the offset to add to host DMA addresses
+ *                                     to enable the device to access them.
  * @mmu_pgt_size: MMU page tables total size.
  * @mmu_pte_size: PTE size in MMU page tables.
  * @mmu_hop_table_size: MMU hop table size.
@@ -527,6 +529,7 @@ struct asic_fixed_properties {
 	u64				cb_va_start_addr;
 	u64				cb_va_end_addr;
 	u64				dram_hints_align_mask;
+	u64				device_dma_offset_for_host_access;
 	u32				mmu_pgt_size;
 	u32				mmu_pte_size;
 	u32				mmu_hop_table_size;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 14d0f6d9a38334..fdbe8155ef3cce 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -537,6 +537,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 						get_collective_mode(hdev, i);
 	}
 
+	prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
 	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
 	prop->collective_first_sob = 0;
 	prop->collective_first_mon = 0;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 017c913f7b5ab0..ae987192836919 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -389,6 +389,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 		prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_USER;
 	}
 
+	prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
 	prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
 
 	prop->dram_base_address = DRAM_PHYS_BASE;

From f3863b0ce2f52f34c6a3bf33189e2edaaf0e7384 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sun, 11 Jul 2021 08:55:42 +0300
Subject: [PATCH 20/55] habanalabs: set dma max segment size

This is required of any device that is capable of performing DMA.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/pci/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/misc/habanalabs/common/pci/pci.c b/drivers/misc/habanalabs/common/pci/pci.c
index d5bedf5ba011be..0b5366cc84fdb3 100644
--- a/drivers/misc/habanalabs/common/pci/pci.c
+++ b/drivers/misc/habanalabs/common/pci/pci.c
@@ -436,6 +436,8 @@ int hl_pci_init(struct hl_device *hdev)
 		goto unmap_pci_bars;
 	}
 
+	dma_set_max_seg_size(&pdev->dev, U32_MAX);
+
 	return 0;
 
 unmap_pci_bars:

From f51f53ef9497ede79708060cc4ef259ccbafd3f6 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Mon, 12 Jul 2021 14:18:30 +0300
Subject: [PATCH 21/55] habanalabs/gaudi: trigger state dump in case of SM
 errors

State dump is relevant to the user in case of Sync Manager error, so
we need to trigger it in that case as well.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index fdbe8155ef3cce..6cbedeee15d163 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7894,8 +7894,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
 	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
 			>> EQ_CTL_EVENT_TYPE_SHIFT);
-	u8 cause;
 	bool reset_required;
+	u8 cause;
+	int rc;
 
 	gaudi->events_stat[event_type]++;
 	gaudi->events_stat_aggregate[event_type]++;
@@ -8081,6 +8082,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		gaudi_print_irq_info(hdev, event_type, false);
 		gaudi_print_sm_sei_info(hdev, event_type,
 					&eq_entry->sm_sei_data);
+		rc = hl_state_dump(hdev);
+		if (rc)
+			dev_err(hdev->dev,
+				"Error during system state dump %d\n", rc);
 		hl_fw_unmask_irq(hdev, event_type);
 		break;
 

From 96a279734462ba5a9c6fc06f0661e114821f4dc6 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Mon, 12 Jul 2021 13:48:56 +0300
Subject: [PATCH 22/55] habanalabs/gaudi: fix information printed on SM event

Print the SM name instead of index because it is more informational for
the user to know the SM name instead of id when a SM interrupt occurs.

In addition, the index that is printed is of the SOB group, not
a specific SOB.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 34 ++++++++++++++++++---------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 6cbedeee15d163..561485dcddaf78 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -430,8 +430,14 @@ static s64 gaudi_state_dump_specs_props[] = {
 	[SP_NUM_CORES] = 1,
 };
 
+/* The order here is opposite to the order of the indexing in the h/w.
+ * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
+ */
 static const char * const gaudi_sync_manager_names[] = {
-	"SYNC_MGR_E_N", "SYNC_MGR_W_N", "SYNC_MGR_E_S", "SYNC_MGR_W_S",
+	"SYNC_MGR_E_N",
+	"SYNC_MGR_W_N",
+	"SYNC_MGR_E_S",
+	"SYNC_MGR_W_S",
 	NULL
 };
 
@@ -7414,24 +7420,30 @@ static void gaudi_print_sm_sei_info(struct hl_device *hdev, u16 event_type,
 {
 	u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0;
 
+	/* Flip the bits as the enum is ordered in the opposite way */
+	index = (index ^ 0x3) & 0x3;
+
 	switch (sei_data->sei_cause) {
 	case SM_SEI_SO_OVERFLOW:
-		dev_err(hdev->dev,
-			"SM %u SEI Error: SO %u overflow/underflow",
-			index, le32_to_cpu(sei_data->sei_log));
+		dev_err_ratelimited(hdev->dev,
+			"%s SEI Error: SOB Group %u overflow/underflow",
+			gaudi_sync_manager_names[index],
+			le32_to_cpu(sei_data->sei_log));
 		break;
 	case SM_SEI_LBW_4B_UNALIGNED:
-		dev_err(hdev->dev,
-			"SM %u SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
-			index, le32_to_cpu(sei_data->sei_log));
+		dev_err_ratelimited(hdev->dev,
+			"%s SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
+			gaudi_sync_manager_names[index],
+			le32_to_cpu(sei_data->sei_log));
 		break;
 	case SM_SEI_AXI_RESPONSE_ERR:
-		dev_err(hdev->dev,
-			"SM %u SEI Error: AXI ID %u response error",
-			index, le32_to_cpu(sei_data->sei_log));
+		dev_err_ratelimited(hdev->dev,
+			"%s SEI Error: AXI ID %u response error",
+			gaudi_sync_manager_names[index],
+			le32_to_cpu(sei_data->sei_log));
 		break;
 	default:
-		dev_err(hdev->dev, "Unknown SM SEI cause %u",
+		dev_err_ratelimited(hdev->dev, "Unknown SM SEI cause %u",
 				le32_to_cpu(sei_data->sei_log));
 		break;
 	}

From db2f48b4239e90345b8eae6d8dd6282892104918 Mon Sep 17 00:00:00 2001
From: Koby Elbaz <kelbaz@habana.ai>
Date: Tue, 6 Jul 2021 20:50:33 +0300
Subject: [PATCH 23/55] habanalabs: fix race between soft reset and heartbeat

There is a scenario where an ongoing soft reset would race with an
ongoing heartbeat routine, eventually causing heartbeat to fail and
thus to escalate into a hard reset.

With this fix, soft-reset procedure will disable heartbeat CPU messages
and flush the (ongoing) current one before continuing with reset code.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c      | 95 +++++++++-----------
 drivers/misc/habanalabs/common/firmware_if.c | 18 ++--
 drivers/misc/habanalabs/common/habanalabs.h  |  4 +-
 drivers/misc/habanalabs/common/hw_queue.c    | 30 +++----
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 4fcd24e5a6092e..b09bdd1d462f5b 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -682,6 +682,44 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
 	return rc;
 }
 
+static void take_release_locks(struct hl_device *hdev)
+{
+	/* Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	/* Flush processes that are sending message to CPU */
+	mutex_lock(&hdev->send_cpu_message_lock);
+	mutex_unlock(&hdev->send_cpu_message_lock);
+
+	/* Flush anyone that is inside device open */
+	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_unlock(&hdev->fpriv_list_lock);
+}
+
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
+{
+	if (hard_reset)
+		device_late_fini(hdev);
+
+	/*
+	 * Halt the engines and disable interrupts so we won't get any more
+	 * completions from H/W and we won't have any accesses from the
+	 * H/W to the host machine
+	 */
+	hdev->asic_funcs->halt_engines(hdev, hard_reset);
+
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
+	/* Release all pending user interrupts, each pending user interrupt
+	 * holds a reference to user context
+	 */
+	hl_release_pending_user_interrupts(hdev);
+}
+
 /*
  * hl_device_suspend - initiate device suspend
  *
@@ -707,16 +745,7 @@ int hl_device_suspend(struct hl_device *hdev)
 	/* This blocks all other stuff that is not blocked by in_reset */
 	hdev->disabled = true;
 
-	/*
-	 * Flush anyone that is inside the critical section of enqueue
-	 * jobs to the H/W
-	 */
-	hdev->asic_funcs->hw_queues_lock(hdev);
-	hdev->asic_funcs->hw_queues_unlock(hdev);
-
-	/* Flush processes that are sending message to CPU */
-	mutex_lock(&hdev->send_cpu_message_lock);
-	mutex_unlock(&hdev->send_cpu_message_lock);
+	take_release_locks(hdev);
 
 	rc = hdev->asic_funcs->suspend(hdev);
 	if (rc)
@@ -871,48 +900,6 @@ static void device_disable_open_processes(struct hl_device *hdev)
 	mutex_unlock(&hdev->fpriv_list_lock);
 }
 
-static void take_release_locks(struct hl_device *hdev)
-{
-	/* Flush anyone that is inside the critical section of enqueue
-	 * jobs to the H/W
-	 */
-	hdev->asic_funcs->hw_queues_lock(hdev);
-	hdev->asic_funcs->hw_queues_unlock(hdev);
-
-	/* Flush anyone that is inside device open */
-	mutex_lock(&hdev->fpriv_list_lock);
-	mutex_unlock(&hdev->fpriv_list_lock);
-}
-
-static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
-{
-	if (hard_reset) {
-		device_late_fini(hdev);
-
-		/*
-		 * Now that the heartbeat thread is closed, flush processes
-		 * which are sending messages to CPU
-		 */
-		mutex_lock(&hdev->send_cpu_message_lock);
-		mutex_unlock(&hdev->send_cpu_message_lock);
-	}
-
-	/*
-	 * Halt the engines and disable interrupts so we won't get any more
-	 * completions from H/W and we won't have any accesses from the
-	 * H/W to the host machine
-	 */
-	hdev->asic_funcs->halt_engines(hdev, hard_reset);
-
-	/* Go over all the queues, release all CS and their jobs */
-	hl_cs_rollback_all(hdev);
-
-	/* Release all pending user interrupts, each pending user interrupt
-	 * holds a reference to user context
-	 */
-	hl_release_pending_user_interrupts(hdev);
-}
-
 /*
  * hl_device_reset - reset the device
  *
@@ -941,8 +928,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		return 0;
 	}
 
-	hard_reset = (flags & HL_RESET_HARD) != 0;
-	from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
+	hard_reset = !!(flags & HL_RESET_HARD);
+	from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
 
 	if (!hard_reset && !hdev->supports_soft_reset) {
 		hard_instead_soft = true;
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index bac25a60650f6e..869c6057ae3164 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -240,11 +240,15 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	/* set fence to a non valid value */
 	pkt->fence = cpu_to_le32(UINT_MAX);
 
-	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
-		goto out;
-	}
+	/*
+	 * The CPU queue is a synchronous queue with an effective depth of
+	 * a single entry (although it is allocated with room for multiple
+	 * entries). We lock on it using 'send_cpu_message_lock' which
+	 * serializes accesses to the CPU queue.
+	 * Which means that we don't need to lock the access to the entire H/W
+	 * queues module when submitting a JOB to the CPU queue.
+	 */
+	hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr);
 
 	if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
 		expected_ack_val = queue->pi;
@@ -2235,6 +2239,10 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	dev_info(hdev->dev,
 		"Loading firmware to device, may take some time...\n");
 
+	/*
+	 * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values!
+	 * It will be updated from FW after hl_fw_dynamic_request_descriptor().
+	 */
 	dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
 
 	rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_RST_STATE,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 9782bb50931ab2..a0f3c580b58b9e 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2609,7 +2609,9 @@ void destroy_hdev(struct hl_device *hdev);
 int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
-				u32 cb_size, u64 cb_ptr);
+		u32 cb_size, u64 cb_ptr);
+void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
+		u32 ctl, u32 len, u64 ptr);
 int hl_hw_queue_schedule_cs(struct hl_cs *cs);
 u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
 void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index bcabfdbf1e011a..0afead229e9773 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -65,7 +65,7 @@ void hl_hw_queue_update_ci(struct hl_cs *cs)
 }
 
 /*
- * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
  *                                H/W queue.
  * @hdev: pointer to habanalabs device structure
  * @q: pointer to habanalabs queue structure
@@ -80,8 +80,8 @@ void hl_hw_queue_update_ci(struct hl_cs *cs)
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
-			struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
+void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
+		u32 ctl, u32 len, u64 ptr)
 {
 	struct hl_bd *bd;
 
@@ -222,8 +222,8 @@ static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
  * @cb_size: size of CB
  * @cb_ptr: pointer to CB location
  *
- * This function sends a single CB, that must NOT generate a completion entry
- *
+ * This function sends a single CB, that must NOT generate a completion entry.
+ * Sending CPU messages can be done instead via 'hl_hw_queue_submit_bd()'
  */
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 				u32 cb_size, u64 cb_ptr)
@@ -231,16 +231,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
 	int rc = 0;
 
-	/*
-	 * The CPU queue is a synchronous queue with an effective depth of
-	 * a single entry (although it is allocated with room for multiple
-	 * entries). Therefore, there is a different lock, called
-	 * send_cpu_message_lock, that serializes accesses to the CPU queue.
-	 * As a result, we don't need to lock the access to the entire H/W
-	 * queues module when submitting a JOB to the CPU queue
-	 */
-	if (q->queue_type != QUEUE_TYPE_CPU)
-		hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_lock(hdev);
 
 	if (hdev->disabled) {
 		rc = -EPERM;
@@ -258,11 +249,10 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 			goto out;
 	}
 
-	ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+	hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
 
 out:
-	if (q->queue_type != QUEUE_TYPE_CPU)
-		hdev->asic_funcs->hw_queues_unlock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
 
 	return rc;
 }
@@ -328,7 +318,7 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
 	cq->pi = hl_cq_inc_ptr(cq->pi);
 
 submit_bd:
-	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
@@ -407,7 +397,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	else
 		ptr = (u64) (uintptr_t) job->user_cb;
 
-	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 static int init_signal_cs(struct hl_device *hdev,

From 8f37b56367e46e7f6db4410eb4437f749908daad Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 13 Jul 2021 10:13:30 +0300
Subject: [PATCH 24/55] habanalabs: update firmware header to latest version

Add two new fields regarding interrupts communication between driver
and f/w.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/include/common/hl_boot_if.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index d762bb2f12042f..1f296784fa2bcb 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -345,7 +345,9 @@ struct cpu_dyn_regs {
 	__le32 gic_dma_core_irq_ctrl;
 	__le32 gic_host_halt_irq;
 	__le32 gic_host_ints_irq;
-	__le32 reserved1[24];		/* reserve for future use */
+	__le32 gic_host_soft_rst_irq;
+	__le32 gic_rot_qm_irq_ctrl;
+	__le32 reserved1[22];		/* reserve for future use */
 };
 
 /* TODO: remove the desc magic after the code is updated to use message */

From d1bf273815c6bfa199ea85e39d7e6e3015c40baa Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 13 Jul 2021 10:14:36 +0300
Subject: [PATCH 25/55] habanalabs/goya: add missing initialization

Need to initialize f/w Linux loaded indication to false to prevent
wrong communication with the f/w.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/goya/goya.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index ae987192836919..c8d58dd2c041ff 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2496,6 +2496,7 @@ static void goya_init_firmware_loader(struct hl_device *hdev)
 	struct fw_load_mgr *fw_loader = &hdev->fw_loader;
 
 	/* fill common fields */
+	fw_loader->linux_loaded = false;
 	fw_loader->boot_fit_img.image_name = GOYA_BOOT_FIT_FILE;
 	fw_loader->linux_img.image_name = GOYA_LINUX_FW_FILE;
 	fw_loader->cpu_timeout = GOYA_CPU_TIMEOUT_USEC;

From 4359832b8434f7651acdc2b231b95bc016cbd0eb Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 13 Jul 2021 08:11:54 +0300
Subject: [PATCH 26/55] habanalabs: revise prints on FD close

The driver quietly handles memory mappings that were not freed so no
need to print a warning about that when user closes the FD.

Accordingly, revise the text that is printed in case the device is
still in use after the user process closed the FD.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 4 ++--
 drivers/misc/habanalabs/common/memory.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index b09bdd1d462f5b..4f76c7a5160589 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -129,8 +129,8 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
 
 	if (!hl_hpriv_put(hpriv))
-		dev_warn(hdev->dev,
-			"Device is still in use because there are live CS and/or memory mappings\n");
+		dev_notice(hdev->dev,
+			"User process closed FD but device still in use\n");
 
 	hdev->last_open_session_duration_jif =
 		jiffies - hdev->last_successful_open_jif;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index a05d98db485762..efc460e9db5d2f 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -2126,7 +2126,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	 * another side effect error
 	 */
 	if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
-		dev_notice(hdev->dev,
+		dev_dbg(hdev->dev,
 			"user released device without removing its memory mappings\n");
 
 	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {

From d28ed2da824b75330bcb6b642cab82c06b2631c0 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Sun, 20 Jun 2021 11:00:26 +0300
Subject: [PATCH 27/55] habanalabs: get multiple fences under same cs_lock

To add proper support for wait-for-multi-CS, locking the CS lock
for each CS fence in the list is not efficient.

Instead, this patch adds support for taking the CS lock once to get all
required fences.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 108 +++++++++++-------
 drivers/misc/habanalabs/common/context.c      |  53 +++++++--
 drivers/misc/habanalabs/common/habanalabs.h   |   3 +
 3 files changed, 113 insertions(+), 51 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index a0846880400cea..b373a52a47a775 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -152,8 +152,17 @@ static void hl_fence_release(struct kref *kref)
 
 void hl_fence_put(struct hl_fence *fence)
 {
-	if (fence)
-		kref_put(&fence->refcount, hl_fence_release);
+	if (IS_ERR_OR_NULL(fence))
+		return;
+	kref_put(&fence->refcount, hl_fence_release);
+}
+
+void hl_fences_put(struct hl_fence **fence, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++, fence++)
+		hl_fence_put(*fence);
 }
 
 void hl_fence_get(struct hl_fence *fence)
@@ -1896,61 +1905,76 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	return rc;
 }
 
-static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
-				u64 timeout_us, u64 seq,
-				enum hl_cs_wait_status *status, s64 *timestamp)
+static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
+				enum hl_cs_wait_status *status, u64 timeout_us,
+				s64 *timestamp)
 {
-	struct hl_fence *fence;
-	unsigned long timeout;
-	int rc = 0;
+	struct hl_device *hdev = ctx->hdev;
 	long completion_rc;
+	int rc = 0;
 
-	if (timestamp)
-		*timestamp = 0;
-
-	if (timeout_us == MAX_SCHEDULE_TIMEOUT)
-		timeout = timeout_us;
-	else
-		timeout = usecs_to_jiffies(timeout_us);
-
-	hl_ctx_get(hdev, ctx);
-
-	fence = hl_ctx_get_fence(ctx, seq);
 	if (IS_ERR(fence)) {
 		rc = PTR_ERR(fence);
 		if (rc == -EINVAL)
 			dev_notice_ratelimited(hdev->dev,
 				"Can't wait on CS %llu because current CS is at seq %llu\n",
 				seq, ctx->cs_sequence);
-	} else if (fence) {
-		if (!timeout_us)
-			completion_rc = completion_done(&fence->completion);
-		else
-			completion_rc =
-				wait_for_completion_interruptible_timeout(
-					&fence->completion, timeout);
-
-		if (completion_rc > 0) {
-			*status = CS_WAIT_STATUS_COMPLETED;
-			if (timestamp)
-				*timestamp = ktime_to_ns(fence->timestamp);
-		} else {
-			*status = CS_WAIT_STATUS_BUSY;
-		}
-
-		if (fence->error == -ETIMEDOUT)
-			rc = -ETIMEDOUT;
-		else if (fence->error == -EIO)
-			rc = -EIO;
+		return rc;
+	}
 
-		hl_fence_put(fence);
-	} else {
+	if (!fence) {
 		dev_dbg(hdev->dev,
 			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
-			seq, ctx->cs_sequence);
+				seq, ctx->cs_sequence);
+
 		*status = CS_WAIT_STATUS_GONE;
+		return 0;
+	}
+
+	if (!timeout_us) {
+		completion_rc = completion_done(&fence->completion);
+	} else {
+		unsigned long timeout;
+
+		timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
+				timeout_us : usecs_to_jiffies(timeout_us);
+		completion_rc =
+			wait_for_completion_interruptible_timeout(
+				&fence->completion, timeout);
 	}
 
+	if (completion_rc > 0) {
+		*status = CS_WAIT_STATUS_COMPLETED;
+		if (timestamp)
+			*timestamp = ktime_to_ns(fence->timestamp);
+	} else {
+		*status = CS_WAIT_STATUS_BUSY;
+	}
+
+	if (fence->error == -ETIMEDOUT)
+		rc = -ETIMEDOUT;
+	else if (fence->error == -EIO)
+		rc = -EIO;
+
+	return rc;
+}
+
+static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
+				u64 timeout_us, u64 seq,
+				enum hl_cs_wait_status *status, s64 *timestamp)
+{
+	struct hl_fence *fence;
+	int rc = 0;
+
+	if (timestamp)
+		*timestamp = 0;
+
+	hl_ctx_get(hdev, ctx);
+
+	fence = hl_ctx_get_fence(ctx, seq);
+
+	rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
+	hl_fence_put(fence);
 	hl_ctx_put(ctx);
 
 	return rc;
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 19b6b045219ed3..1001c65c5e7a39 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -229,31 +229,66 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
-struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+/* this function shall be called with cs_lock locked */
+static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq)
 {
 	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
 	struct hl_fence *fence;
 
-	spin_lock(&ctx->cs_lock);
-
-	if (seq >= ctx->cs_sequence) {
-		spin_unlock(&ctx->cs_lock);
+	if (seq >= ctx->cs_sequence)
 		return ERR_PTR(-EINVAL);
-	}
 
-	if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
-		spin_unlock(&ctx->cs_lock);
+	if (seq + asic_prop->max_pending_cs < ctx->cs_sequence)
 		return NULL;
-	}
 
 	fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
 	hl_fence_get(fence);
+	return fence;
+}
+
+struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+{
+	struct hl_fence *fence;
+
+	spin_lock(&ctx->cs_lock);
+
+	fence = hl_ctx_get_fence_locked(ctx, seq);
 
 	spin_unlock(&ctx->cs_lock);
 
 	return fence;
 }
 
+int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
+				struct hl_fence **fence, u32 arr_len)
+{
+	struct hl_fence **fence_arr_base = fence;
+	int i, rc = 0;
+
+	spin_lock(&ctx->cs_lock);
+
+	for (i = 0; i < arr_len; i++, fence++) {
+		u64 seq = seq_arr[i];
+
+		*fence = hl_ctx_get_fence_locked(ctx, seq);
+
+		if (IS_ERR(*fence)) {
+			dev_err(ctx->hdev->dev,
+				"Failed to get fence for CS with seq 0x%llx\n",
+					seq);
+			rc = PTR_ERR(*fence);
+			break;
+		}
+	}
+
+	spin_unlock(&ctx->cs_lock);
+
+	if (rc)
+		hl_fences_put(fence_arr_base, i);
+
+	return rc;
+}
+
 /*
  * hl_ctx_mgr_init - initialize the context manager
  *
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a0f3c580b58b9e..26f75d070f726c 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2645,6 +2645,8 @@ void hl_ctx_do_release(struct kref *ref);
 void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
 struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
+int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
+				struct hl_fence **fence, u32 arr_len);
 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
 
@@ -2692,6 +2694,7 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 void hl_sob_reset_error(struct kref *ref);
 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
 void hl_fence_put(struct hl_fence *fence);
+void hl_fences_put(struct hl_fence **fence, int len);
 void hl_fence_get(struct hl_fence *fence);
 void cs_get(struct hl_cs *cs);
 bool cs_needs_completion(struct hl_cs *cs);

From 5bf209df737fb0b7d809a04bae610cbc3f7311f7 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Mon, 14 Jun 2021 22:18:41 +0300
Subject: [PATCH 28/55] habanalabs: add wait-for-multi-CS uAPI

When user sends multiple CSs, waiting for each CS is not efficient
as it involves many user-kernel context switches.

In order to address this issue we add support to "wait on multiple CSs"
using a new uAPI which can wait on maximum of 32 CSs. The new uAPI is
defined using a new flag - WAIT_FOR_MULTI_CS - in the wait_for_cs IOCTL.

The input parameters for this uAPI will be:
@seq: user pointer to an array of up to 32 CS's sequence numbers.
@seq_array_len: length of sequence array.
@timeout_us: timeout for waiting for any CS.

The output parameters for this API will be:
@status: multi CS ioctl completion status (dedicated status was added as
         well).
@flags: bitmap of output flags of the CS.
@cs_completion_map: bitmap for multi CS, if CS sequence that was placed
                    in index N in input seq array has completed- the N-th
		    bit in cs_completion_map will be 1, otherwise it will
		    be 0.
@timestamp_nsec: timestamp of the first completed CS

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 443 +++++++++++++++++-
 drivers/misc/habanalabs/common/context.c      |  22 +-
 drivers/misc/habanalabs/common/device.c       |   4 +
 drivers/misc/habanalabs/common/habanalabs.h   |  63 ++-
 drivers/misc/habanalabs/common/hw_queue.c     |   4 +
 drivers/misc/habanalabs/gaudi/gaudi.c         |   1 +
 drivers/misc/habanalabs/goya/goya.c           |   1 +
 include/uapi/misc/habanalabs.h                |  23 +-
 8 files changed, 556 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index b373a52a47a775..ba0c854b2ed46a 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -482,6 +482,91 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
 	spin_unlock(&hdev->cs_mirror_lock);
 }
 
+/*
+ * force_complete_multi_cs - complete all contexts that wait on multi-CS
+ *
+ * @hdev: pointer to habanalabs device structure
+ */
+static void force_complete_multi_cs(struct hl_device *hdev)
+{
+	int i;
+
+	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+		struct multi_cs_completion *mcs_compl;
+
+		mcs_compl = &hdev->multi_cs_completion[i];
+
+		spin_lock(&mcs_compl->lock);
+
+		if (!mcs_compl->used) {
+			spin_unlock(&mcs_compl->lock);
+			continue;
+		}
+
+		/* when calling force complete no context should be waiting on
+		 * multi-CS.
+		 * Reaching here means the entry is still marked as used, so
+		 * print an error message and complete it to free its waiter
+		 */
+		dev_err(hdev->dev,
+				"multi-CS completion context %d still waiting when calling force completion\n",
+				i);
+		complete_all(&mcs_compl->completion);
+		spin_unlock(&mcs_compl->lock);
+	}
+}
+
+/*
+ * complete_multi_cs - complete all waiting entities on multi-CS
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @cs: CS structure
+ *
+ * The function signals every waiting entity whose stream map shares at
+ * least one stream with the completed CS.
+ * For example:
+ * - a completed CS worked on streams 0 and 1, multi CS completion
+ *   is actively waiting on stream 3. don't send signal as no common stream
+ * - a completed CS worked on streams 0 and 1, multi CS completion
+ *   is actively waiting on streams 1 and 3. send signal as stream 1 is common
+ */
+static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
+{
+	struct hl_fence *fence = cs->fence;
+	int i;
+
+	/* for staged CSs, check for completion only for the first CS */
+	if (cs->staged_cs && !cs->staged_first)
+		return;
+
+	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+		struct multi_cs_completion *mcs_compl;
+
+		mcs_compl = &hdev->multi_cs_completion[i];
+		if (!mcs_compl->used)
+			continue;
+
+		spin_lock(&mcs_compl->lock);
+
+		/*
+		 * complete if:
+		 * 1. still waiting for completion (re-checked under the lock)
+		 * 2. the completed CS has at least one overlapping stream
+		 *    with the streams in the completion
+		 */
+		if (mcs_compl->used &&
+				(fence->stream_map & mcs_compl->stream_map)) {
+			/* extract the timestamp only of first completed CS */
+			if (!mcs_compl->timestamp)
+				mcs_compl->timestamp =
+						ktime_to_ns(fence->timestamp);
+			complete_all(&mcs_compl->completion);
+		}
+
+		spin_unlock(&mcs_compl->lock);
+	}
+}
+
 static void cs_do_release(struct kref *ref)
 {
 	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
@@ -575,6 +660,7 @@ static void cs_do_release(struct kref *ref)
 	if (cs->timestamp)
 		cs->fence->timestamp = ktime_get();
 	complete_all(&cs->fence->completion);
+	complete_multi_cs(hdev, cs);
 	hl_fence_put(cs->fence);
 
 	kfree(cs->jobs_in_queue_cnt);
@@ -804,6 +890,8 @@ void hl_cs_rollback_all(struct hl_device *hdev)
 		cs_rollback(hdev, cs);
 		cs_put(cs);
 	}
+
+	force_complete_multi_cs(hdev);
 }
 
 void hl_pending_cb_list_flush(struct hl_ctx *ctx)
@@ -1134,6 +1222,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs *cs;
 	struct hl_cb *cb;
 	u64 user_sequence;
+	u8 stream_map = 0;
 	int rc, i;
 
 	cntr = &hdev->aggregated_cs_counters;
@@ -1192,9 +1281,18 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 		}
 
-		if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+		if (queue_type == QUEUE_TYPE_EXT ||
+						queue_type == QUEUE_TYPE_HW) {
 			int_queues_only = false;
 
+			/*
+			 * store which stream are being used for external/HW
+			 * queues of this CS
+			 */
+			if (hdev->supports_wait_for_multi_cs)
+				stream_map |= BIT((chunk->queue_index % 4));
+		}
+
 		job = hl_cs_allocate_job(hdev, queue_type,
 						is_kernel_allocated_cb);
 		if (!job) {
@@ -1255,6 +1353,13 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		goto free_cs_object;
 	}
 
+	/*
+	 * store the (external/HW queues) streams used by the CS in the
+	 * fence object for multi-CS completion
+	 */
+	if (hdev->supports_wait_for_multi_cs)
+		cs->fence->stream_map = stream_map;
+
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
 		if (rc != -EAGAIN)
@@ -1959,6 +2064,95 @@ static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence
 	return rc;
 }
 
+/*
+ * hl_cs_poll_fences - iterate CS fences to check for CS completion
+ *
+ * @mcs_data: multi-CS internal data
+ *
+ * @return 0 on success, otherwise non 0 error code
+ *
+ * The function iterates over all CS sequences in the list and sets a bit in
+ * completion_bitmap for each CS that is completed or already gone.
+ * While iterating, it accumulates the stream map of the existing fences, to
+ * be used later by the waiting function (a gone CS has no fence to read).
+ * This function shall be called after taking a context reference.
+ */
+static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
+{
+	struct hl_fence **fence_ptr = mcs_data->fence_arr;
+	struct hl_device *hdev = mcs_data->ctx->hdev;
+	int i, rc, arr_len = mcs_data->arr_len;
+	u64 *seq_arr = mcs_data->seq_arr;
+	ktime_t max_ktime, first_cs_time;
+	enum hl_cs_wait_status status;
+
+	memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr));
+
+	/* get all fences under the same lock */
+	rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
+	if (rc)
+		return rc;
+
+	/*
+	 * set to maximum time to verify timestamp is valid: if at the end
+	 * this value is maintained- no timestamp was updated
+	 */
+	max_ktime = ktime_set(KTIME_SEC_MAX, 0);
+	first_cs_time = max_ktime;
+
+	for (i = 0; i < arr_len; i++, fence_ptr++) {
+		struct hl_fence *fence = *fence_ptr;
+
+		/*
+		 * poll only (timeout 0), so the call never sleeps. a NULL
+		 * fence is valid here: it is reported as CS_WAIT_STATUS_GONE
+		 */
+		rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence,
+						&status, 0, NULL);
+		if (rc) {
+			dev_err(hdev->dev,
+				"wait_for_fence error :%d for CS seq %llu\n",
+								rc, seq_arr[i]);
+			break;
+		}
+
+		if (fence)
+			mcs_data->stream_map |= fence->stream_map;
+
+		if (status == CS_WAIT_STATUS_BUSY)
+			continue;
+
+		mcs_data->completion_bitmap |= BIT(i);
+
+		/*
+		 * best effort to extract timestamp. few notes:
+		 * - if even single fence is gone we cannot extract timestamp
+		 *   (as fence not exist anymore)
+		 * - for all completed CSs we take the earliest timestamp.
+		 *   for this we have to validate that:
+		 *       1. given timestamp was indeed set
+		 *       2. the timestamp is earliest of all timestamps so far
+		 */
+		if (status == CS_WAIT_STATUS_GONE) {
+			mcs_data->update_ts = false;
+			mcs_data->gone_cs = true;
+		} else if (mcs_data->update_ts &&
+			(ktime_compare(fence->timestamp,
+						ktime_set(0, 0)) > 0) &&
+			(ktime_compare(fence->timestamp, first_cs_time) < 0)) {
+			first_cs_time = fence->timestamp;
+		}
+	}
+
+	hl_fences_put(mcs_data->fence_arr, arr_len);
+
+	if (mcs_data->update_ts &&
+			(ktime_compare(first_cs_time, max_ktime) != 0))
+		mcs_data->timestamp = ktime_to_ns(first_cs_time);
+
+	return rc;
+}
+
 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 				u64 timeout_us, u64 seq,
 				enum hl_cs_wait_status *status, s64 *timestamp)
@@ -1980,6 +2174,251 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	return rc;
 }
 
+/*
+ * hl_wait_multi_cs_completion_init - init completion structure
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @stream_map: stream map, set bit indicates stream to wait on
+ *
+ * @return valid completion struct pointer on success, otherwise error pointer
+ *
+ * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver.
+ * the function gets the first available completion (by marking it "used")
+ * and initialize its values.
+ */
+static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
+							struct hl_device *hdev,
+							u8 stream_map)
+{
+	struct multi_cs_completion *mcs_compl;
+	int i;
+
+	/* find free multi_cs completion structure */
+	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+		mcs_compl = &hdev->multi_cs_completion[i];
+		spin_lock(&mcs_compl->lock);
+		if (!mcs_compl->used) {
+			mcs_compl->used = 1;
+			mcs_compl->timestamp = 0;
+			mcs_compl->stream_map = stream_map;
+			reinit_completion(&mcs_compl->completion);
+			spin_unlock(&mcs_compl->lock);
+			break;
+		}
+		spin_unlock(&mcs_compl->lock);
+	}
+
+	if (i == MULTI_CS_MAX_USER_CTX) {
+		dev_err(hdev->dev,
+				"no available multi-CS completion structure\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	return mcs_compl;
+}
+
+/*
+ * hl_wait_multi_cs_completion_fini - return completion structure and set as
+ *                                    unused
+ *
+ * @mcs_compl: pointer to the completion structure
+ */
+static void hl_wait_multi_cs_completion_fini(
+					struct multi_cs_completion *mcs_compl)
+{
+	/*
+	 * free completion structure, do it under lock to be in-sync with the
+	 * thread that signals completion
+	 */
+	spin_lock(&mcs_compl->lock);
+	mcs_compl->used = 0;
+	spin_unlock(&mcs_compl->lock);
+}
+
+/*
+ * hl_wait_multi_cs_completion - wait for first CS to complete
+ *
+ * @mcs_data: multi-CS internal data
+ *
+ * @return 0 on success, otherwise non 0 error code
+ */
+static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
+{
+	struct hl_device *hdev = mcs_data->ctx->hdev;
+	struct multi_cs_completion *mcs_compl;
+	long completion_rc;
+
+	mcs_compl = hl_wait_multi_cs_completion_init(hdev,
+							mcs_data->stream_map);
+	if (IS_ERR(mcs_compl))
+		return PTR_ERR(mcs_compl);
+
+	completion_rc = wait_for_completion_interruptible_timeout(
+					&mcs_compl->completion,
+					usecs_to_jiffies(mcs_data->timeout_us));
+
+	/* update timestamp */
+	if (completion_rc > 0)
+		mcs_data->timestamp = mcs_compl->timestamp;
+
+	hl_wait_multi_cs_completion_fini(mcs_compl);
+
+	mcs_data->wait_status = completion_rc;
+
+	return 0;
+}
+
+/*
+ * hl_multi_cs_completion_init - init array of multi-CS completion structures
+ *
+ * @hdev: pointer to habanalabs device structure
+ */
+void hl_multi_cs_completion_init(struct hl_device *hdev)
+{
+	struct multi_cs_completion *mcs_cmpl;
+	int i;
+
+	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+		mcs_cmpl = &hdev->multi_cs_completion[i];
+		mcs_cmpl->used = 0;
+		spin_lock_init(&mcs_cmpl->lock);
+		init_completion(&mcs_cmpl->completion);
+	}
+}
+
+/*
+ * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
+ *
+ * @hpriv: pointer to the private data of the fd
+ * @data: pointer to multi-CS wait ioctl in/out args
+ * @return 0 on success (status is in the out args), otherwise error code
+ */
+static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct multi_cs_data mcs_data = {0};
+	union hl_wait_cs_args *args = data;
+	struct hl_ctx *ctx = hpriv->ctx;
+	struct hl_fence **fence_arr;
+	void __user *seq_arr;
+	u32 size_to_copy;
+	u64 *cs_seq_arr;
+	u8 seq_arr_len;
+	int rc;
+
+	if (!hdev->supports_wait_for_multi_cs) {
+		dev_err(hdev->dev, "Wait for multi CS is not supported\n");
+		return -EPERM;
+	}
+
+	seq_arr_len = args->in.seq_arr_len;
+
+	if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
+		dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
+				HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
+		return -EINVAL;
+	}
+
+	/* allocate memory for sequence array */
+	cs_seq_arr =
+		kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
+	if (!cs_seq_arr)
+		return -ENOMEM;
+
+	/* copy CS sequence array from user */
+	seq_arr = (void __user *) (uintptr_t) args->in.seq;
+	size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
+	if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
+		dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
+		rc = -EFAULT;
+		goto free_seq_arr;
+	}
+
+	/* allocate array for the fences */
+	fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL);
+	if (!fence_arr) {
+		rc = -ENOMEM;
+		goto free_seq_arr;
+	}
+
+	/* initialize the multi-CS internal data */
+	mcs_data.ctx = ctx;
+	mcs_data.seq_arr = cs_seq_arr;
+	mcs_data.fence_arr = fence_arr;
+	mcs_data.arr_len = seq_arr_len;
+
+	hl_ctx_get(hdev, ctx);
+
+	/* poll all CS fences, extract timestamp */
+	mcs_data.update_ts = true;
+	rc = hl_cs_poll_fences(&mcs_data);
+	/*
+	 * skip wait for CS completion when one of the below is true:
+	 * - an error on the poll function
+	 * - one or more CS in the list completed
+	 * - the user called ioctl with timeout 0
+	 */
+	if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
+		goto put_ctx;
+
+	/* wait (with timeout) for the first CS to be completed */
+	mcs_data.timeout_us = args->in.timeout_us;
+	rc = hl_wait_multi_cs_completion(&mcs_data);
+	if (rc)
+		goto put_ctx;
+
+	if (mcs_data.wait_status > 0) {
+		/*
+		 * poll fences once again to update the CS map.
+		 * no timestamp should be updated this time.
+		 */
+		mcs_data.update_ts = false;
+		rc = hl_cs_poll_fences(&mcs_data);
+
+		/*
+		 * if hl_wait_multi_cs_completion returned before timeout (i.e.
+		 * it got a completion) we expect to see at least one CS
+		 * completed after the poll function.
+		 */
+		if (!mcs_data.completion_bitmap) {
+			dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n");
+			rc = -EFAULT;
+		}
+	}
+
+put_ctx:
+	hl_ctx_put(ctx);
+	kfree(fence_arr);
+
+free_seq_arr:
+	kfree(cs_seq_arr);
+
+	/* update output args */
+	memset(args, 0, sizeof(*args));
+	if (rc)
+		return rc;
+
+	if (mcs_data.completion_bitmap) {
+		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+		args->out.cs_completion_map = mcs_data.completion_bitmap;
+
+		/* if timestamp not 0- it's valid */
+		if (mcs_data.timestamp) {
+			args->out.timestamp_nsec = mcs_data.timestamp;
+			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
+		}
+
+		/* update if some CS was gone (recorded while polling) */
+		if (mcs_data.gone_cs)
+			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
+	} else if (mcs_data.wait_status == -ERESTARTSYS) {
+		args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
+	} else {
+		args->out.status = HL_WAIT_CS_STATUS_BUSY;
+	}
+
+	return 0;
+}
+
 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_device *hdev = hpriv->hdev;
@@ -2221,6 +2660,8 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
 		rc = hl_interrupt_wait_ioctl(hpriv, data);
+	else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
+		rc = hl_multi_cs_wait_ioctl(hpriv, data);
 	else
 		rc = hl_cs_wait_ioctl(hpriv, data);
 
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 1001c65c5e7a39..4d922e4d03935e 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -229,7 +229,17 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
-/* this function shall be called with cs_lock locked */
+/*
+ * hl_ctx_get_fence_locked - get CS fence under CS lock
+ *
+ * @ctx: pointer to the context structure.
+ * @seq: CS sequences number
+ *
+ * @return valid fence pointer on success, NULL if fence is gone, otherwise
+ *         error pointer.
+ *
+ * NOTE: this function shall be called with cs_lock locked
+ */
 static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq)
 {
 	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
@@ -259,6 +269,16 @@ struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 	return fence;
 }
 
+/*
+ * hl_ctx_get_fences - get multiple CS fences under the same CS lock
+ *
+ * @ctx: pointer to the context structure.
+ * @seq_arr: array of CS sequences to wait for
+ * @fence: fence array to store the CS fences
+ * @arr_len: length of seq_arr and fence_arr
+ *
+ * @return 0 on success, otherwise non 0 error code
+ */
 int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
 				struct hl_fence **fence, u32 arr_len)
 {
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 4f76c7a5160589..3751c915f731d0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1297,6 +1297,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	if (rc)
 		goto user_interrupts_fini;
 
+
+	/* initialize completion structure for multi CS wait */
+	hl_multi_cs_completion_init(hdev);
+
 	/*
 	 * Initialize the H/W queues. Must be done before hw_init, because
 	 * there the addresses of the kernel queue are being written to the
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 26f75d070f726c..6d51544346379d 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -585,7 +585,8 @@ struct asic_fixed_properties {
  * @cs_sequence: sequence of the corresponding command submission
  * @error: mark this fence with error
  * @timestamp: timestamp upon completion
- *
+ * @stream_map: streams bitmap to represent all streams that multi cs is
+ *              waiting on
  */
 struct hl_fence {
 	struct completion	completion;
@@ -593,6 +594,7 @@ struct hl_fence {
 	u64			cs_sequence;
 	int			error;
 	ktime_t			timestamp;
+	u8			stream_map;
 };
 
 /**
@@ -2234,6 +2236,58 @@ struct hl_mmu_funcs {
 			u64 virt_addr, struct hl_mmu_hop_info *hops);
 };
 
+/**
+ * number of user contexts allowed to call wait_for_multi_cs ioctl in
+ * parallel
+ */
+#define MULTI_CS_MAX_USER_CTX	2
+
+/**
+ * struct multi_cs_completion - multi CS wait completion.
+ * @completion: completion of any of the CS in the list
+ * @lock: spinlock for the completion structure
+ * @timestamp: timestamp for the multi-CS completion
+ * @used: 1 if in use, otherwise 0
+ * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
+ *              is waiting
+ */
+struct multi_cs_completion {
+	struct completion	completion;
+	spinlock_t		lock;
+	s64			timestamp;
+	u8			used;
+	u8			stream_map;
+};
+
+/**
+ * struct multi_cs_data - internal data for multi CS call
+ * @ctx: pointer to the context structure
+ * @fence_arr: array of fences of all CSs
+ * @seq_arr: array of CS sequence numbers
+ * @timeout_us: timeout in usec for waiting for CS to complete
+ * @timestamp: timestamp of first completed CS
+ * @wait_status: wait for CS status
+ * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
+ * @arr_len: fence_arr and seq_arr array length
+ * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
+ *              is waiting
+ * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
+ * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
+ */
+struct multi_cs_data {
+	struct hl_ctx	*ctx;
+	struct hl_fence	**fence_arr;
+	u64		*seq_arr;
+	s64		timeout_us;
+	s64		timestamp;
+	long		wait_status;
+	u32		completion_bitmap;
+	u8		arr_len;
+	u8		stream_map;
+	u8		gone_cs;
+	u8		update_ts;
+};
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2300,6 +2354,7 @@ struct hl_mmu_funcs {
  * @fw_loader: FW loader manager.
  * @pci_mem_region: array of memory regions in the PCI
  * @state_dump_specs: constants and dictionaries needed to dump system state.
+ * @multi_cs_completion: array of multi-CS completion.
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2376,6 +2431,7 @@ struct hl_mmu_funcs {
  *                        halted. We can't halt it again because the COMMS
  *                        protocol will throw an error. Relevant only for
  *                        cases where Linux was not loaded to device CPU
+ * @supports_wait_for_multi_cs: true if wait for multi CS is supported
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2446,6 +2502,9 @@ struct hl_device {
 
 	struct hl_state_dump_specs	state_dump_specs;
 
+	struct multi_cs_completion	multi_cs_completion[
+							MULTI_CS_MAX_USER_CTX];
+
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
@@ -2495,6 +2554,7 @@ struct hl_device {
 	u8				curr_reset_cause;
 	u8				skip_reset_on_timeout;
 	u8				device_cpu_is_halted;
+	u8				supports_wait_for_multi_cs;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -2701,6 +2761,7 @@ bool cs_needs_completion(struct hl_cs *cs);
 bool cs_needs_timeout(struct hl_cs *cs);
 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
+void hl_multi_cs_completion_init(struct hl_device *hdev);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 0afead229e9773..f05a0dbd099077 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -603,6 +603,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 		}
 
 		list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);
+
+		/* update stream map of the first CS */
+		if (hdev->supports_wait_for_multi_cs)
+			staged_cs->fence->stream_map |= cs->fence->stream_map;
 	}
 
 	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 561485dcddaf78..a4b33b0b17d4a6 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1814,6 +1814,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
 	hdev->supports_sync_stream = true;
 	hdev->supports_coresight = true;
 	hdev->supports_staged_submission = true;
+	hdev->supports_wait_for_multi_cs = true;
 
 	gaudi_set_pci_memory_regions(hdev);
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index c8d58dd2c041ff..f6251d8663b2b5 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -958,6 +958,7 @@ static int goya_sw_init(struct hl_device *hdev)
 	hdev->supports_coresight = true;
 	hdev->supports_soft_reset = true;
 	hdev->allow_external_soft_reset = true;
+	hdev->supports_wait_for_multi_cs = false;
 
 	goya_set_pci_memory_regions(hdev);
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 18765eb75b6590..49c737c4a2f647 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -735,11 +735,18 @@ union hl_cs_args {
 
 #define HL_WAIT_CS_FLAGS_INTERRUPT	0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
+#define HL_WAIT_CS_FLAGS_MULTI_CS	0x4
+
+#define HL_WAIT_MULTI_CS_LIST_MAX_LEN	32
 
 struct hl_wait_cs_in {
 	union {
 		struct {
-			/* Command submission sequence number */
+			/*
+			 * In case of wait_cs holds the CS sequence number.
+			 * In case of wait for multi CS hold a user pointer to
+			 * an array of CS sequence numbers
+			 */
 			__u64 seq;
 			/* Absolute timeout to wait for command submission
 			 * in microseconds
@@ -767,12 +774,17 @@ struct hl_wait_cs_in {
 
 	/* Context ID - Currently not in use */
 	__u32 ctx_id;
+
 	/* HL_WAIT_CS_FLAGS_*
 	 * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
 	 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order
 	 * not to specify an interrupt id ,set mask to all 1s.
 	 */
 	__u32 flags;
+
+	/* Multi CS API info- valid entries in multi-CS array */
+	__u8 seq_arr_len;
+	__u8 pad[7];
 };
 
 #define HL_WAIT_CS_STATUS_COMPLETED	0
@@ -789,8 +801,15 @@ struct hl_wait_cs_out {
 	__u32 status;
 	/* HL_WAIT_CS_STATUS_FLAG* */
 	__u32 flags;
-	/* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */
+	/*
+	 * valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set
+	 * for wait_cs: timestamp of CS completion
+	 * for wait_multi_cs: timestamp of FIRST CS completion
+	 */
 	__s64 timestamp_nsec;
+	/* multi CS completion bitmap */
+	__u32 cs_completion_map;
+	__u32 pad;
 };
 
 union hl_wait_cs_args {

From b8e8996e00815b8e79942e5df03a1cca15e39ae6 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Sun, 20 Jun 2021 11:08:19 +0300
Subject: [PATCH 29/55] habanalabs: signal/wait change sync object reset flow

Currently the SOB reset is done in the fence release function, which
happens only at the CS wraparound during the CS allocation time.

In order to support the new encapsulated signals reservation feature,
we need to move the SOB reset to an earlier phase, because the SOB
could reach its max value very fast when using signal reservation.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 104 ++++++++++++------
 drivers/misc/habanalabs/common/habanalabs.h   |   2 +-
 drivers/misc/habanalabs/common/hw_queue.c     |  56 +++++++---
 drivers/misc/habanalabs/gaudi/gaudi.c         |  34 ++++--
 drivers/misc/habanalabs/goya/goya.c           |   4 +-
 5 files changed, 141 insertions(+), 59 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ba0c854b2ed46a..458cdf2ddab5f2 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -52,6 +52,24 @@ void hl_sob_reset_error(struct kref *ref)
 		hw_sob->q_idx, hw_sob->sob_id);
 }
 
+static void hw_sob_put(struct hl_hw_sob *hw_sob)
+{
+	if (hw_sob)
+		kref_put(&hw_sob->kref, hl_sob_reset);
+}
+
+static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
+{
+	if (hw_sob)
+		kref_put(&hw_sob->kref, hl_sob_reset_error);
+}
+
+static void hw_sob_get(struct hl_hw_sob *hw_sob)
+{
+	if (hw_sob)
+		kref_get(&hw_sob->kref);
+}
+
 /**
  * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
  * @sob_base: sob base id
@@ -122,31 +140,7 @@ static void hl_fence_release(struct kref *kref)
 		container_of(kref, struct hl_fence, refcount);
 	struct hl_cs_compl *hl_cs_cmpl =
 		container_of(fence, struct hl_cs_compl, base_fence);
-	struct hl_device *hdev = hl_cs_cmpl->hdev;
 
-	/* EBUSY means the CS was never submitted and hence we don't have
-	 * an attached hw_sob object that we should handle here
-	 */
-	if (fence->error == -EBUSY)
-		goto free;
-
-	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
-		(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
-		(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
-
-		dev_dbg(hdev->dev,
-			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
-			hl_cs_cmpl->cs_seq,
-			hl_cs_cmpl->type,
-			hl_cs_cmpl->hw_sob->sob_id,
-			hl_cs_cmpl->sob_val);
-
-		queue_work(hdev->sob_reset_wq, &hl_cs_cmpl->sob_reset_work);
-
-		return;
-	}
-
-free:
 	kfree(hl_cs_cmpl);
 }
 
@@ -567,11 +561,46 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 	}
 }
 
+static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
+					struct hl_cs *cs,
+					struct hl_cs_compl *hl_cs_cmpl)
+{
+	/* Skip this handler if the cs wasn't submitted, to avoid putting
+	 * the hw_sob twice, since this case already handled at this point,
+	 * also skip if the hw_sob pointer wasn't set.
+	 */
+	if (!hl_cs_cmpl->hw_sob || !cs->submitted)
+		return;
+
+	spin_lock(&hl_cs_cmpl->lock);
+
+	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
+			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
+			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
+		dev_dbg(hdev->dev,
+				"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
+				hl_cs_cmpl->cs_seq,
+				hl_cs_cmpl->type,
+				hl_cs_cmpl->hw_sob->sob_id,
+				hl_cs_cmpl->sob_val);
+
+		hw_sob_put(hl_cs_cmpl->hw_sob);
+
+		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
+			hdev->asic_funcs->reset_sob_group(hdev,
+					hl_cs_cmpl->sob_group);
+	}
+
+	spin_unlock(&hl_cs_cmpl->lock);
+}
+
 static void cs_do_release(struct kref *ref)
 {
 	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
 	struct hl_device *hdev = cs->ctx->hdev;
 	struct hl_cs_job *job, *tmp;
+	struct hl_cs_compl *hl_cs_cmpl =
+			container_of(cs->fence, struct hl_cs_compl, base_fence);
 
 	cs->completed = true;
 
@@ -587,8 +616,9 @@ static void cs_do_release(struct kref *ref)
 		complete_job(hdev, job);
 
 	if (!cs->submitted) {
-		/* In case the wait for signal CS was submitted, the put occurs
-		 * in init_signal_wait_cs() or collective_wait_init_cs()
+		/*
+		 * In case the wait for signal CS was submitted, the fence put
+		 * occurs in init_signal_wait_cs() or collective_wait_init_cs()
 		 * right before hanging on the PQ.
 		 */
 		if (cs->type == CS_TYPE_WAIT ||
@@ -661,6 +691,9 @@ static void cs_do_release(struct kref *ref)
 		cs->fence->timestamp = ktime_get();
 	complete_all(&cs->fence->completion);
 	complete_multi_cs(hdev, cs);
+
+	cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
+
 	hl_fence_put(cs->fence);
 
 	kfree(cs->jobs_in_queue_cnt);
@@ -1630,7 +1663,7 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 
 	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
 
-	kref_get(&sob->kref);
+	hw_sob_get(sob);
 
 	/* check for wraparound */
 	if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
@@ -1640,7 +1673,7 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 		 * just incremented the refcount right before calling this
 		 * function.
 		 */
-		kref_put(&sob->kref, hl_sob_reset_error);
+		hw_sob_put_err(sob);
 
 		/*
 		 * check the other sob value, if it still in use then fail
@@ -1797,6 +1830,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	struct hl_fence *sig_fence = NULL;
 	struct hl_ctx *ctx = hpriv->ctx;
 	enum hl_queue_type q_type;
+	bool is_wait_cs = false;
 	struct hl_cs *cs;
 	u64 signal_seq;
 	int rc;
@@ -1849,6 +1883,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	}
 
 	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
+		is_wait_cs = true;
+
 		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx);
 		if (rc)
 			goto free_cs_chunk_array;
@@ -1894,9 +1930,9 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
 	if (rc) {
-		if (cs_type == CS_TYPE_WAIT ||
-			cs_type == CS_TYPE_COLLECTIVE_WAIT)
+		if (is_wait_cs)
 			hl_fence_put(sig_fence);
+
 		goto free_cs_chunk_array;
 	}
 
@@ -1928,7 +1964,13 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
-		if (rc != -EAGAIN)
+		/* In case wait cs failed here, it means the signal cs
+		 * already completed. we want to free all it's related objects
+		 * but we don't want to fail the ioctl.
+		 */
+		if (is_wait_cs)
+			rc = 0;
+		else if (rc != -EAGAIN)
 			dev_err(hdev->dev,
 				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
 				ctx->asid, cs->sequence, rc);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6d51544346379d..bf327cb7ddd655 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1244,7 +1244,7 @@ struct hl_asic_funcs {
 	void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group);
 	void (*set_dma_mask_from_fw)(struct hl_device *hdev);
 	u64 (*get_device_time)(struct hl_device *hdev);
-	void (*collective_wait_init_cs)(struct hl_cs *cs);
+	int (*collective_wait_init_cs)(struct hl_cs *cs);
 	int (*collective_wait_create_jobs)(struct hl_device *hdev,
 			struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
 			u32 collective_engine_id);
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index f05a0dbd099077..2494bd6e9358b7 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -416,8 +416,9 @@ static int init_signal_cs(struct hl_device *hdev,
 	cs_cmpl->sob_val = prop->next_sob_val;
 
 	dev_dbg(hdev->dev,
-		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
-		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);
+		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d, seq: %llu\n",
+		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
+		cs_cmpl->cs_seq);
 
 	/* we set an EB since we must make sure all oeprations are done
 	 * when sending the signal
@@ -430,7 +431,7 @@ static int init_signal_cs(struct hl_device *hdev,
 	return rc;
 }
 
-static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
+static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
 		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
 {
 	struct hl_cs_compl *signal_cs_cmpl;
@@ -449,10 +450,33 @@ static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
 	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
 	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
 
+	/* check again if the signal cs already completed.
+	 * if yes then don't send any wait cs since the hw_sob
+	 * could be in reset already. if signal is not completed
+	 * then get refcount to hw_sob to prevent resetting the sob
+	 * while wait cs is not submitted.
+	 * note that this check is protected by two locks,
+	 * hw queue lock and completion object lock,
+	 * and the same completion object lock also protects
+	 * the hw_sob reset handler function.
+	 * The hw_queue lock prevent out of sync of hw_sob
+	 * refcount value, changed by signal/wait flows.
+	 */
+	spin_lock(&signal_cs_cmpl->lock);
+
+	if (completion_done(&cs->signal_fence->completion)) {
+		spin_unlock(&signal_cs_cmpl->lock);
+		return -EINVAL;
+	}
+
+	kref_get(&cs_cmpl->hw_sob->kref);
+
+	spin_unlock(&signal_cs_cmpl->lock);
+
 	dev_dbg(hdev->dev,
-		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
+		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n",
 		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
-		prop->base_mon_id, q_idx);
+		prop->base_mon_id, q_idx, cs->sequence);
 
 	wait_prop.data = (void *) job->patched_cb;
 	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
@@ -461,17 +485,14 @@ static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
 	wait_prop.mon_id = prop->base_mon_id;
 	wait_prop.q_idx = q_idx;
 	wait_prop.size = 0;
+
 	hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);
 
-	kref_get(&cs_cmpl->hw_sob->kref);
-	/*
-	 * Must put the signal fence after the SOB refcnt increment so
-	 * the SOB refcnt won't turn 0 and reset the SOB before the
-	 * wait CS was submitted.
-	 */
 	mb();
 	hl_fence_put(cs->signal_fence);
 	cs->signal_fence = NULL;
+
+	return 0;
 }
 
 /*
@@ -496,7 +517,7 @@ static int init_signal_wait_cs(struct hl_cs *cs)
 	if (cs->type & CS_TYPE_SIGNAL)
 		rc = init_signal_cs(hdev, job, cs_cmpl);
 	else if (cs->type & CS_TYPE_WAIT)
-		init_wait_cs(hdev, cs, job, cs_cmpl);
+		rc = init_wait_cs(hdev, cs, job, cs_cmpl);
 
 	return rc;
 }
@@ -571,12 +592,13 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
 	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) {
 		rc = init_signal_wait_cs(cs);
-		if (rc) {
-			dev_err(hdev->dev, "Failed to submit signal cs\n");
+		if (rc)
 			goto unroll_cq_resv;
-		}
-	} else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
-		hdev->asic_funcs->collective_wait_init_cs(cs);
+	} else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) {
+		rc = hdev->asic_funcs->collective_wait_init_cs(cs);
+		if (rc)
+			goto unroll_cq_resv;
+	}
 
 
 	spin_lock(&hdev->cs_mirror_lock);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index a4b33b0b17d4a6..5b7a5692cd21e8 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1239,7 +1239,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
 			prop->collective_sob_id, cb_size, false);
 }
 
-static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
+static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 {
 	struct hl_cs_compl *signal_cs_cmpl =
 		container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
@@ -1261,6 +1261,29 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
 	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
 	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
 
+	/* check again if the signal cs already completed.
+	 * if yes then don't send any wait cs since the hw_sob
+	 * could be in reset already. if signal is not completed
+	 * then get refcount to hw_sob to prevent resetting the sob
+	 * while wait cs is not submitted.
+	 * note that this check is protected by two locks,
+	 * hw queue lock and completion object lock,
+	 * and the same completion object lock also protects
+	 * the hw_sob reset handler function.
+	 * The hw_queue lock prevent out of sync of hw_sob
+	 * refcount value, changed by signal/wait flows.
+	 */
+	spin_lock(&signal_cs_cmpl->lock);
+
+	if (completion_done(&cs->signal_fence->completion)) {
+		spin_unlock(&signal_cs_cmpl->lock);
+		return -EINVAL;
+	}
+	/* Increment kref since all slave queues are now waiting on it */
+	kref_get(&cs_cmpl->hw_sob->kref);
+
+	spin_unlock(&signal_cs_cmpl->lock);
+
 	/* Calculate the stream from collective master queue (1st job) */
 	job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node);
 	stream = job->hw_queue_id % 4;
@@ -1304,16 +1327,11 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
 				cprop->curr_sob_group_idx[stream], stream);
 	}
 
-	/* Increment kref since all slave queues are now waiting on it */
-	kref_get(&cs_cmpl->hw_sob->kref);
-	/*
-	 * Must put the signal fence after the SOB refcnt increment so
-	 * the SOB refcnt won't turn 0 and reset the SOB before the
-	 * wait CS was submitted.
-	 */
 	mb();
 	hl_fence_put(cs->signal_fence);
 	cs->signal_fence = NULL;
+
+	return 0;
 }
 
 static int gaudi_collective_wait_create_job(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index f6251d8663b2b5..dd218a4bb62e14 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5480,9 +5480,9 @@ u64 goya_get_device_time(struct hl_device *hdev)
 	return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL);
 }
 
-static void goya_collective_wait_init_cs(struct hl_cs *cs)
+static int goya_collective_wait_init_cs(struct hl_cs *cs)
 {
-
+	return 0;
 }
 
 static int goya_collective_wait_create_jobs(struct hl_device *hdev,

From b49e435455c5a603ac5fdd603da8047da7beeadd Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Mon, 24 May 2021 18:09:22 +0300
Subject: [PATCH 30/55] habanalabs: add support for encapsulated signals
 reservation

The capability of signaling from within an encapsulated OP is merged
into the existing stream architecture, such that one can trigger
multiple signals from an encapsulated OP, according to the time the
event was done in the graph execution, and avoid the need to wait for
the whole encapsulated OP execution to complete before the stream can
signal.

This commit implements only the reserve/unreserve part.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 255 +++++++++++++++++-
 drivers/misc/habanalabs/common/context.c      |  56 ++++
 drivers/misc/habanalabs/common/habanalabs.h   |  55 +++-
 .../misc/habanalabs/common/habanalabs_drv.c   |   1 -
 drivers/misc/habanalabs/common/hw_queue.c     |   5 +-
 drivers/misc/habanalabs/gaudi/gaudi.c         |   8 +-
 drivers/misc/habanalabs/goya/goya.c           |   6 +
 include/uapi/misc/habanalabs.h                | 110 +++++++-
 8 files changed, 468 insertions(+), 28 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 458cdf2ddab5f2..84032b1bae5c87 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -38,7 +38,11 @@ static void hl_sob_reset(struct kref *ref)
 							kref);
 	struct hl_device *hdev = hw_sob->hdev;
 
+	dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
+
 	hdev->asic_funcs->reset_sob(hdev, hw_sob);
+
+	hw_sob->need_reset = false;
 }
 
 void hl_sob_reset_error(struct kref *ref)
@@ -52,7 +56,7 @@ void hl_sob_reset_error(struct kref *ref)
 		hw_sob->q_idx, hw_sob->sob_id);
 }
 
-static void hw_sob_put(struct hl_hw_sob *hw_sob)
+void hw_sob_put(struct hl_hw_sob *hw_sob)
 {
 	if (hw_sob)
 		kref_put(&hw_sob->kref, hl_sob_reset);
@@ -64,7 +68,7 @@ static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
 		kref_put(&hw_sob->kref, hl_sob_reset_error);
 }
 
-static void hw_sob_get(struct hl_hw_sob *hw_sob)
+void hw_sob_get(struct hl_hw_sob *hw_sob)
 {
 	if (hw_sob)
 		kref_get(&hw_sob->kref);
@@ -576,7 +580,8 @@ static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
 
 	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
 			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
-			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
+			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
+			(!!hl_cs_cmpl->encaps_signals)) {
 		dev_dbg(hdev->dev,
 				"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
 				hl_cs_cmpl->cs_seq,
@@ -829,6 +834,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	cs_cmpl->hdev = hdev;
 	cs_cmpl->type = cs->type;
+	cs_cmpl->encaps_signals = false;
 	spin_lock_init(&cs_cmpl->lock);
 	INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work);
 	cs->fence = &cs_cmpl->base_fence;
@@ -1115,6 +1121,10 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
 		return CS_TYPE_WAIT;
 	else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
 		return CS_TYPE_COLLECTIVE_WAIT;
+	else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
+		return CS_RESERVE_SIGNALS;
+	else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
+		return CS_UNRESERVE_SIGNALS;
 	else
 		return CS_TYPE_DEFAULT;
 }
@@ -1652,10 +1662,17 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
  * hl_cs_signal_sob_wraparound_handler: handle SOB value wrapaound case.
  * if the SOB value reaches the max value move to the other SOB reserved
  * to the queue.
+ * @hdev: pointer to device structure
+ * @q_idx: stream queue index
+ * @hw_sob: the H/W SOB used in this signal CS.
+ * @count: signals count
+ * @encaps_sig: tells whether it's reservation for encaps signals or not.
+ *
  * Note that this function must be called while hw_queues_lock is taken.
  */
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
-			struct hl_hw_sob **hw_sob, u32 count)
+			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
+
 {
 	struct hl_sync_stream_properties *prop;
 	struct hl_hw_sob *sob = *hw_sob, *other_sob;
@@ -1688,12 +1705,34 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 			return -EINVAL;
 		}
 
-		prop->next_sob_val = 1;
+		prop->next_sob_val = count;
 
 		/* only two SOBs are currently in use */
 		prop->curr_sob_offset = other_sob_offset;
 		*hw_sob = other_sob;
 
+		/*
+		 * check if other_sob needs reset, then do it before using it
+		 * for the reservation or the next signal cs.
+		 * we do it here, and for both encaps and regular signal cs
+		 * cases in order to avoid possible races of two kref_put
+		 * of the sob which can occur at the same time if we move the
+		 * sob reset(kref_put) to cs_do_release function.
+		 * in addition, if we have combination of cs signal and
+		 * encaps, and at the point we need to reset the sob there was
+		 * no more reservations and only signal cs keep coming,
+		 * in such case we need to signal_cs to put the refcount and
+		 * reset the sob.
+		 */
+		if (other_sob->need_reset)
+			kref_put(&other_sob->kref, hl_sob_reset);
+
+		if (encaps_sig) {
+			/* set reset indication for the sob */
+			sob->need_reset = true;
+			hw_sob_get(other_sob);
+		}
+
 		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
 				prop->curr_sob_offset, q_idx);
 	} else {
@@ -1817,6 +1856,187 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
 	return 0;
 }
 
+static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
+				u32 q_idx, u32 count,
+				u32 *handle_id, u32 *sob_addr,
+				u32 *signals_count)
+{
+	struct hw_queue_properties *hw_queue_prop;
+	struct hl_sync_stream_properties *prop;
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_encaps_sig_handle *handle;
+	struct hl_encaps_signals_mgr *mgr;
+	struct hl_hw_sob *hw_sob;
+	int hdl_id;
+	int rc = 0;
+
+	if (count >= HL_MAX_SOB_VAL) {
+		dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
+						count);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (q_idx >= hdev->asic_prop.max_queues) {
+		dev_err(hdev->dev, "Queue index %d is invalid\n",
+			q_idx);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+
+	if (!hw_queue_prop->supports_sync_stream) {
+		dev_err(hdev->dev,
+			"Queue index %d does not support sync stream operations\n",
+									q_idx);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+	if (!handle) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	handle->count = count;
+	mgr = &hpriv->ctx->sig_mgr;
+	/* mgr->lock is a spinlock, so the IDR allocation must not sleep */
+	spin_lock(&mgr->lock);
+	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
+	spin_unlock(&mgr->lock);
+
+	if (hdl_id < 0) {
+		dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
+		rc = -EINVAL;
+		goto free_handle;
+	}
+
+	handle->id = hdl_id;
+	handle->q_idx = q_idx;
+	handle->hdev = hdev;
+	kref_init(&handle->refcount);
+
+	hdev->asic_funcs->hw_queues_lock(hdev);
+
+	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+
+	/*
+	 * Increment the SOB value by count by user request
+	 * to reserve those signals
+	 * check if the signals amount to reserve is not exceeding the max sob
+	 * value, if yes then switch sob.
+	 */
+	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
+						true);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to switch SOB\n");
+		hdev->asic_funcs->hw_queues_unlock(hdev);
+		rc = -EINVAL;
+		goto remove_idr;
+	}
+
+	/* set the hw_sob to the handle after calling the sob wraparound handler
+	 * since sob could have changed.
+	 */
+	handle->hw_sob = hw_sob;
+
+	/* store the current sob value for unreserve validity check, and
+	 * signal offset support
+	 */
+	handle->pre_sob_val = prop->next_sob_val - handle->count;
+
+	*signals_count = prop->next_sob_val;
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	*sob_addr = handle->hw_sob->sob_addr;
+	*handle_id = hdl_id;
+
+	dev_dbg(hdev->dev,
+		"Signals reserved, sob_id: %d, sob addr: 0x%x, sob val: 0x%x, q_idx: %d, hdl_id: %d\n",
+			hw_sob->sob_id, handle->hw_sob->sob_addr,
+			prop->next_sob_val, q_idx, hdl_id);
+	goto out;
+
+remove_idr:
+	spin_lock(&mgr->lock);
+	idr_remove(&mgr->handles, hdl_id);
+	spin_unlock(&mgr->lock);
+free_handle:
+	kfree(handle);
+out:
+	return rc;
+}
+
+static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
+{
+	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
+	struct hl_sync_stream_properties *prop;
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_encaps_signals_mgr *mgr;
+	struct hl_hw_sob *hw_sob;
+	u32 q_idx, sob_addr;
+	int rc = 0;
+
+	mgr = &hpriv->ctx->sig_mgr;
+
+	spin_lock(&mgr->lock);
+	encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
+	if (encaps_sig_hdl) {
+		dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
+				handle_id, encaps_sig_hdl->hw_sob->sob_addr,
+					encaps_sig_hdl->count);
+
+		hdev->asic_funcs->hw_queues_lock(hdev);
+
+		q_idx = encaps_sig_hdl->q_idx;
+		prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+		hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+		sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
+
+		/* Check if sob_val got out of sync due to other
+		 * signal submission requests which were handled
+		 * between the reserve-unreserve calls or SOB switch
+		 * upon reaching SOB max value.
+		 */
+		if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
+				!= prop->next_sob_val ||
+				sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
+			dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
+				encaps_sig_hdl->pre_sob_val,
+				(prop->next_sob_val - encaps_sig_hdl->count));
+
+			hdev->asic_funcs->hw_queues_unlock(hdev);
+			rc = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * Decrement the SOB value by count by user request
+		 * to unreserve those signals
+		 */
+		prop->next_sob_val -= encaps_sig_hdl->count;
+
+		hdev->asic_funcs->hw_queues_unlock(hdev);
+
+		hw_sob_put(hw_sob);
+
+		/* Release the id and free allocated memory of the handle */
+		idr_remove(&mgr->handles, handle_id);
+		kfree(encaps_sig_hdl);
+	} else {
+		rc = -EINVAL;
+		dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
+	}
+out:
+	spin_unlock(&mgr->lock);
+
+	return rc;
+}
+
 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				void __user *chunks, u32 num_chunks,
 				u64 *cs_seq, u32 flags, u32 timeout)
@@ -1996,10 +2216,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	union hl_cs_args *args = data;
-	enum hl_cs_type cs_type;
+	enum hl_cs_type cs_type = 0;
 	u64 cs_seq = ULONG_MAX;
 	void __user *chunks;
-	u32 num_chunks, flags, timeout;
+	u32 num_chunks, flags, timeout,
+		signals_count = 0, sob_addr = 0, handle_id = 0;
 	int rc;
 
 	rc = hl_cs_sanity_checks(hpriv, args);
@@ -2036,17 +2257,33 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
 					&cs_seq, args->in.cs_flags, timeout);
 		break;
+	case CS_RESERVE_SIGNALS:
+		rc = cs_ioctl_reserve_signals(hpriv,
+					args->in.encaps_signals_q_idx,
+					args->in.encaps_signals_count,
+					&handle_id, &sob_addr, &signals_count);
+		break;
+	case CS_UNRESERVE_SIGNALS:
+		rc = cs_ioctl_unreserve_signals(hpriv,
+					args->in.encaps_sig_handle_id);
+		break;
 	default:
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
 						args->in.cs_flags, timeout);
 		break;
 	}
-
 out:
 	if (rc != -EAGAIN) {
 		memset(args, 0, sizeof(*args));
+
+		if (cs_type == CS_RESERVE_SIGNALS) {
+			args->out.handle_id = handle_id;
+			args->out.sob_base_addr_offset = sob_addr;
+			args->out.count = signals_count;
+		} else {
+			args->out.seq = cs_seq;
+		}
 		args->out.status = rc;
-		args->out.seq = cs_seq;
 	}
 
 	return rc;
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 4d922e4d03935e..abbba4194d3bff 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -9,6 +9,59 @@
 
 #include <linux/slab.h>
 
+void hl_encaps_handle_do_release(struct kref *ref)
+{
+	struct hl_cs_encaps_sig_handle *handle =
+		container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
+	struct hl_ctx *ctx = handle->hdev->compute_ctx;
+	struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+
+	idr_remove(&mgr->handles, handle->id);
+	kfree(handle);
+}
+
+static void hl_encaps_handle_do_release_sob(struct kref *ref)
+{
+	struct hl_cs_encaps_sig_handle *handle =
+		container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
+	struct hl_ctx *ctx = handle->hdev->compute_ctx;
+	struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+
+	/* if we're here, then there was a signals reservation but cs with
+	 * encaps signals wasn't submitted, so need to put refcount
+	 * to hw_sob taken at the reservation.
+	 */
+	hw_sob_put(handle->hw_sob);
+
+	idr_remove(&mgr->handles, handle->id);
+	kfree(handle);
+}
+
+static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
+{
+	spin_lock_init(&mgr->lock);
+	idr_init(&mgr->handles);
+}
+
+static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
+			struct hl_encaps_signals_mgr *mgr)
+{
+	struct hl_cs_encaps_sig_handle *handle;
+	struct idr *idp;
+	u32 id;
+
+	idp = &mgr->handles;
+
+	if (!idr_is_empty(idp)) {
+		dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
+		idr_for_each_entry(idp, handle, id)
+			kref_put(&handle->refcount,
+					hl_encaps_handle_do_release_sob);
+	}
+
+	idr_destroy(&mgr->handles);
+}
+
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
@@ -53,6 +106,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		hl_cb_va_pool_fini(ctx);
 		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
+		hl_encaps_sig_mgr_fini(hdev, &ctx->sig_mgr);
 
 		/* Scrub both SRAM and DRAM */
 		hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
@@ -200,6 +254,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 			goto err_cb_va_pool_fini;
 		}
 
+		hl_encaps_sig_mgr_init(&ctx->sig_mgr);
+
 		dev_dbg(hdev->dev, "create user context %d\n", ctx->asid);
 	}
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index bf327cb7ddd655..81b6825e0c1c92 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -242,7 +242,9 @@ enum hl_cs_type {
 	CS_TYPE_DEFAULT,
 	CS_TYPE_SIGNAL,
 	CS_TYPE_WAIT,
-	CS_TYPE_COLLECTIVE_WAIT
+	CS_TYPE_COLLECTIVE_WAIT,
+	CS_RESERVE_SIGNALS,
+	CS_UNRESERVE_SIGNALS
 };
 
 /*
@@ -287,13 +289,17 @@ enum queue_cb_alloc_flags {
  * @hdev: habanalabs device structure.
  * @kref: refcount of this SOB. The SOB will reset once the refcount is zero.
  * @sob_id: id of this SOB.
+ * @sob_addr: the sob offset from the base address.
  * @q_idx: the H/W queue that uses this SOB.
+ * @need_reset: reset indication set when switching to the other sob.
  */
 struct hl_hw_sob {
 	struct hl_device	*hdev;
 	struct kref		kref;
 	u32			sob_id;
+	u32			sob_addr;
 	u32			q_idx;
+	bool			need_reset;
 };
 
 enum hl_collective_mode {
@@ -608,6 +614,8 @@ struct hl_fence {
  * @type: type of the CS - signal/wait.
  * @sob_val: the SOB value that is used in this signal/wait CS.
  * @sob_group: the SOB group that is used in this collective wait CS.
+ * @encaps_signals: indication whether it's a completion object of cs with
+ * encaps signals or not.
  */
 struct hl_cs_compl {
 	struct work_struct	sob_reset_work;
@@ -619,6 +627,7 @@ struct hl_cs_compl {
 	enum hl_cs_type		type;
 	u16			sob_val;
 	u16			sob_group;
+	bool			encaps_signals;
 };
 
 /*
@@ -730,6 +739,17 @@ struct hl_sync_stream_properties {
 	u8		curr_sob_offset;
 };
 
+/**
+ * struct hl_encaps_signals_mgr - describes sync stream encapsulated signals
+ * handlers manager
+ * @lock: protects handles.
+ * @handles: an idr to hold all encapsulated signals handles.
+ */
+struct hl_encaps_signals_mgr {
+	spinlock_t		lock;
+	struct idr		handles;
+};
+
 /**
  * struct hl_hw_queue - describes a H/W transport queue.
  * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
@@ -1135,6 +1155,7 @@ struct fw_load_mgr {
  * @init_firmware_loader: initialize data for FW loader.
  * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
  * @state_dump_init: initialize constants required for state dump
+ * @get_sob_addr: get SOB base address offset.
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1261,6 +1282,7 @@ struct hl_asic_funcs {
 	void (*init_firmware_loader)(struct hl_device *hdev);
 	void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
 	void (*state_dump_init)(struct hl_device *hdev);
+	u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
 };
 
 
@@ -1353,6 +1375,7 @@ struct hl_pending_cb {
  * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
  *              device's MMU.
+ * @sig_mgr: encaps signals handle manager.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *			to user so user could inquire about CS. It is used as
  *			index to cs_pending array.
@@ -1392,6 +1415,7 @@ struct hl_ctx {
 	struct list_head		hw_block_mem_list;
 	struct hl_cs_counters_atomic	cs_counters;
 	struct gen_pool			*cb_va_pool;
+	struct hl_encaps_signals_mgr	sig_mgr;
 	u64				cs_sequence;
 	u64				*dram_default_hops;
 	spinlock_t			pending_cb_lock;
@@ -2504,7 +2528,6 @@ struct hl_device {
 
 	struct multi_cs_completion	multi_cs_completion[
 							MULTI_CS_MAX_USER_CTX];
-
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
@@ -2576,6 +2599,29 @@ struct hl_device {
 };
 
 
+/**
+ * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
+ * @refcount: refcount used to protect removing this id when several
+ *            wait cs are used to wait of the reserved encaps signals.
+ * @hdev: pointer to habanalabs device structure.
+ * @hw_sob: pointer to  H/W SOB used in the reservation.
+ * @cs_seq: staged cs sequence which contains encapsulated signals
+ * @id: idr handler id to be used to fetch the handler info
+ * @q_idx: stream queue index
+ * @pre_sob_val: current SOB value before reservation
+ * @count: signals number
+ */
+struct hl_cs_encaps_sig_handle {
+	struct kref refcount;
+	struct hl_device *hdev;
+	struct hl_hw_sob *hw_sob;
+	u64  cs_seq;
+	u32  id;
+	u32  q_idx;
+	u32  pre_sob_val;
+	u32  count;
+};
+
 /*
  * IOCTLs
  */
@@ -2889,9 +2935,12 @@ int hl_set_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
+void hl_encaps_handle_do_release(struct kref *ref);
+void hw_sob_get(struct hl_hw_sob *hw_sob);
+void hw_sob_put(struct hl_hw_sob *hw_sob);
 void hl_release_pending_user_interrupts(struct hl_device *hdev);
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
-			struct hl_hw_sob **hw_sob, u32 count);
+			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig);
 
 int hl_state_dump(struct hl_device *hdev);
 const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index bffca119946b33..3df4313d72cd28 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -194,7 +194,6 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 out_err:
 	mutex_unlock(&hdev->fpriv_list_lock);
-
 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	filp->private_data = NULL;
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 2494bd6e9358b7..9a59b8e9bf530e 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -426,7 +426,8 @@ static int init_signal_cs(struct hl_device *hdev,
 	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
 				cs_cmpl->hw_sob->sob_id, 0, true);
 
-	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1);
+	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
+								false);
 
 	return rc;
 }
@@ -850,6 +851,8 @@ static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
 		hw_sob = &sync_stream_prop->hw_sob[sob];
 		hw_sob->hdev = hdev;
 		hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
+		hw_sob->sob_addr =
+			hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
 		hw_sob->q_idx = q_idx;
 		kref_init(&hw_sob->kref);
 	}
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 5b7a5692cd21e8..ae1a8b4e694c17 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8720,6 +8720,11 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev)
 			sizeof(struct packet_msg_prot) * 2;
 }
 
+static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id)
+{
+	return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + (sob_id * 4);
+}
+
 static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
 				u32 size, bool eb)
 {
@@ -9424,7 +9429,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
 	.init_firmware_loader = gaudi_init_firmware_loader,
 	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
-	.state_dump_init = gaudi_state_dump_init
+	.state_dump_init = gaudi_state_dump_init,
+	.get_sob_addr = gaudi_get_sob_addr
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index dd218a4bb62e14..8a689bf42397e6 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5575,6 +5575,11 @@ static void goya_state_dump_init(struct hl_device *hdev)
 	hdev->state_dump_specs.funcs = goya_state_dump_funcs;
 }
 
+static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id)
+{
+	return 0;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5662,6 +5667,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.init_firmware_loader = goya_init_firmware_loader,
 	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram,
 	.state_dump_init = goya_state_dump_init,
+	.get_sob_addr = &goya_get_sob_addr
 };
 
 /*
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 49c737c4a2f647..eca86c54591658 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -628,12 +628,21 @@ struct hl_cs_chunk {
 		__u64 cb_handle;
 
 		/* Relevant only when HL_CS_FLAGS_WAIT or
-		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set.
+		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set
 		 * This holds address of array of u64 values that contain
-		 * signal CS sequence numbers. The wait described by this job
-		 * will listen on all those signals (wait event per signal)
+		 * signal CS sequence numbers. The wait described by
+		 * this job will listen on all those signals
+		 * (wait event per signal)
 		 */
 		__u64 signal_seq_arr;
+
+		/*
+		 * Relevant only when HL_CS_FLAGS_WAIT or
+		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set
+		 * along with HL_CS_FLAGS_ENCAP_SIGNALS.
+		 * This is the CS sequence which has the encapsulated signals.
+		 */
+		__u64 encaps_signal_seq;
 	};
 
 	/* Index of queue to put the CB on */
@@ -651,6 +660,17 @@ struct hl_cs_chunk {
 		 * Number of entries in signal_seq_arr
 		 */
 		__u32 num_signal_seq_arr;
+
+		/* Relevant only when HL_CS_FLAGS_WAIT or
+		 * HL_CS_FLAGS_COLLECTIVE_WAIT is set along
+		 * with HL_CS_FLAGS_ENCAP_SIGNALS
+		 * This sets the signals range that the user wants to wait for
+		 * out of the whole reserved signals range.
+		 * e.g. if the signals range is 20, and the user doesn't want
+		 * to wait for signal 8, they set this offset to 7, and then
+		 * call the API again with 9 and so on till 20.
+		 */
+		__u32 encaps_signal_offset;
 	};
 
 	/* HL_CS_CHUNK_FLAGS_* */
@@ -678,6 +698,28 @@ struct hl_cs_chunk {
 #define HL_CS_FLAGS_CUSTOM_TIMEOUT		0x200
 #define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT	0x400
 
+/*
+ * The encapsulated signals CS is merged into the existing CS ioctls.
+ * In order to use this feature need to follow the below procedure:
+ * 1. Reserve signals, set the CS type to HL_CS_FLAGS_RESERVE_SIGNALS_ONLY
+ *    the output of this API will be the SOB offset from CFG_BASE.
+ *    this address will be used to patch CB cmds to do the signaling for this
+ *    SOB by incrementing its value.
+ *    for reverting the reservation use HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY
+ *    CS type, note that this might fail if out-of-sync happened to the SOB
+ *    value, in case other signaling request to the same SOB occurred between
+ *    reserve-unreserve calls.
+ * 2. Use the staged CS to do the encapsulated signaling jobs.
+ *    use HL_CS_FLAGS_STAGED_SUBMISSION and HL_CS_FLAGS_STAGED_SUBMISSION_FIRST
+ *    along with HL_CS_FLAGS_ENCAP_SIGNALS flag, and set encaps_signal_offset
+ *    field. This offset allows app to wait on part of the reserved signals.
+ * 3. Use WAIT/COLLECTIVE WAIT CS along with HL_CS_FLAGS_ENCAP_SIGNALS flag
+ *    to wait for the encapsulated signals.
+ */
+#define HL_CS_FLAGS_ENCAP_SIGNALS		0x800
+#define HL_CS_FLAGS_RESERVE_SIGNALS_ONLY	0x1000
+#define HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY	0x2000
+
 #define HL_CS_STATUS_SUCCESS		0
 
 #define HL_MAX_JOBS_PER_CS		512
@@ -690,10 +732,35 @@ struct hl_cs_in {
 	/* holds address of array of hl_cs_chunk for execution phase */
 	__u64 chunks_execute;
 
-	/* Sequence number of a staged submission CS
-	 * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set
-	 */
-	__u64 seq;
+	union {
+		/*
+		 * Sequence number of a staged submission CS
+		 * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set and
+		 * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST is unset.
+		 */
+		__u64 seq;
+
+		/*
+		 * Encapsulated signals handle id
+		 * Valid for two flows:
+		 * 1. CS with encapsulated signals:
+		 *    when HL_CS_FLAGS_STAGED_SUBMISSION and
+		 *    HL_CS_FLAGS_STAGED_SUBMISSION_FIRST
+		 *    and HL_CS_FLAGS_ENCAP_SIGNALS are set.
+		 * 2. unreserve signals:
+		 *    valid when HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY is set.
+		 */
+		__u32 encaps_sig_handle_id;
+
+		/* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */
+		struct {
+			/* Encapsulated signals number */
+			__u32 encaps_signals_count;
+
+			/* Encapsulated signals queue index (stream) */
+			__u32 encaps_signals_q_idx;
+		};
+	};
 
 	/* Number of chunks in restore phase array. Maximum number is
 	 * HL_MAX_JOBS_PER_CS
@@ -718,14 +785,31 @@ struct hl_cs_in {
 };
 
 struct hl_cs_out {
+	union {
+		/*
+		 * seq holds the sequence number of the CS to pass to wait
+		 * ioctl. All values are valid except for 0 and ULLONG_MAX
+		 */
+		__u64 seq;
+
+		/* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */
+		struct {
+			/* This is the reserved signal handle id */
+			__u32 handle_id;
+
+			/* This is the signals count */
+			__u32 count;
+		};
+	};
+
+	/* HL_CS_STATUS */
+	__u32 status;
+
 	/*
-	 * seq holds the sequence number of the CS to pass to wait ioctl. All
-	 * values are valid except for 0 and ULLONG_MAX
+	 * SOB base address offset
+	 * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set
 	 */
-	__u64 seq;
-	/* HL_CS_STATUS_* */
-	__u32 status;
-	__u32 pad;
+	__u32 sob_base_addr_offset;
 };
 
 union hl_cs_args {

From 18de5b1239b88506fd4770576649ce21d6b116de Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Wed, 26 May 2021 10:43:36 +0300
Subject: [PATCH 31/55] habanalabs: add support for encapsulated signals
 submission

This commit is the second part of the encapsulated signals feature.
It contains the driver support for submission of cs with encapsulated
signals and the wait for them.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 343 +++++++-----------
 drivers/misc/habanalabs/common/context.c      |  14 +-
 drivers/misc/habanalabs/common/habanalabs.h   |  49 +--
 drivers/misc/habanalabs/common/hw_queue.c     | 104 +++++-
 drivers/misc/habanalabs/gaudi/gaudi.c         | 145 +++-----
 drivers/misc/habanalabs/goya/goya.c           |   2 +-
 6 files changed, 308 insertions(+), 349 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 84032b1bae5c87..d71bd48cbc4448 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -106,38 +106,6 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
 	return 0;
 }
 
-static void sob_reset_work(struct work_struct *work)
-{
-	struct hl_cs_compl *hl_cs_cmpl =
-		container_of(work, struct hl_cs_compl, sob_reset_work);
-	struct hl_device *hdev = hl_cs_cmpl->hdev;
-
-	/*
-	 * A signal CS can get completion while the corresponding wait
-	 * for signal CS is on its way to the PQ. The wait for signal CS
-	 * will get stuck if the signal CS incremented the SOB to its
-	 * max value and there are no pending (submitted) waits on this
-	 * SOB.
-	 * We do the following to void this situation:
-	 * 1. The wait for signal CS must get a ref for the signal CS as
-	 *    soon as possible in cs_ioctl_signal_wait() and put it
-	 *    before being submitted to the PQ but after it incremented
-	 *    the SOB refcnt in init_signal_wait_cs().
-	 * 2. Signal/Wait for signal CS will decrement the SOB refcnt
-	 *    here.
-	 * These two measures guarantee that the wait for signal CS will
-	 * reset the SOB upon completion rather than the signal CS and
-	 * hence the above scenario is avoided.
-	 */
-	kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
-
-	if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
-		hdev->asic_funcs->reset_sob_group(hdev,
-				hl_cs_cmpl->sob_group);
-
-	kfree(hl_cs_cmpl);
-}
-
 static void hl_fence_release(struct kref *kref)
 {
 	struct hl_fence *fence =
@@ -578,12 +546,17 @@ static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
 
 	spin_lock(&hl_cs_cmpl->lock);
 
+	/*
+	 * we get refcount upon reservation of signals or signal/wait cs for the
+	 * hw_sob object, and need to put it when the first staged cs
+	 * (which contains the encaps signals) or cs signal/wait is completed.
+	 */
 	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
 			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
 			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
 			(!!hl_cs_cmpl->encaps_signals)) {
 		dev_dbg(hdev->dev,
-				"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
+				"CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
 				hl_cs_cmpl->cs_seq,
 				hl_cs_cmpl->type,
 				hl_cs_cmpl->hw_sob->sob_id,
@@ -664,8 +637,20 @@ static void cs_do_release(struct kref *ref)
 			list_del(&cs->staged_cs_node);
 			spin_unlock(&hdev->cs_mirror_lock);
 		}
+
+		/* decrement refcount to handle when first staged cs
+		 * with encaps signals is completed.
+		 */
+		if (hl_cs_cmpl->encaps_signals)
+			kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
+						hl_encaps_handle_do_release);
 	}
 
+	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
+			&& cs->encaps_signals)
+		kref_put(&cs->encaps_sig_hdl->refcount,
+					hl_encaps_handle_do_release);
+
 out:
 	/* Must be called before hl_ctx_put because inside we use ctx to get
 	 * the device
@@ -798,6 +783,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->completed = false;
 	cs->type = cs_type;
 	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
+	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
 	cs->timeout_jiffies = timeout;
 	cs->skip_reset_on_timeout =
 		hdev->skip_reset_on_timeout ||
@@ -808,9 +794,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	kref_init(&cs->refcount);
 	spin_lock_init(&cs->job_lock);
 
-	cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
+	cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
 	if (!cs_cmpl)
-		cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_KERNEL);
+		cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
 
 	if (!cs_cmpl) {
 		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
@@ -834,9 +820,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	cs_cmpl->hdev = hdev;
 	cs_cmpl->type = cs->type;
-	cs_cmpl->encaps_signals = false;
 	spin_lock_init(&cs_cmpl->lock);
-	INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work);
 	cs->fence = &cs_cmpl->base_fence;
 
 	spin_lock(&ctx->cs_lock);
@@ -933,18 +917,6 @@ void hl_cs_rollback_all(struct hl_device *hdev)
 	force_complete_multi_cs(hdev);
 }
 
-void hl_pending_cb_list_flush(struct hl_ctx *ctx)
-{
-	struct hl_pending_cb *pending_cb, *tmp;
-
-	list_for_each_entry_safe(pending_cb, tmp,
-			&ctx->pending_cb_list, cb_node) {
-		list_del(&pending_cb->cb_node);
-		hl_cb_put(pending_cb->cb);
-		kfree(pending_cb);
-	}
-}
-
 static void
 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
 {
@@ -1225,7 +1197,8 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
 }
 
 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
-				u64 sequence, u32 flags)
+				u64 sequence, u32 flags,
+				u32 encaps_signal_handle)
 {
 	if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
 		return 0;
@@ -1237,6 +1210,9 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
 		/* Staged CS sequence is the first CS sequence */
 		INIT_LIST_HEAD(&cs->staged_cs_node);
 		cs->staged_sequence = cs->sequence;
+
+		if (cs->encaps_signals)
+			cs->encaps_sig_hdl_id = encaps_signal_handle;
 	} else {
 		/* User sequence will be validated in 'hl_hw_queue_schedule_cs'
 		 * under the cs_mirror_lock
@@ -1254,7 +1230,7 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
 
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 				u32 num_chunks, u64 *cs_seq, u32 flags,
-				u32 timeout)
+				u32 encaps_signals_handle, u32 timeout)
 {
 	bool staged_mid, int_queues_only = true;
 	struct hl_device *hdev = hpriv->hdev;
@@ -1293,7 +1269,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 
 	hl_debugfs_add_cs(cs);
 
-	rc = cs_staged_submission(hdev, cs, user_sequence, flags);
+	rc = cs_staged_submission(hdev, cs, user_sequence, flags,
+						encaps_signals_handle);
 	if (rc)
 		goto free_cs_object;
 
@@ -1431,130 +1408,6 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	return rc;
 }
 
-static int pending_cb_create_job(struct hl_device *hdev, struct hl_ctx *ctx,
-		struct hl_cs *cs, struct hl_cb *cb, u32 size, u32 hw_queue_id)
-{
-	struct hw_queue_properties *hw_queue_prop;
-	struct hl_cs_counters_atomic *cntr;
-	struct hl_cs_job *job;
-
-	hw_queue_prop = &hdev->asic_prop.hw_queues_props[hw_queue_id];
-	cntr = &hdev->aggregated_cs_counters;
-
-	job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true);
-	if (!job) {
-		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
-		atomic64_inc(&cntr->out_of_mem_drop_cnt);
-		dev_err(hdev->dev, "Failed to allocate a new job\n");
-		return -ENOMEM;
-	}
-
-	job->id = 0;
-	job->cs = cs;
-	job->user_cb = cb;
-	atomic_inc(&job->user_cb->cs_cnt);
-	job->user_cb_size = size;
-	job->hw_queue_id = hw_queue_id;
-	job->patched_cb = job->user_cb;
-	job->job_cb_size = job->user_cb_size;
-
-	/* increment refcount as for external queues we get completion */
-	cs_get(cs);
-
-	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
-
-	list_add_tail(&job->cs_node, &cs->job_list);
-
-	hl_debugfs_add_job(hdev, job);
-
-	return 0;
-}
-
-static int hl_submit_pending_cb(struct hl_fpriv *hpriv)
-{
-	struct hl_device *hdev = hpriv->hdev;
-	struct hl_ctx *ctx = hpriv->ctx;
-	struct hl_pending_cb *pending_cb, *tmp;
-	struct list_head local_cb_list;
-	struct hl_cs *cs;
-	struct hl_cb *cb;
-	u32 hw_queue_id;
-	u32 cb_size;
-	int process_list, rc = 0;
-
-	if (list_empty(&ctx->pending_cb_list))
-		return 0;
-
-	process_list = atomic_cmpxchg(&ctx->thread_pending_cb_token, 1, 0);
-
-	/* Only a single thread is allowed to process the list */
-	if (!process_list)
-		return 0;
-
-	if (list_empty(&ctx->pending_cb_list))
-		goto free_pending_cb_token;
-
-	/* move all list elements to a local list */
-	INIT_LIST_HEAD(&local_cb_list);
-	spin_lock(&ctx->pending_cb_lock);
-	list_for_each_entry_safe(pending_cb, tmp, &ctx->pending_cb_list,
-								cb_node)
-		list_move_tail(&pending_cb->cb_node, &local_cb_list);
-	spin_unlock(&ctx->pending_cb_lock);
-
-	rc = allocate_cs(hdev, ctx, CS_TYPE_DEFAULT, ULLONG_MAX, &cs, 0,
-				hdev->timeout_jiffies);
-	if (rc)
-		goto add_list_elements;
-
-	hl_debugfs_add_cs(cs);
-
-	/* Iterate through pending cb list, create jobs and add to CS */
-	list_for_each_entry(pending_cb, &local_cb_list, cb_node) {
-		cb = pending_cb->cb;
-		cb_size = pending_cb->cb_size;
-		hw_queue_id = pending_cb->hw_queue_id;
-
-		rc = pending_cb_create_job(hdev, ctx, cs, cb, cb_size,
-								hw_queue_id);
-		if (rc)
-			goto free_cs_object;
-	}
-
-	rc = hl_hw_queue_schedule_cs(cs);
-	if (rc) {
-		if (rc != -EAGAIN)
-			dev_err(hdev->dev,
-				"Failed to submit CS %d.%llu (%d)\n",
-				ctx->asid, cs->sequence, rc);
-		goto free_cs_object;
-	}
-
-	/* pending cb was scheduled successfully */
-	list_for_each_entry_safe(pending_cb, tmp, &local_cb_list, cb_node) {
-		list_del(&pending_cb->cb_node);
-		kfree(pending_cb);
-	}
-
-	cs_put(cs);
-
-	goto free_pending_cb_token;
-
-free_cs_object:
-	cs_rollback(hdev, cs);
-	cs_put(cs);
-add_list_elements:
-	spin_lock(&ctx->pending_cb_lock);
-	list_for_each_entry_safe_reverse(pending_cb, tmp, &local_cb_list,
-								cb_node)
-		list_move(&pending_cb->cb_node, &ctx->pending_cb_list);
-	spin_unlock(&ctx->pending_cb_lock);
-free_pending_cb_token:
-	atomic_set(&ctx->thread_pending_cb_token, 1);
-
-	return rc;
-}
-
 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 				u64 *cs_seq)
 {
@@ -1604,7 +1457,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 			rc = 0;
 		} else {
 			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
-					cs_seq, 0, hdev->timeout_jiffies);
+					cs_seq, 0, 0, hdev->timeout_jiffies);
 		}
 
 		mutex_unlock(&hpriv->restore_phase_mutex);
@@ -1705,7 +1558,15 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 			return -EINVAL;
 		}
 
-		prop->next_sob_val = count;
+		/*
+		 * next_sob_val always points to the next available signal
+		 * in the sob, so in encaps signals it will be the next one
+		 * after reserving the required amount.
+		 */
+		if (encaps_sig)
+			prop->next_sob_val = count + 1;
+		else
+			prop->next_sob_val = count;
 
 		/* only two SOBs are currently in use */
 		prop->curr_sob_offset = other_sob_offset;
@@ -1721,11 +1582,11 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 		 * in addition, if we have combination of cs signal and
 		 * encaps, and at the point we need to reset the sob there was
 		 * no more reservations and only signal cs keep coming,
-		 * in such case we need to signal_cs to put the refcount and
+		 * in such case we need signal_cs to put the refcount and
 		 * reset the sob.
 		 */
 		if (other_sob->need_reset)
-			kref_put(&other_sob->kref, hl_sob_reset);
+			hw_sob_put(other_sob);
 
 		if (encaps_sig) {
 			/* set reset indication for the sob */
@@ -1743,12 +1604,18 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 }
 
 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
-		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx)
+		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
+		bool encaps_signals)
 {
 	u64 *signal_seq_arr = NULL;
 	u32 size_to_copy, signal_seq_arr_len;
 	int rc = 0;
 
+	if (encaps_signals) {
+		*signal_seq = chunk->encaps_signal_seq;
+		return 0;
+	}
+
 	signal_seq_arr_len = chunk->num_signal_seq_arr;
 
 	/* currently only one signal seq is supported */
@@ -1773,7 +1640,7 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
 		return -ENOMEM;
 	}
 
-	size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr);
+	size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
 	if (copy_from_user(signal_seq_arr,
 				u64_to_user_ptr(chunk->signal_seq_arr),
 				size_to_copy)) {
@@ -1795,8 +1662,8 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
 }
 
 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
-		struct hl_ctx *ctx, struct hl_cs *cs, enum hl_queue_type q_type,
-		u32 q_idx)
+		struct hl_ctx *ctx, struct hl_cs *cs,
+		enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
 {
 	struct hl_cs_counters_atomic *cntr;
 	struct hl_cs_job *job;
@@ -1834,6 +1701,9 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
 	job->user_cb_size = cb_size;
 	job->hw_queue_id = q_idx;
 
+	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
+			&& cs->encaps_signals)
+		job->encaps_sig_wait_offset = encaps_signal_offset;
 	/*
 	 * No need in parsing, user CB is the patched CB.
 	 * We call hl_cb_destroy() out of two reasons - we don't need the CB in
@@ -1906,7 +1776,7 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 	mgr = &hpriv->ctx->sig_mgr;
 
 	spin_lock(&mgr->lock);
-	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_KERNEL);
+	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
 	spin_unlock(&mgr->lock);
 
 	if (hdl_id < 0) {
@@ -1931,14 +1801,13 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 	 * value, if yes then switch sob.
 	 */
 	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
-						true);
+								true);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to switch SOB\n");
 		hdev->asic_funcs->hw_queues_unlock(hdev);
 		rc = -EINVAL;
 		goto remove_idr;
 	}
-
 	/* set the hw_sob to the handle after calling the sob wraparound handler
 	 * since sob could have changed.
 	 */
@@ -1956,9 +1825,9 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 	*handle_id = hdl_id;
 
 	dev_dbg(hdev->dev,
-		"Signals reserved, sob_id: %d, sob addr: 0x%x, sob val: 0x%x, q_idx: %d, hdl_id: %d\n",
+		"Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
 			hw_sob->sob_id, handle->hw_sob->sob_addr,
-			prop->next_sob_val, q_idx, hdl_id);
+			prop->next_sob_val - 1, q_idx, hdl_id);
 	goto out;
 
 remove_idr:
@@ -2041,7 +1910,12 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				void __user *chunks, u32 num_chunks,
 				u64 *cs_seq, u32 flags, u32 timeout)
 {
+	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
+	bool handle_found = false, is_wait_cs = false,
+			wait_cs_submitted = false,
+			cs_encaps_signals = false;
 	struct hl_cs_chunk *cs_chunk_array, *chunk;
+	bool staged_cs_with_encaps_signals = false;
 	struct hw_queue_properties *hw_queue_prop;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_cs_compl *sig_waitcs_cmpl;
@@ -2050,7 +1924,6 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	struct hl_fence *sig_fence = NULL;
 	struct hl_ctx *ctx = hpriv->ctx;
 	enum hl_queue_type q_type;
-	bool is_wait_cs = false;
 	struct hl_cs *cs;
 	u64 signal_seq;
 	int rc;
@@ -2102,13 +1975,58 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		collective_engine_id = chunk->collective_engine_id;
 	}
 
-	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
-		is_wait_cs = true;
+	is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
+			cs_type == CS_TYPE_COLLECTIVE_WAIT);
 
-		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx);
+	cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
+
+	if (is_wait_cs) {
+		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
+				ctx, cs_encaps_signals);
 		if (rc)
 			goto free_cs_chunk_array;
 
+		if (cs_encaps_signals) {
+			/* check if cs sequence has encapsulated
+			 * signals handle
+			 */
+			struct idr *idp;
+			u32 id;
+
+			spin_lock(&ctx->sig_mgr.lock);
+			idp = &ctx->sig_mgr.handles;
+			idr_for_each_entry(idp, encaps_sig_hdl, id) {
+				if (encaps_sig_hdl->cs_seq == signal_seq) {
+					handle_found = true;
+					/* get refcount to protect removing
+					 * this handle from idr, needed when
+					 * multiple wait cs are used with offset
+					 * to wait on reserved encaps signals.
+					 */
+					kref_get(&encaps_sig_hdl->refcount);
+					break;
+				}
+			}
+			spin_unlock(&ctx->sig_mgr.lock);
+
+			if (!handle_found) {
+				dev_err(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
+						signal_seq);
+				rc = -EINVAL;
+				goto free_cs_chunk_array;
+			}
+
+			/* validate also the signal offset value */
+			if (chunk->encaps_signal_offset >
+					encaps_sig_hdl->count) {
+				dev_err(hdev->dev, "offset(%u) value exceed max reserved signals count(%u)!\n",
+						chunk->encaps_signal_offset,
+						encaps_sig_hdl->count);
+				rc = -EINVAL;
+				goto free_cs_chunk_array;
+			}
+		}
+
 		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
 		if (IS_ERR(sig_fence)) {
 			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
@@ -2129,11 +2047,16 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		sig_waitcs_cmpl =
 			container_of(sig_fence, struct hl_cs_compl, base_fence);
 
-		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) {
+		staged_cs_with_encaps_signals = !!
+				(sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
+				(flags & HL_CS_FLAGS_ENCAP_SIGNALS));
+
+		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
+				!staged_cs_with_encaps_signals) {
 			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
 			atomic64_inc(&cntr->validation_drop_cnt);
 			dev_err(hdev->dev,
-				"CS seq 0x%llx is not of a signal CS\n",
+				"CS seq 0x%llx is not of a signal/encaps-signal CS\n",
 				signal_seq);
 			hl_fence_put(sig_fence);
 			rc = -EINVAL;
@@ -2159,9 +2082,18 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	/*
 	 * Save the signal CS fence for later initialization right before
 	 * hanging the wait CS on the queue.
+	 * for encaps signals case, we save the cs sequence and handle pointer
+	 * for later initialization.
 	 */
-	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT)
+	if (is_wait_cs) {
 		cs->signal_fence = sig_fence;
+		/* store the handle pointer, so we don't have to
+		 * look for it again, later on the flow
+		 * when we need to set SOB info in hw_queue.
+		 */
+		if (cs->encaps_signals)
+			cs->encaps_sig_hdl = encaps_sig_hdl;
+	}
 
 	hl_debugfs_add_cs(cs);
 
@@ -2169,10 +2101,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
 		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
-				q_idx);
+				q_idx, chunk->encaps_signal_offset);
 	else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
 		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
-				cs, q_idx, collective_engine_id);
+				cs, q_idx, collective_engine_id,
+				chunk->encaps_signal_offset);
 	else {
 		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
 		atomic64_inc(&cntr->validation_drop_cnt);
@@ -2198,6 +2131,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	}
 
 	rc = HL_CS_STATUS_SUCCESS;
+	if (is_wait_cs)
+		wait_cs_submitted = true;
 	goto put_cs;
 
 free_cs_object:
@@ -2208,6 +2143,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	/* We finished with the CS in this function, so put the ref */
 	cs_put(cs);
 free_cs_chunk_array:
+	if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
+							is_wait_cs)
+		kref_put(&encaps_sig_hdl->refcount,
+				hl_encaps_handle_do_release);
 	kfree(cs_chunk_array);
 out:
 	return rc;
@@ -2231,10 +2170,6 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	if (rc)
 		goto out;
 
-	rc = hl_submit_pending_cb(hpriv);
-	if (rc)
-		goto out;
-
 	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
 					~HL_CS_FLAGS_FORCE_RESTORE);
 	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
@@ -2269,7 +2204,9 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		break;
 	default:
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
-						args->in.cs_flags, timeout);
+						args->in.cs_flags,
+						args->in.encaps_sig_handle_id,
+						timeout);
 		break;
 	}
 out:
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index abbba4194d3bff..007f3a48601cdb 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -16,7 +16,10 @@ void hl_encaps_handle_do_release(struct kref *ref)
 	struct hl_ctx *ctx = handle->hdev->compute_ctx;
 	struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
 
+	spin_lock(&mgr->lock);
 	idr_remove(&mgr->handles, handle->id);
+	spin_unlock(&mgr->lock);
+
 	kfree(handle);
 }
 
@@ -33,7 +36,10 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref)
 	 */
 	hw_sob_put(handle->hw_sob);
 
+	spin_lock(&mgr->lock);
 	idr_remove(&mgr->handles, handle->id);
+	spin_unlock(&mgr->lock);
+
 	kfree(handle);
 }
 
@@ -67,11 +73,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	struct hl_device *hdev = ctx->hdev;
 	int i;
 
-	/* Release all allocated pending cb's, those cb's were never
-	 * scheduled so it is safe to release them here
-	 */
-	hl_pending_cb_list_flush(ctx);
-
 	/* Release all allocated HW block mapped list entries and destroy
 	 * the mutex.
 	 */
@@ -198,11 +199,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	kref_init(&ctx->refcount);
 
 	ctx->cs_sequence = 1;
-	INIT_LIST_HEAD(&ctx->pending_cb_list);
-	spin_lock_init(&ctx->pending_cb_lock);
 	spin_lock_init(&ctx->cs_lock);
 	atomic_set(&ctx->thread_ctx_switch_token, 1);
-	atomic_set(&ctx->thread_pending_cb_token, 1);
 	ctx->thread_ctx_switch_wait_token = 0;
 	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
 				sizeof(struct hl_fence *),
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 81b6825e0c1c92..b72fcc9255aa89 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -605,11 +605,11 @@ struct hl_fence {
 
 /**
  * struct hl_cs_compl - command submission completion object.
- * @sob_reset_work: workqueue object to run SOB reset flow.
  * @base_fence: hl fence object.
  * @lock: spinlock to protect fence.
  * @hdev: habanalabs device structure.
  * @hw_sob: the H/W SOB used in this signal/wait CS.
+ * @encaps_sig_hdl: encaps signals handle.
  * @cs_seq: command submission sequence number.
  * @type: type of the CS - signal/wait.
  * @sob_val: the SOB value that is used in this signal/wait CS.
@@ -618,11 +618,11 @@ struct hl_fence {
  * encaps signals or not.
  */
 struct hl_cs_compl {
-	struct work_struct	sob_reset_work;
 	struct hl_fence		base_fence;
 	spinlock_t		lock;
 	struct hl_device	*hdev;
 	struct hl_hw_sob	*hw_sob;
+	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
 	u64			cs_seq;
 	enum hl_cs_type		type;
 	u16			sob_val;
@@ -1267,8 +1267,9 @@ struct hl_asic_funcs {
 	u64 (*get_device_time)(struct hl_device *hdev);
 	int (*collective_wait_init_cs)(struct hl_cs *cs);
 	int (*collective_wait_create_jobs)(struct hl_device *hdev,
-			struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-			u32 collective_engine_id);
+			struct hl_ctx *ctx, struct hl_cs *cs,
+			u32 wait_queue_id, u32 collective_engine_id,
+			u32 encaps_signal_offset);
 	u64 (*scramble_addr)(struct hl_device *hdev, u64 addr);
 	u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
 	void (*ack_protection_bits_errors)(struct hl_device *hdev);
@@ -1339,20 +1340,6 @@ struct hl_cs_counters_atomic {
 	atomic64_t validation_drop_cnt;
 };
 
-/**
- * struct hl_pending_cb - pending command buffer structure
- * @cb_node: cb node in pending cb list
- * @cb: command buffer to send in next submission
- * @cb_size: command buffer size
- * @hw_queue_id: destination queue id
- */
-struct hl_pending_cb {
-	struct list_head	cb_node;
-	struct hl_cb		*cb;
-	u32			cb_size;
-	u32			hw_queue_id;
-};
-
 /**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
@@ -1369,8 +1356,6 @@ struct hl_pending_cb {
  *            MMU hash or walking the PGT requires talking this lock.
  * @hw_block_list_lock: protects the HW block memory list.
  * @debugfs_list: node in debugfs list of contexts.
- * pending_cb_list: list of pending command buffers waiting to be sent upon
- *                  next user command submission context.
  * @hw_block_mem_list: list of HW block virtual mapped addresses.
  * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
@@ -1381,17 +1366,11 @@ struct hl_pending_cb {
  *			index to cs_pending array.
  * @dram_default_hops: array that holds all hops addresses needed for default
  *                     DRAM mapping.
- * @pending_cb_lock: spinlock to protect pending cb list
  * @cs_lock: spinlock to protect cs_sequence.
  * @dram_phys_mem: amount of used physical DRAM memory by this context.
  * @thread_ctx_switch_token: token to prevent multiple threads of the same
  *				context	from running the context switch phase.
  *				Only a single thread should run it.
- * @thread_pending_cb_token: token to prevent multiple threads from processing
- *				the pending CB list. Only a single thread should
- *				process the list since it is protected by a
- *				spinlock and we don't want to halt the entire
- *				command submission sequence.
  * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
  *				the context switch phase from moving to their
  *				execution phase before the context switch phase
@@ -1411,18 +1390,15 @@ struct hl_ctx {
 	struct mutex			mmu_lock;
 	struct mutex			hw_block_list_lock;
 	struct list_head		debugfs_list;
-	struct list_head		pending_cb_list;
 	struct list_head		hw_block_mem_list;
 	struct hl_cs_counters_atomic	cs_counters;
 	struct gen_pool			*cb_va_pool;
 	struct hl_encaps_signals_mgr	sig_mgr;
 	u64				cs_sequence;
 	u64				*dram_default_hops;
-	spinlock_t			pending_cb_lock;
 	spinlock_t			cs_lock;
 	atomic64_t			dram_phys_mem;
 	atomic_t			thread_ctx_switch_token;
-	atomic_t			thread_pending_cb_token;
 	u32				thread_ctx_switch_wait_token;
 	u32				asid;
 	u32				handle;
@@ -1485,12 +1461,14 @@ struct hl_userptr {
  * @mirror_node : node in device mirror list of command submissions.
  * @staged_cs_node: node in the staged cs list.
  * @debugfs_list: node in debugfs list of command submissions.
+ * @encaps_sig_hdl: holds the encaps signals handle.
  * @sequence: the sequence number of this CS.
  * @staged_sequence: the sequence of the staged submission this CS is part of,
  *                   relevant only if staged_cs is set.
  * @timeout_jiffies: cs timeout in jiffies.
  * @submission_time_jiffies: submission time of the cs
  * @type: CS_TYPE_*.
+ * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
  * @timedout : true if CS was timedout.
@@ -1504,6 +1482,7 @@ struct hl_userptr {
  * @staged_cs: true if this CS is part of a staged submission.
  * @skip_reset_on_timeout: true if we shall not reset the device in case
  *                         timeout occurs (debug scenario).
+ * @encaps_signals: true if this CS has encaps reserved signals.
  */
 struct hl_cs {
 	u16			*jobs_in_queue_cnt;
@@ -1518,11 +1497,13 @@ struct hl_cs {
 	struct list_head	mirror_node;
 	struct list_head	staged_cs_node;
 	struct list_head	debugfs_list;
+	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
 	u64			sequence;
 	u64			staged_sequence;
 	u64			timeout_jiffies;
 	u64			submission_time_jiffies;
 	enum hl_cs_type		type;
+	u32			encaps_sig_hdl_id;
 	u8			submitted;
 	u8			completed;
 	u8			timedout;
@@ -1533,6 +1514,7 @@ struct hl_cs {
 	u8			staged_first;
 	u8			staged_cs;
 	u8			skip_reset_on_timeout;
+	u8			encaps_signals;
 };
 
 /**
@@ -1552,6 +1534,8 @@ struct hl_cs {
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @job_cb_size: the actual size of the CB that we put on the queue.
+ * @encaps_sig_wait_offset: encapsulated signals offset, which allow user
+ *                          to wait on part of the reserved signals.
  * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
  *                          handle to a kernel-allocated CB object, false
  *                          otherwise (SRAM/DRAM/host address).
@@ -1576,6 +1560,7 @@ struct hl_cs_job {
 	u32			hw_queue_id;
 	u32			user_cb_size;
 	u32			job_cb_size;
+	u32			encaps_sig_wait_offset;
 	u8			is_kernel_allocated_cb;
 	u8			contains_dma_pkt;
 };
@@ -2794,7 +2779,6 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
 void hl_cb_va_pool_fini(struct hl_ctx *ctx);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
-void hl_pending_cb_list_flush(struct hl_ctx *ctx);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);
@@ -2935,9 +2919,12 @@ int hl_set_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
-void hl_encaps_handle_do_release(struct kref *ref);
 void hw_sob_get(struct hl_hw_sob *hw_sob);
 void hw_sob_put(struct hl_hw_sob *hw_sob);
+void hl_encaps_handle_do_release(struct kref *ref);
+void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
+			struct hl_cs *cs, struct hl_cs_job *job,
+			struct hl_cs_compl *cs_cmpl);
 void hl_release_pending_user_interrupts(struct hl_device *hdev);
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig);
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 9a59b8e9bf530e..6d3beccad91bfe 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -416,7 +416,7 @@ static int init_signal_cs(struct hl_device *hdev,
 	cs_cmpl->sob_val = prop->next_sob_val;
 
 	dev_dbg(hdev->dev,
-		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d, seq: %llu\n",
+		"generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n",
 		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
 		cs_cmpl->cs_seq);
 
@@ -432,12 +432,31 @@ static int init_signal_cs(struct hl_device *hdev,
 	return rc;
 }
 
+/* Take the SOB id and wait value of a wait CS from its encaps signals handle */
+void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
+			struct hl_cs *cs, struct hl_cs_job *job,
+			struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
+	u32 offset = job->encaps_sig_wait_offset;
+
+	cs_cmpl->hw_sob = handle->hw_sob;
+
+	/* The wait offset is 1-based and was already checked against the
+	 * reserved signals count earlier in the flow: offset 1 waits for the
+	 * first reserved signal (pre_sob_val), offset 2 for (pre_sob_val + 1)
+	 * and so on. That check does not reject offset 0, so treat 0 like 1
+	 * instead of letting the unsigned (offset - 1) wrap around.
+	 */
+	cs_cmpl->sob_val = handle->pre_sob_val + (offset ? offset - 1 : 0);
+}
+
 static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
 		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
 {
-	struct hl_cs_compl *signal_cs_cmpl;
-	struct hl_sync_stream_properties *prop;
 	struct hl_gen_wait_properties wait_prop;
+	struct hl_sync_stream_properties *prop;
+	struct hl_cs_compl *signal_cs_cmpl;
 	u32 q_idx;
 
 	q_idx = job->hw_queue_id;
@@ -447,9 +466,23 @@ static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
 					struct hl_cs_compl,
 					base_fence);
 
-	/* copy the SOB id and value of the signal CS */
-	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+	if (cs->encaps_signals) {
+		/* use the encaps signal handle stored earlier in the flow
+		 * and set the SOB information from the encaps
+		 * signals handle
+		 */
+		hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl);
+
+		dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n",
+				cs->encaps_sig_hdl->q_idx,
+				cs->encaps_sig_hdl->cs_seq,
+				cs_cmpl->sob_val,
+				job->encaps_sig_wait_offset);
+	} else {
+		/* Copy the SOB id and value of the signal CS */
+		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+	}
 
 	/* check again if the signal cs already completed.
 	 * if yes then don't send any wait cs since the hw_sob
@@ -523,6 +556,59 @@ static int init_signal_wait_cs(struct hl_cs *cs)
 	return rc;
 }
 
+static int encaps_sig_first_staged_cs_handler
+			(struct hl_device *hdev, struct hl_cs *cs)
+{
+	struct hl_cs_compl *cs_cmpl =
+			container_of(cs->fence,
+					struct hl_cs_compl, base_fence);
+	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
+	struct hl_encaps_signals_mgr *mgr;
+	int rc = 0;
+
+	mgr = &hdev->compute_ctx->sig_mgr;
+
+	spin_lock(&mgr->lock);
+	encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id);
+	if (encaps_sig_hdl) {
+		/*
+		 * Record in the handle the sequence of this CS, which is
+		 * the CS that contains the encapsulated signals.
+		 */
+		encaps_sig_hdl->cs_seq = cs->sequence;
+		/* store the handle and set the encaps signals indication,
+		 * to be used later in cs_do_release to put the last
+		 * reference to the encaps signals handle.
+		 */
+		cs_cmpl->encaps_signals = true;
+		cs_cmpl->encaps_sig_hdl = encaps_sig_hdl;
+
+		/* set the hw_sob pointer in the completion object since
+		 * it's used in the cs_do_release flow to put the sob
+		 * refcount; sob_val is pre_sob_val plus the reserved count
+		 */
+		cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob;
+		cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val +
+						encaps_sig_hdl->count;
+
+		dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n",
+				cs->sequence, encaps_sig_hdl->id,
+				encaps_sig_hdl->count,
+				encaps_sig_hdl->q_idx,
+				cs_cmpl->hw_sob->sob_id,
+				cs_cmpl->sob_val);
+
+	} else {
+		dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n",
+				cs->encaps_sig_hdl_id);
+		rc = -EINVAL;
+	}
+
+	spin_unlock(&mgr->lock);
+
+	return rc;
+}
+
 /*
  * hl_hw_queue_schedule_cs - schedule a command submission
  * @cs: pointer to the CS
@@ -602,6 +688,12 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 	}
 
 
+	if (cs->encaps_signals && cs->staged_first) {
+		rc = encaps_sig_first_staged_cs_handler(hdev, cs);
+		if (rc)
+			goto unroll_cq_resv;
+	}
+
 	spin_lock(&hdev->cs_mirror_lock);
 
 	/* Verify staged CS exists and add to the staged list */
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index ae1a8b4e694c17..6187e2e802bc85 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -456,8 +456,6 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
 					u32 size, u64 val);
 static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base,
 					u32 num_regs, u32 val);
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-		u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val);
 static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
 				u32 tpc_id);
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
@@ -468,7 +466,6 @@ static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
 				u32 size, bool eb);
 static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
 				struct hl_gen_wait_properties *prop);
-
 static inline enum hl_collective_mode
 get_collective_mode(struct hl_device *hdev, u32 queue_id)
 {
@@ -1068,17 +1065,11 @@ static void gaudi_sob_group_hw_reset(struct kref *ref)
 	struct gaudi_hw_sob_group *hw_sob_group =
 		container_of(ref, struct gaudi_hw_sob_group, kref);
 	struct hl_device *hdev = hw_sob_group->hdev;
-	u64 base_addr;
-	int rc;
+	int i;
 
-	base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-			hw_sob_group->base_sob_id * 4;
-	rc = gaudi_schedule_register_memset(hdev, hw_sob_group->queue_id,
-			base_addr, NUMBER_OF_SOBS_IN_GRP, 0);
-	if (rc)
-		dev_err(hdev->dev,
-			"failed resetting sob group - sob base %u, count %u",
-			hw_sob_group->base_sob_id, NUMBER_OF_SOBS_IN_GRP);
+	for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++)
+		WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+			(hw_sob_group->base_sob_id * 4) + (i * 4)), 0);
 
 	kref_init(&hw_sob_group->kref);
 }
@@ -1215,6 +1206,20 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
 	queue_id = job->hw_queue_id;
 	prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
 
+	if (job->cs->encaps_signals) {
+		/* use the encaps signal handle stored earlier in the flow
+		 * and set the SOB information from the encaps
+		 * signals handle
+		 */
+		hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job,
+						cs_cmpl);
+
+		dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u,  wait for sob_val: %u\n",
+				job->cs->sequence,
+				cs_cmpl->hw_sob->sob_id,
+				cs_cmpl->sob_val);
+	}
+
 	/* Add to wait CBs using slave monitor */
 	wait_prop.data = (void *) job->user_cb;
 	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
@@ -1225,7 +1230,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
 	wait_prop.size = cb_size;
 
 	dev_dbg(hdev->dev,
-		"Generate slave wait CB, sob %d, val:0x%x, mon %d, q %d\n",
+		"Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n",
 		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
 		prop->collective_slave_mon_id, queue_id);
 
@@ -1257,9 +1262,14 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 	gaudi = hdev->asic_specific;
 	cprop = &gaudi->collective_props;
 
-	/* copy the SOB id and value of the signal CS */
-	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+	/* In encaps signals case the SOB info will be retrieved from
+	 * the handle in gaudi_collective_slave_init_job.
+	 */
+	if (!cs->encaps_signals) {
+		/* copy the SOB id and value of the signal CS */
+		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+	}
 
 	/* check again if the signal cs already completed.
 	 * if yes then don't send any wait cs since the hw_sob
@@ -1336,7 +1346,8 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 
 static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 		struct hl_ctx *ctx, struct hl_cs *cs,
-		enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id)
+		enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id,
+		u32 encaps_signal_offset)
 {
 	struct hw_queue_properties *hw_queue_prop;
 	struct hl_cs_counters_atomic *cntr;
@@ -1396,6 +1407,13 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 	job->user_cb_size = cb_size;
 	job->hw_queue_id = queue_id;
 
+	/* since it's guaranteed to have only one chunk in the collective wait
+	 * cs, we can use this chunk to set the encapsulated signal offset
+	 * in the jobs.
+	 */
+	if (cs->encaps_signals)
+		job->encaps_sig_wait_offset = encaps_signal_offset;
+
 	/*
 	 * No need in parsing, user CB is the patched CB.
 	 * We call hl_cb_destroy() out of two reasons - we don't need
@@ -1424,8 +1442,9 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 }
 
 static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
-		struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-		u32 collective_engine_id)
+		struct hl_ctx *ctx, struct hl_cs *cs,
+		u32 wait_queue_id, u32 collective_engine_id,
+		u32 encaps_signal_offset)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	struct hw_queue_properties *hw_queue_prop;
@@ -1475,7 +1494,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
 		if (i == 0) {
 			queue_id = wait_queue_id;
 			rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-				HL_COLLECTIVE_MASTER, queue_id, wait_queue_id);
+				HL_COLLECTIVE_MASTER, queue_id,
+				wait_queue_id, encaps_signal_offset);
 		} else {
 			if (nic_idx < NIC_NUMBER_OF_ENGINES) {
 				if (gaudi->hw_cap_initialized &
@@ -1495,7 +1515,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
 			}
 
 			rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-				HL_COLLECTIVE_SLAVE, queue_id, wait_queue_id);
+				HL_COLLECTIVE_SLAVE, queue_id,
+				wait_queue_id, encaps_signal_offset);
 		}
 
 		if (rc)
@@ -5909,78 +5930,6 @@ static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base,
 	return rc;
 }
 
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-		u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val)
-{
-	struct hl_ctx *ctx;
-	struct hl_pending_cb *pending_cb;
-	struct packet_msg_long *pkt;
-	u32 cb_size, ctl;
-	struct hl_cb *cb;
-	int i, rc;
-
-	mutex_lock(&hdev->fpriv_list_lock);
-	ctx = hdev->compute_ctx;
-
-	/* If no compute context available or context is going down
-	 * memset registers directly
-	 */
-	if (!ctx || kref_read(&ctx->refcount) == 0) {
-		rc = gaudi_memset_registers(hdev, reg_base, num_regs, val);
-		mutex_unlock(&hdev->fpriv_list_lock);
-		return rc;
-	}
-
-	mutex_unlock(&hdev->fpriv_list_lock);
-
-	cb_size = (sizeof(*pkt) * num_regs) +
-			sizeof(struct packet_msg_prot) * 2;
-
-	if (cb_size > SZ_2M) {
-		dev_err(hdev->dev, "CB size must be smaller than %uMB", SZ_2M);
-		return -ENOMEM;
-	}
-
-	pending_cb = kzalloc(sizeof(*pending_cb), GFP_KERNEL);
-	if (!pending_cb)
-		return -ENOMEM;
-
-	cb = hl_cb_kernel_create(hdev, cb_size, false);
-	if (!cb) {
-		kfree(pending_cb);
-		return -EFAULT;
-	}
-
-	pkt = cb->kernel_address;
-
-	ctl = FIELD_PREP(GAUDI_PKT_LONG_CTL_OP_MASK, 0); /* write the value */
-	ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_LONG);
-	ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
-	ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
-	ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
-
-	for (i = 0; i < num_regs ; i++, pkt++) {
-		pkt->ctl = cpu_to_le32(ctl);
-		pkt->value = cpu_to_le32(val);
-		pkt->addr = cpu_to_le64(reg_base + (i * 4));
-	}
-
-	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
-
-	pending_cb->cb = cb;
-	pending_cb->cb_size = cb_size;
-	/* The queue ID MUST be an external queue ID. Otherwise, we will
-	 * have undefined behavior
-	 */
-	pending_cb->hw_queue_id = hw_queue_id;
-
-	spin_lock(&ctx->pending_cb_lock);
-	list_add_tail(&pending_cb->cb_node, &ctx->pending_cb_list);
-	spin_unlock(&ctx->pending_cb_lock);
-
-	return 0;
-}
-
 static int gaudi_restore_sm_registers(struct hl_device *hdev)
 {
 	u64 base_addr;
@@ -9031,16 +8980,12 @@ static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
 static void gaudi_reset_sob(struct hl_device *hdev, void *data)
 {
 	struct hl_hw_sob *hw_sob = (struct hl_hw_sob *) data;
-	int rc;
 
 	dev_dbg(hdev->dev, "reset SOB, q_idx: %d, sob_id: %d\n", hw_sob->q_idx,
 		hw_sob->sob_id);
 
-	rc = gaudi_schedule_register_memset(hdev, hw_sob->q_idx,
-			CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-			hw_sob->sob_id * 4, 1, 0);
-	if (rc)
-		dev_err(hdev->dev, "failed resetting sob %u", hw_sob->sob_id);
+	WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+			hw_sob->sob_id * 4, 0);
 
 	kref_init(&hw_sob->kref);
 }
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 8a689bf42397e6..c070cd14753e4d 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5487,7 +5487,7 @@ static int goya_collective_wait_init_cs(struct hl_cs *cs)
 
 static int goya_collective_wait_create_jobs(struct hl_device *hdev,
 		struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-		u32 collective_engine_id)
+		u32 collective_engine_id, u32 encaps_signal_offset)
 {
 	return -EINVAL;
 }

From 4956ab3df6862c3ae7f79fa58220612709ce60cf Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Wed, 14 Jul 2021 11:01:21 +0300
Subject: [PATCH 32/55] habanalabs: remove redundant warning message

This warning is redundant as we will print a notice in case the device
is still in use after the FD was closed. No need to print the same
message per context.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/context.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 007f3a48601cdb..22978303ad63f1 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -185,9 +185,6 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx)
 {
 	if (kref_put(&ctx->refcount, hl_ctx_do_release) == 1)
 		return;
-
-	dev_warn(hdev->dev,
-		"user process released device but its command submissions are still executing\n");
 }
 
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)

From 05d8c605a0becc7028c224dc55ce2b3c1b2d6aa7 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 15 Jul 2021 10:48:43 +0300
Subject: [PATCH 33/55] habanalabs: expose server type in INFO IOCTL

Add the server type property to the hl_info_hw_ip_info structure
that is exposed to the user via the INFO IOCTL.

This is needed by the userspace s/w stack to know the connection map
of the internal links that connect the ASICs to each other inside the
server.

The F/W will tell us, as part of the NIC information, the server type
that the GAUDI is located in.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  |  2 +-
 drivers/misc/habanalabs/common/habanalabs.h   |  3 ++
 .../misc/habanalabs/common/habanalabs_ioctl.c |  2 +
 drivers/misc/habanalabs/gaudi/gaudi.c         |  2 +
 drivers/misc/habanalabs/goya/goya.c           |  2 +
 .../misc/habanalabs/include/common/cpucp_if.h | 11 +++++
 include/uapi/misc/habanalabs.h                | 49 +++++++++++++++++--
 7 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 869c6057ae3164..0be3f5414f0bad 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index b72fcc9255aa89..6c5c7c832af3a3 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -489,6 +489,8 @@ struct hl_hints_range {
  *                                       reserved for the user
  * @first_available_cq: first available CQ for the user.
  * @user_interrupt_count: number of user interrupts.
+ * @server_type: Server type that the ASIC is currently installed in.
+ *               The value is according to enum hl_server_type in uapi file.
  * @tpc_enabled_mask: which TPCs are enabled.
  * @completion_queues_count: number of completion queues.
  * @fw_security_enabled: true if security measures are enabled in firmware,
@@ -570,6 +572,7 @@ struct asic_fixed_properties {
 	u16				first_available_user_msix_interrupt;
 	u16				first_available_cq[HL_MAX_DCORES];
 	u16				user_interrupt_count;
+	u16				server_type;
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 	u8				fw_security_enabled;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index f4dda7b4acdd92..86c3257d9ae1fb 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -94,6 +94,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 
 	hw_ip.first_available_interrupt_id =
 			prop->first_available_user_msix_interrupt;
+	hw_ip.server_type = prop->server_type;
+
 	return copy_to_user(out, &hw_ip,
 		min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
 }
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 6187e2e802bc85..d3bd58404c38e5 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -642,6 +642,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->hard_reset_done_by_fw = false;
 	prop->gic_interrupts_enable = true;
 
+	prop->server_type = HL_SERVER_TYPE_UNKNOWN;
+
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index c070cd14753e4d..8d890c7cce081e 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -469,6 +469,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	prop->hard_reset_done_by_fw = false;
 	prop->gic_interrupts_enable = true;
 
+	prop->server_type = HL_SERVER_TYPE_UNKNOWN;
+
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 80b1d5a9d9f1eb..2d6f8ea35375e0 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -700,6 +700,15 @@ struct cpucp_mac_addr {
 	__u8 mac_addr[ETH_ALEN];
 };
 
+enum cpucp_serdes_type {
+	TYPE_1_SERDES_TYPE,
+	TYPE_2_SERDES_TYPE,
+	HLS1_SERDES_TYPE,
+	HLS1H_SERDES_TYPE,
+	UNKNOWN_SERDES_TYPE,
+	MAX_NUM_SERDES_TYPE = UNKNOWN_SERDES_TYPE
+};
+
 struct cpucp_nic_info {
 	struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS];
 	__le64 link_mask[CPUCP_NIC_MASK_ARR_LEN];
@@ -708,6 +717,8 @@ struct cpucp_nic_info {
 	__le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN];
 	__u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN];
 	__le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__le16 serdes_type; /* enum cpucp_serdes_type */
+	__u8 reserved[6];
 };
 
 #endif /* CPUCP_IF_H */
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index eca86c54591658..b877a5b7ee7477 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -279,6 +279,14 @@ enum hl_device_status {
 	HL_DEVICE_STATUS_NEEDS_RESET
 };
 
+enum hl_server_type {
+	HL_SERVER_TYPE_UNKNOWN = 0,
+	HL_SERVER_GAUDI_HLS1 = 1,
+	HL_SERVER_GAUDI_HLS1H = 2,
+	HL_SERVER_GAUDI_TYPE1 = 3,
+	HL_SERVER_GAUDI_TYPE2 = 4
+};
+
 /* Opcode for management ioctl
  *
  * HW_IP_INFO            - Receive information about different IP blocks in the
@@ -337,17 +345,50 @@ enum hl_device_status {
 #define HL_INFO_VERSION_MAX_LEN	128
 #define HL_INFO_CARD_NAME_MAX_LEN	16
 
+/**
+ * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC
+ * @sram_base_address: The first SRAM physical base address that is free to be
+ *                     used by the user.
+ * @dram_base_address: The first DRAM virtual or physical base address that is
+ *                     free to be used by the user.
+ * @dram_size: The DRAM size that is available to the user.
+ * @sram_size: The SRAM size that is available to the user.
+ * @num_of_events: The number of events that can be received from the f/w. This
+ *                 is needed so the user can know the size of the h/w events
+ *                 array he needs to pass to the kernel when he wants to fetch
+ *                 the event counters.
+ * @device_id: PCI device ID of the ASIC.
+ * @module_id: Module ID of the ASIC for mezzanine cards in servers
+ *             (From OCP spec).
+ * @first_available_interrupt_id: The first available interrupt ID for the user
+ *                                to be used when it works with user interrupts.
+ *                                Relevant for Gaudi2 and later.
+ * @server_type: Server type that the Gaudi ASIC is currently installed in.
+ *               The value is according to enum hl_server_type
+ * @cpld_version: CPLD version on the board.
+ * @psoc_pci_pll_nr: PCI PLL NR value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_nf: PCI PLL NF value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_od: PCI PLL OD value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_div_factor: PCI PLL DIV factor value. Needed by the profiler
+ *                           in some ASICs.
+ * @tpc_enabled_mask: Bit-mask that represents which TPCs are enabled. Relevant
+ *                    for Goya/Gaudi only.
+ * @dram_enabled: Whether the DRAM is enabled.
+ * @cpucp_version: The CPUCP f/w version.
+ * @card_name: The card name as passed by the f/w.
+ * @dram_page_size: The DRAM physical page size.
+ */
 struct hl_info_hw_ip_info {
 	__u64 sram_base_address;
 	__u64 dram_base_address;
 	__u64 dram_size;
 	__u32 sram_size;
 	__u32 num_of_events;
-	__u32 device_id; /* PCI Device ID */
-	__u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */
+	__u32 device_id;
+	__u32 module_id;
 	__u32 reserved;
 	__u16 first_available_interrupt_id;
-	__u16 reserved2;
+	__u16 server_type;
 	__u32 cpld_version;
 	__u32 psoc_pci_pll_nr;
 	__u32 psoc_pci_pll_nf;
@@ -358,7 +399,7 @@ struct hl_info_hw_ip_info {
 	__u8 pad[2];
 	__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
 	__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
-	__u64 reserved3;
+	__u64 reserved2;
 	__u64 dram_page_size;
 };
 

From 2e62870110f8e3fae9e1a976f3c3bf4ebe44e672 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Sun, 18 Jul 2021 12:06:41 +0300
Subject: [PATCH 34/55] habanalabs: convert PCI BAR offset to u64

The offset inside the BAR is converted to u64 because the BAR size can
exceed 4GB.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6c5c7c832af3a3..43d938bd320ef4 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -931,7 +931,7 @@ struct pci_mem_region {
 	u64 region_base;
 	u64 region_size;
 	u64 bar_size;
-	u32 offset_in_bar;
+	u64 offset_in_bar;
 	u8 bar_id;
 	u8 used;
 };

From 55ce8213ee60a9a58d68cee95f22b7777471f9c8 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sun, 11 Apr 2021 08:26:50 +0300
Subject: [PATCH 35/55] habanalabs: define uAPI to export FD for DMA-BUF

User process might want to share the device memory with another
driver/device, and to allow it to access it over PCIe (P2P).

To enable this, we utilize the dma-buf mechanism and add dma-buf
exporter support, so the other driver can import the device memory and
access it.

The device memory is allocated using our existing allocation uAPI,
where the user will get a handle that represents the allocation.

The user will then need to call the new
uAPI (HL_MEM_OP_EXPORT_DMABUF_FD) and give the handle as a parameter.

The driver will return a FD that represents the DMA-BUF object that
was created to match that allocation.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/uapi/misc/habanalabs.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index b877a5b7ee7477..e3425bcc6d15da 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -952,6 +952,10 @@ union hl_wait_cs_args {
 #define HL_MEM_OP_UNMAP			3
 /* Opcode to map a hw block */
 #define HL_MEM_OP_MAP_BLOCK		4
+/* Opcode to create DMA-BUF object for an existing device memory allocation
+ * and to export an FD of that DMA-BUF back to the caller
+ */
+#define HL_MEM_OP_EXPORT_DMABUF_FD	5
 
 /* Memory flags */
 #define HL_MEM_CONTIGUOUS	0x1
@@ -1023,11 +1027,26 @@ struct hl_mem_in {
 			/* Virtual address returned from HL_MEM_OP_MAP */
 			__u64 device_virt_addr;
 		} unmap;
+
+		/* HL_MEM_OP_EXPORT_DMABUF_FD */
+		struct {
+			/* Handle returned from HL_MEM_OP_ALLOC. In Gaudi,
+			 * where we don't have MMU for the device memory, the
+			 * driver expects a physical address (instead of
+			 * a handle) in the device memory space.
+			 */
+			__u64 handle;
+			/* Size of memory allocation. Relevant only for GAUDI */
+			__u64 mem_size;
+		} export_dmabuf_fd;
 	};
 
 	/* HL_MEM_OP_* */
 	__u32 op;
-	/* HL_MEM_* flags */
+	/* HL_MEM_* flags.
+	 * For the HL_MEM_OP_EXPORT_DMABUF_FD opcode, this field holds the
+	 * DMA-BUF file/FD flags.
+	 */
 	__u32 flags;
 	/* Context ID - Currently not in use */
 	__u32 ctx_id;
@@ -1064,6 +1083,13 @@ struct hl_mem_out {
 
 			__u32 pad;
 		};
+
+		/* Returned in HL_MEM_OP_EXPORT_DMABUF_FD. Represents the
+		 * DMA-BUF object that was created to describe a memory
+		 * allocation on the device's memory space. The FD should be
+		 * passed to the importer driver
+		 */
+		__u64 fd;
 	};
 };
 

From 3ec2198abc5ff38ed256a16c3f57011c4a19619b Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Tue, 15 Jun 2021 10:55:43 +0300
Subject: [PATCH 36/55] habanalabs: add support for dma-buf exporter

Implement the calls to the dma-buf kernel api to create a dma-buf
object backed by FD.

We block the option to mmap the DMA-BUF object because we don't support
DIRECT_IO and implicit P2P. We only implement support for explicit P2P
through importing the FD of the DMA-BUF.

In the export phase, we provide to the DMA-BUF object an array of pages
that represent the device's memory area. During the map callback,
we convert the array of pages into an SGT. We split/merge the pages
according to the dma max segment size of the importer.

To get the DMA address of the PCI BAR, we use the dma_map_resource()
kernel API, because our device memory is not backed by page struct
and this API doesn't need page struct to map the physical address to
a DMA address.

We set the orig_nents member of the SGT to be 0, to indicate to other
drivers that we don't support CPU mappings.

Note that in Habanalabs's ASICs, the device memory is pinned and
immutable. Therefore, there is no need for dynamic mappings and pinning
callbacks.

Also note that in GAUDI we don't have an MMU towards the device memory
and the user works on physical addresses. Therefore, the user doesn't
pass through the kernel driver to allocate memory there. As a result,
only for GAUDI we receive from the user a device memory physical address
(instead of a handle) and a size.

We check the p2p distance using pci_p2pdma_distance_many() and refuse
to map the dmabuf in case the distance doesn't allow p2p.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Gal Pressman <galpress@amazon.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/Kconfig             |   1 +
 drivers/misc/habanalabs/common/habanalabs.h |  22 +
 drivers/misc/habanalabs/common/memory.c     | 522 +++++++++++++++++++-
 drivers/misc/habanalabs/gaudi/gaudi.c       |   1 +
 drivers/misc/habanalabs/goya/goya.c         |   1 +
 5 files changed, 543 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/Kconfig b/drivers/misc/habanalabs/Kconfig
index 293d79811372ce..c82d2e7b203540 100644
--- a/drivers/misc/habanalabs/Kconfig
+++ b/drivers/misc/habanalabs/Kconfig
@@ -8,6 +8,7 @@ config HABANA_AI
 	depends on PCI && HAS_IOMEM
 	select GENERIC_ALLOCATOR
 	select HWMON
+	select DMA_SHARED_BUFFER
 	help
 	  Enables PCIe card driver for Habana's AI Processors (AIP) that are
 	  designed to accelerate Deep Learning inference and training workloads.
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 43d938bd320ef4..3daccd775caf21 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -26,6 +26,7 @@
 #include <linux/sched/signal.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/coresight.h>
+#include <linux/dma-buf.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -1343,6 +1344,23 @@ struct hl_cs_counters_atomic {
 	atomic64_t validation_drop_cnt;
 };
 
+/**
+ * struct hl_dmabuf_wrapper - a dma-buf wrapper object.
+ * @dmabuf: pointer to dma-buf object.
+ * @ctx: pointer to the dma-buf owner's context.
+ * @phys_pg_pack: pointer to physical page pack if the dma-buf was exported for
+ *                memory allocation handle.
+ * @device_address: physical address of the device's memory. Relevant only
+ *                  if phys_pg_pack is NULL (dma-buf was exported from address).
+ *                  The total size can be taken from the dmabuf object.
+ */
+struct hl_dmabuf_wrapper {
+	struct dma_buf			*dmabuf;
+	struct hl_ctx			*ctx;
+	struct hl_vm_phys_pg_pack	*phys_pg_pack;
+	uint64_t			device_address;
+};
+
 /**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
@@ -1651,6 +1669,7 @@ struct hl_vm_hw_block_list_node {
  * @npages: num physical pages in the pack.
  * @total_size: total size of all the pages in this list.
  * @mapping_cnt: number of shared mappings.
+ * @exporting_cnt: number of dma-buf exports.
  * @asid: the context related to this list.
  * @page_size: size of each page in the pack.
  * @flags: HL_MEM_* flags related to this list.
@@ -1665,6 +1684,7 @@ struct hl_vm_phys_pg_pack {
 	u64			npages;
 	u64			total_size;
 	atomic_t		mapping_cnt;
+	u32			exporting_cnt;
 	u32			asid;
 	u32			page_size;
 	u32			flags;
@@ -2381,6 +2401,7 @@ struct multi_cs_data {
  *                          the error will be ignored by the driver during
  *                          device initialization. Mainly used to debug and
  *                          workaround firmware bugs
+ * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM.
  * @last_successful_open_jif: timestamp (jiffies) of the last successful
  *                            device open.
  * @last_open_session_duration_jif: duration (jiffies) of the last device open
@@ -2521,6 +2542,7 @@ struct hl_device {
 	u64				max_power;
 	u64				clock_gating_mask;
 	u64				boot_error_status_mask;
+	u64				dram_pci_bar_start;
 	u64				last_successful_open_jif;
 	u64				last_open_session_duration_jif;
 	u64				open_counter;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index efc460e9db5d2f..a75207611b9bde 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -11,11 +11,13 @@
 
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
 
 #define HL_MMU_DEBUG	0
 
 /* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
-#define DRAM_POOL_PAGE_SIZE SZ_8M
+#define DRAM_POOL_PAGE_SIZE		SZ_8M
+
 
 /*
  * The va ranges in context object contain a list with the available chunks of
@@ -347,6 +349,13 @@ static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
 			return -EINVAL;
 		}
 
+		if (phys_pg_pack->exporting_cnt) {
+			dev_err(hdev->dev,
+				"handle %u is exported, cannot free\n",	handle);
+			spin_unlock(&vm->idr_lock);
+			return -EINVAL;
+		}
+
 		/*
 		 * must remove from idr before the freeing of the physical
 		 * pages as the refcount of the pool is also the trigger of the
@@ -1504,13 +1513,492 @@ int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
+			struct device *dev, enum dma_data_direction dir)
+{
+	dma_addr_t addr;
+	int rc;
+
+	addr = dma_map_resource(dev, bar_address, chunk_size, dir,
+				DMA_ATTR_SKIP_CPU_SYNC);
+	rc = dma_mapping_error(dev, addr);
+	if (rc)
+		return rc;
+
+	sg_set_page(sg, NULL, chunk_size, 0);
+	sg_dma_address(sg) = addr;
+	sg_dma_len(sg) = chunk_size;
+
+	return 0;
+}
+
+static int alloc_sgt_from_device_pages(struct hl_device *hdev,
+					struct sg_table **sgt, u64 *pages,
+					u64 npages, u64 page_size,
+					struct device *dev,
+					enum dma_data_direction dir)
+{
+	u64 chunk_size, bar_address, dma_max_seg_size;
+	struct asic_fixed_properties *prop;
+	int rc, i, j, nents, cur_page;
+	struct scatterlist *sg;
+
+	prop = &hdev->asic_prop;
+
+	dma_max_seg_size = dma_get_max_seg_size(dev);
+
+	/* We would like to align the max segment size to PAGE_SIZE, so the
+	 * SGL will contain aligned addresses that can be easily mapped to
+	 * an MMU
+	 */
+	dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
+	if (dma_max_seg_size < PAGE_SIZE) {
+		dev_err_ratelimited(hdev->dev,
+				"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
+				dma_max_seg_size);
+		return -EINVAL;
+	}
+
+	*sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
+	if (!*sgt)
+		return -ENOMEM;
+
+	/* If the size of each page is larger than the dma max segment size,
+	 * then we can't combine pages and the number of entries in the SGL
+	 * will just be the
+	 * <number of pages> * <chunks of max segment size in each page>
+	 */
+	if (page_size > dma_max_seg_size)
+		nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
+	else
+		/* Get number of non-contiguous chunks */
+		for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
+			if (pages[i - 1] + page_size != pages[i] ||
+					chunk_size + page_size > dma_max_seg_size) {
+				nents++;
+				chunk_size = page_size;
+				continue;
+			}
+
+			chunk_size += page_size;
+		}
+
+	rc = sg_alloc_table(*sgt, nents, GFP_KERNEL | __GFP_ZERO);
+	if (rc)
+		goto error_free;
+
+	/* Because we are not going to include a CPU list we want to have some
+	 * chance that other users will detect this by setting the orig_nents
+	 * to 0 and using only nents (length of DMA list) when going over the
+	 * sgl
+	 */
+	(*sgt)->orig_nents = 0;
+
+	cur_page = 0;
+
+	if (page_size > dma_max_seg_size) {
+		u64 size_left, cur_device_address = 0;
+
+		size_left = page_size;
+
+		/* Need to split each page into the number of chunks of
+		 * dma_max_seg_size
+		 */
+		for_each_sgtable_dma_sg((*sgt), sg, i) {
+			if (size_left == page_size)
+				cur_device_address =
+					pages[cur_page] - prop->dram_base_address;
+			else
+				cur_device_address += dma_max_seg_size;
+
+			chunk_size = min(size_left, dma_max_seg_size);
+
+			bar_address = hdev->dram_pci_bar_start + cur_device_address;
+
+			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
+			if (rc)
+				goto error_unmap;
+
+			if (size_left > dma_max_seg_size) {
+				size_left -= dma_max_seg_size;
+			} else {
+				cur_page++;
+				size_left = page_size;
+			}
+		}
+	} else {
+		/* Merge pages and put them into the scatterlist */
+		for_each_sgtable_dma_sg((*sgt), sg, i) {
+			chunk_size = page_size;
+			for (j = cur_page + 1 ; j < npages ; j++) {
+				if (pages[j - 1] + page_size != pages[j] ||
+						chunk_size + page_size > dma_max_seg_size)
+					break;
+
+				chunk_size += page_size;
+			}
+
+			bar_address = hdev->dram_pci_bar_start +
+					(pages[cur_page] - prop->dram_base_address);
+
+			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
+			if (rc)
+				goto error_unmap;
+
+			cur_page = j;
+		}
+	}
+
+	return 0;
+
+error_unmap:
+	for_each_sgtable_dma_sg((*sgt), sg, i) {
+		if (!sg_dma_len(sg))
+			continue;
+
+		dma_unmap_resource(dev, sg_dma_address(sg),
+					sg_dma_len(sg), dir,
+					DMA_ATTR_SKIP_CPU_SYNC);
+	}
+
+	/* Need to restore orig_nents because sg_free_table use that field */
+	(*sgt)->orig_nents = nents;
+	sg_free_table(*sgt);
+
+error_free:
+	kfree(*sgt);
+	return rc;
+}
+
+static int hl_dmabuf_attach(struct dma_buf *dmabuf,
+				struct dma_buf_attachment *attachment)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev;
+	int rc;
+
+	hl_dmabuf = dmabuf->priv;
+	hdev = hl_dmabuf->ctx->hdev;
+
+	rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
+
+	if (rc < 0)
+		attachment->peer2peer = false;
+
+	return 0;
+}
+
+static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
+					enum dma_data_direction dir)
+{
+	struct dma_buf *dma_buf = attachment->dmabuf;
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev;
+	struct sg_table *sgt;
+	int rc;
+
+	hl_dmabuf = dma_buf->priv;
+	hdev = hl_dmabuf->ctx->hdev;
+	phys_pg_pack = hl_dmabuf->phys_pg_pack;
+
+	if (!attachment->peer2peer) {
+		dev_err(hdev->dev,
+			"Failed to map dmabuf because p2p is disabled\n");
+		return ERR_PTR(-EPERM);
+	}
+
+	if (phys_pg_pack)
+		rc = alloc_sgt_from_device_pages(hdev, &sgt,
+						phys_pg_pack->pages,
+						phys_pg_pack->npages,
+						phys_pg_pack->page_size,
+						attachment->dev,
+						dir);
+	else
+		rc = alloc_sgt_from_device_pages(hdev, &sgt,
+						&hl_dmabuf->device_address,
+						1,
+						hl_dmabuf->dmabuf->size,
+						attachment->dev,
+						dir);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"failed (%d) to initialize sgt for dmabuf\n",
+			rc);
+		return ERR_PTR(rc);
+	}
+
+	return sgt;
+}
+
+static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
+				  struct sg_table *sgt,
+				  enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sgtable_dma_sg(sgt, sg, i)
+		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
+					sg_dma_len(sg), dir,
+					DMA_ATTR_SKIP_CPU_SYNC);
+
+	/* Need to restore orig_nents because sg_free_table use that field */
+	sgt->orig_nents = sgt->nents;
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+static void hl_release_dmabuf(struct dma_buf *dmabuf)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf = dmabuf->priv;
+	struct hl_ctx *ctx = hl_dmabuf->ctx;
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm *vm = &hdev->vm;
+
+	if (hl_dmabuf->phys_pg_pack) {
+		spin_lock(&vm->idr_lock);
+		hl_dmabuf->phys_pg_pack->exporting_cnt--;
+		spin_unlock(&vm->idr_lock);
+	}
+
+	hl_ctx_put(hl_dmabuf->ctx);
+
+	kfree(hl_dmabuf);
+}
+
+static const struct dma_buf_ops habanalabs_dmabuf_ops = {
+	.attach = hl_dmabuf_attach,
+	.map_dma_buf = hl_map_dmabuf,
+	.unmap_dma_buf = hl_unmap_dmabuf,
+	.release = hl_release_dmabuf,
+};
+
+static int export_dmabuf_common(struct hl_ctx *ctx,
+				struct hl_dmabuf_wrapper *hl_dmabuf,
+				u64 total_size, int flags, int *dmabuf_fd)
+{
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct hl_device *hdev = ctx->hdev;
+	int rc, fd;
+
+	exp_info.ops = &habanalabs_dmabuf_ops;
+	exp_info.size = total_size;
+	exp_info.flags = flags;
+	exp_info.priv = hl_dmabuf;
+
+	hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(hl_dmabuf->dmabuf)) {
+		dev_err(hdev->dev, "failed to export dma-buf\n");
+		return PTR_ERR(hl_dmabuf->dmabuf);
+	}
+
+	fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
+	if (fd < 0) {
+		dev_err(hdev->dev,
+			"failed to get a file descriptor for a dma-buf\n");
+		rc = fd;
+		goto err_dma_buf_put;
+	}
+
+	hl_dmabuf->ctx = ctx;
+	hl_ctx_get(hdev, hl_dmabuf->ctx);
+
+	*dmabuf_fd = fd;
+
+	return 0;
+
+err_dma_buf_put:
+	dma_buf_put(hl_dmabuf->dmabuf);
+	return rc;
+}
+
+/**
+ * export_dmabuf_from_addr() - export a dma-buf object for the given memory
+ *                             address and size.
+ * @ctx: pointer to the context structure.
+ * @device_addr: device memory physical address, must be PAGE_SIZE aligned.
+ * @size: size of device memory, must be at least PAGE_SIZE.
+ * @flags: DMA-BUF file/FD flags.
+ * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
+ *
+ * Verify that the range lies inside the user DRAM region and inside the DRAM
+ * PCI BAR, then create and export a dma-buf object for it and return a FD
+ * which is associated with the dma-buf object.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
+					u64 size, int flags, int *dmabuf_fd)
+{
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop;
+	u64 bar_address;
+	int rc;
+
+	prop = &hdev->asic_prop;
+
+	if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
+		dev_err_ratelimited(hdev->dev,
+			"address of exported device memory should be aligned to 0x%lx, address 0x%llx\n",
+			PAGE_SIZE, device_addr);
+		return -EINVAL;
+	}
+
+	if (size < PAGE_SIZE) {
+		dev_err_ratelimited(hdev->dev,
+			"size %llu of exported device memory should be equal to or greater than %lu\n",
+			size, PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (device_addr < prop->dram_user_base_address ||
+				device_addr + size > prop->dram_end_address ||
+				device_addr + size < device_addr) {
+		dev_err_ratelimited(hdev->dev,
+			"DRAM memory range is outside of DRAM boundaries, address 0x%llx, size 0x%llx\n",
+			device_addr, size);
+		return -EINVAL;
+	}
+
+	bar_address = hdev->dram_pci_bar_start +
+			(device_addr - prop->dram_base_address);
+
+	if (bar_address + size >
+			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
+			bar_address + size < bar_address) {
+		dev_err_ratelimited(hdev->dev,
+			"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%llx\n",
+			device_addr, size);
+		return -EINVAL;
+	}
+
+	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
+	if (!hl_dmabuf)
+		return -ENOMEM;
+
+	hl_dmabuf->device_address = device_addr;
+
+	rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd);
+	if (rc)
+		goto err_free_dmabuf_wrapper;
+
+	return 0;
+
+err_free_dmabuf_wrapper:
+	kfree(hl_dmabuf);
+	return rc;
+}
+
+/**
+ * export_dmabuf_from_handle() - export a dma-buf object for the given memory
+ *                               handle.
+ * @ctx: pointer to the context structure.
+ * @handle: device memory allocation handle; only the lower 32 bits are used.
+ * @flags: DMA-BUF file/FD flags.
+ * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
+ *
+ * Bump the allocation's exporting_cnt, check that it is DRAM memory whose
+ * pages all fit inside the DRAM PCI BAR, and export it as a dma-buf. The
+ * exporting_cnt is dropped again if any later step fails.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
+					int *dmabuf_fd)
+{
+	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	struct hl_dmabuf_wrapper *hl_dmabuf;
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop;
+	struct hl_vm *vm = &hdev->vm;
+	u64 bar_address;
+	u32 idr_handle;
+	int rc, i;
+
+	prop = &hdev->asic_prop;
+
+	idr_handle = lower_32_bits(handle);
+
+	spin_lock(&vm->idr_lock);
+
+	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, idr_handle);
+	if (!phys_pg_pack) {
+		spin_unlock(&vm->idr_lock);
+		dev_err_ratelimited(hdev->dev, "no match for handle 0x%x\n",
+				idr_handle);
+		return -EINVAL;
+	}
+
+	/* increment now to avoid freeing device memory while exporting */
+	phys_pg_pack->exporting_cnt++;
+
+	spin_unlock(&vm->idr_lock);
+
+	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
+		dev_err_ratelimited(hdev->dev,
+				"handle 0x%llx is not for DRAM memory\n",
+				handle);
+		rc = -EINVAL;
+		goto err_dec_exporting_cnt;
+	}
+
+	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
+
+		bar_address = hdev->dram_pci_bar_start +
+						(phys_pg_pack->pages[i] -
+						prop->dram_base_address);
+
+		if (bar_address + phys_pg_pack->page_size >
+			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
+			bar_address + phys_pg_pack->page_size < bar_address) {
+
+			dev_err_ratelimited(hdev->dev,
+				"DRAM memory range is outside of PCI BAR boundaries, address 0x%llx, size 0x%x\n",
+				phys_pg_pack->pages[i],
+				phys_pg_pack->page_size);
+
+			rc = -EINVAL;
+			goto err_dec_exporting_cnt;
+		}
+	}
+
+	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
+	if (!hl_dmabuf) {
+		rc = -ENOMEM;
+		goto err_dec_exporting_cnt;
+	}
+
+	hl_dmabuf->phys_pg_pack = phys_pg_pack;
+
+	rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size,
+				flags, dmabuf_fd);
+	if (rc)
+		goto err_free_dmabuf_wrapper;
+
+	return 0;
+
+err_free_dmabuf_wrapper:
+	kfree(hl_dmabuf);
+
+err_dec_exporting_cnt:
+	spin_lock(&vm->idr_lock);
+	phys_pg_pack->exporting_cnt--;
+	spin_unlock(&vm->idr_lock);
+
+	return rc;
+}
+
 static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
 	u32 handle = 0, block_size;
-	int rc;
+	int rc, dmabuf_fd = -EBADF;
 
 	switch (args->in.op) {
 	case HL_MEM_OP_ALLOC:
@@ -1559,6 +2047,16 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 		args->out.block_size = block_size;
 		break;
 
+	case HL_MEM_OP_EXPORT_DMABUF_FD:
+		rc = export_dmabuf_from_addr(ctx,
+				args->in.export_dmabuf_fd.handle,
+				args->in.export_dmabuf_fd.mem_size,
+				args->in.flags,
+				&dmabuf_fd);
+		memset(args, 0, sizeof(*args));
+		args->out.fd = dmabuf_fd;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
@@ -1577,7 +2075,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
 	u32 handle = 0, block_size;
-	int rc;
+	int rc, dmabuf_fd = -EBADF;
 
 	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
@@ -1668,6 +2166,22 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		args->out.block_size = block_size;
 		break;
 
+	case HL_MEM_OP_EXPORT_DMABUF_FD:
+		if (hdev->asic_prop.dram_supports_virtual_memory)
+			rc = export_dmabuf_from_handle(ctx,
+					args->in.export_dmabuf_fd.handle,
+					args->in.flags,
+					&dmabuf_fd);
+		else
+			rc = export_dmabuf_from_addr(ctx,
+					args->in.export_dmabuf_fd.handle,
+					args->in.export_dmabuf_fd.mem_size,
+					args->in.flags,
+					&dmabuf_fd);
+		memset(args, 0, sizeof(*args));
+		args->out.fd = dmabuf_fd;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index d3bd58404c38e5..d9b7df9c18c8c4 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -778,6 +778,7 @@ static int gaudi_early_init(struct hl_device *hdev)
 	}
 
 	prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
+	hdev->dram_pci_bar_start = pci_resource_start(pdev, HBM_BAR_ID);
 
 	/* If FW security is enabled at this point it means no access to ELBI */
 	if (hdev->asic_prop.fw_security_enabled) {
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 8d890c7cce081e..fad130f6a1186a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -622,6 +622,7 @@ static int goya_early_init(struct hl_device *hdev)
 	}
 
 	prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
+	hdev->dram_pci_bar_start = pci_resource_start(pdev, DDR_BAR_ID);
 
 	/* If FW security is enabled at this point it means no access to ELBI */
 	if (hdev->asic_prop.fw_security_enabled) {

From fb83e2fe484e860e2566a4c3c7b479cb672571b1 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Thu, 15 Jul 2021 15:20:44 +0300
Subject: [PATCH 37/55] habanalabs: make set_pci_regions asic function

In order to better support variants of the same ASIC
the set_pci_regions function is now an ASIC function which
allows each ASIC to implement it internally, thus keeping
all definitions static to the file.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h | 2 ++
 drivers/misc/habanalabs/gaudi/gaudi.c       | 5 +++--
 drivers/misc/habanalabs/goya/goya.c         | 5 +++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 3daccd775caf21..d40778eccb144b 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1160,6 +1160,7 @@ struct fw_load_mgr {
  * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
  * @state_dump_init: initialize constants required for state dump
  * @get_sob_addr: get SOB base address offset.
+ * @set_pci_memory_regions: setting properties of PCI memory regions
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1288,6 +1289,7 @@ struct hl_asic_funcs {
 	void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
 	void (*state_dump_init)(struct hl_device *hdev);
 	u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
+	void (*set_pci_memory_regions)(struct hl_device *hdev);
 };
 
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index d9b7df9c18c8c4..5b84d663dced2b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1858,7 +1858,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
 	hdev->supports_staged_submission = true;
 	hdev->supports_wait_for_multi_cs = true;
 
-	gaudi_set_pci_memory_regions(hdev);
+	hdev->asic_funcs->set_pci_memory_regions(hdev);
 
 	return 0;
 
@@ -9378,7 +9378,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.init_firmware_loader = gaudi_init_firmware_loader,
 	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
 	.state_dump_init = gaudi_state_dump_init,
-	.get_sob_addr = gaudi_get_sob_addr
+	.get_sob_addr = gaudi_get_sob_addr,
+	.set_pci_memory_regions = gaudi_set_pci_memory_regions
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index fad130f6a1186a..9d66bdda8789a2 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -963,7 +963,7 @@ static int goya_sw_init(struct hl_device *hdev)
 	hdev->allow_external_soft_reset = true;
 	hdev->supports_wait_for_multi_cs = false;
 
-	goya_set_pci_memory_regions(hdev);
+	hdev->asic_funcs->set_pci_memory_regions(hdev);
 
 	return 0;
 
@@ -5670,7 +5670,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.init_firmware_loader = goya_init_firmware_loader,
 	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram,
 	.state_dump_init = goya_state_dump_init,
-	.get_sob_addr = &goya_get_sob_addr
+	.get_sob_addr = &goya_get_sob_addr,
+	.set_pci_memory_regions = goya_set_pci_memory_regions,
 };
 
 /*

From fd8d55cd9a1a5df4e43ec075a523555e5a04328e Mon Sep 17 00:00:00 2001
From: Koby Elbaz <kelbaz@habana.ai>
Date: Mon, 19 Jul 2021 08:28:34 +0300
Subject: [PATCH 38/55] habanalabs: clear msg_to_cpu_reg to avoid misread after
 reset

For some ASICs, the f/w reads the msg_to_cpu_reg value after
reset, and for some it doesn't.
Therefore, to be sure f/w doesn't read a wrong value after reset, we
need to clear this register before the reset occurs.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 28 +++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 0be3f5414f0bad..c232d197b57ab5 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -667,17 +667,15 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
 	hdev->event_queue.check_eqe_index = false;
 
 	/* Read FW application security bits again */
-	if (hdev->asic_prop.fw_cpu_boot_dev_sts0_valid) {
-		hdev->asic_prop.fw_app_cpu_boot_dev_sts0 =
-						RREG32(sts_boot_dev_sts0_reg);
-		if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+	if (prop->fw_cpu_boot_dev_sts0_valid) {
+		prop->fw_app_cpu_boot_dev_sts0 = RREG32(sts_boot_dev_sts0_reg);
+		if (prop->fw_app_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_EQ_INDEX_EN)
 			hdev->event_queue.check_eqe_index = true;
 	}
 
-	if (hdev->asic_prop.fw_cpu_boot_dev_sts1_valid)
-		hdev->asic_prop.fw_app_cpu_boot_dev_sts1 =
-						RREG32(sts_boot_dev_sts1_reg);
+	if (prop->fw_cpu_boot_dev_sts1_valid)
+		prop->fw_app_cpu_boot_dev_sts1 = RREG32(sts_boot_dev_sts1_reg);
 
 out:
 	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
@@ -1012,6 +1010,11 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
 	} else {
 		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
 		msleep(static_loader->cpu_reset_wait_msec);
+
+		/* Must clear this register in order to prevent preboot
+		 * from reading WFE after reboot
+		 */
+		WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_NA);
 	}
 
 	hdev->device_cpu_is_halted = true;
@@ -1242,11 +1245,6 @@ static void hl_fw_preboot_update_state(struct hl_device *hdev)
 	 *               b. Check whether hard reset is done by boot cpu
 	 * 3. FW application - a. Fetch fw application security status
 	 *                     b. Check whether hard reset is done by fw app
-	 *
-	 * Preboot:
-	 * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED). If set, then-
-	 * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN)
-	 * If set, then mark GIC controller to be disabled.
 	 */
 	prop->hard_reset_done_by_fw =
 		!!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
@@ -2126,8 +2124,7 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
 
 	/* Read FW application security bits */
 	if (prop->fw_cpu_boot_dev_sts0_valid) {
-		prop->fw_app_cpu_boot_dev_sts0 =
-				RREG32(cpu_boot_dev_sts0_reg);
+		prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
 
 		if (prop->fw_app_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
@@ -2147,8 +2144,7 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
 	}
 
 	if (prop->fw_cpu_boot_dev_sts1_valid) {
-		prop->fw_app_cpu_boot_dev_sts1 =
-				RREG32(cpu_boot_dev_sts1_reg);
+		prop->fw_app_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg);
 
 		dev_dbg(hdev->dev,
 			"Firmware application CPU status1 %#x\n",

From 5fc7062bcc61c33e6278ff896e60b32146574991 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 20 Jul 2021 09:16:05 +0300
Subject: [PATCH 39/55] habanalabs: add validity check for event ID received
 from F/W

Currently there is no validity check for the event ID received from F/W,
thus exposing the driver to memory overrun.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 6 ++++++
 drivers/misc/habanalabs/goya/goya.c   | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 5b84d663dced2b..4ea45bdb9a71d0 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7881,6 +7881,12 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	u8 cause;
 	int rc;
 
+	if (event_type >= GAUDI_EVENT_SIZE) {
+		dev_err(hdev->dev, "Event type %u exceeds maximum of %u\n",
+				event_type, GAUDI_EVENT_SIZE - 1);
+		return;
+	}
+
 	gaudi->events_stat[event_type]++;
 	gaudi->events_stat_aggregate[event_type]++;
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 9d66bdda8789a2..d54c700c31cd58 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4805,6 +4805,12 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 				>> EQ_CTL_EVENT_TYPE_SHIFT);
 	struct goya_device *goya = hdev->asic_specific;
 
+	if (event_type >= GOYA_ASYNC_EVENT_ID_SIZE) {
+		dev_err(hdev->dev, "Event type %u exceeds maximum of %u\n",
+				event_type, GOYA_ASYNC_EVENT_ID_SIZE - 1);
+		return;
+	}
+
 	goya->events_stat[event_type]++;
 	goya->events_stat_aggregate[event_type]++;
 

From d7be7d52e513bbc232669b8f1772cfee6add0dd0 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 27 Jul 2021 10:54:22 +0300
Subject: [PATCH 40/55] habanalabs/gaudi: scrub HBM to a specific value

In order to enhance debuggability, we will scrub the whole HBM to
a specific value, in case HBM scrubbing is enabled. Scrubbing will be
performed after reset and after user closes the FD.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4ea45bdb9a71d0..6852bb8e55db29 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -4194,6 +4194,11 @@ static int gaudi_hw_init(struct hl_device *hdev)
 		goto disable_msi;
 	}
 
+	/* Scrub both SRAM and DRAM */
+	rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
+	if (rc)
+		return rc;
+
 	/* Perform read from the device to flush all configuration */
 	RREG32(mmHW_STATE);
 
@@ -4758,8 +4763,8 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
 				"Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
 				cur_addr, cur_addr + chunk_size);
 
-			WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0);
-			WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0);
+			WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf);
+			WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf);
 			WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
 						lower_32_bits(cur_addr));
 			WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,

From 887b8d6f3b1a2f7d8104056d84ea71a52cdd314c Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Wed, 28 Jul 2021 00:16:31 +0300
Subject: [PATCH 41/55] habanalabs/gaudi: move scrubbing to late init

HW init is mostly about configuring registers. Therefore, it is better
to activate DMAs only in late init and afterwards.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 6852bb8e55db29..d8d296b66f9df1 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1567,6 +1567,11 @@ static int gaudi_late_init(struct hl_device *hdev)
 		return rc;
 	}
 
+	/* Scrub both SRAM and DRAM */
+	rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
+	if (rc)
+		goto disable_pci_access;
+
 	rc = gaudi_fetch_psoc_frequency(hdev);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to fetch psoc frequency\n");
@@ -4194,11 +4199,6 @@ static int gaudi_hw_init(struct hl_device *hdev)
 		goto disable_msi;
 	}
 
-	/* Scrub both SRAM and DRAM */
-	rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
-	if (rc)
-		return rc;
-
 	/* Perform read from the device to flush all configuration */
 	RREG32(mmHW_STATE);
 

From 5e96d3a112da4e090898081f3da4fa1a8e2bec37 Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Tue, 27 Jul 2021 17:39:42 +0300
Subject: [PATCH 42/55] habanalabs: save pid per userptr

Currently userptr endpoint in debugfs prints out virtual addresses
in the user process memory space, without specifying their owner process
ID. User space virtual address is meaningless without knowing the owner
process.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c    | 8 ++++----
 drivers/misc/habanalabs/common/habanalabs.h | 2 ++
 drivers/misc/habanalabs/common/memory.c     | 1 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 2c587af28f9bba..264424c96959d3 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -209,12 +209,12 @@ static int userptr_show(struct seq_file *s, void *data)
 		if (first) {
 			first = false;
 			seq_puts(s, "\n");
-			seq_puts(s, " user virtual address     size             dma dir\n");
+			seq_puts(s, " pid      user virtual address     size             dma dir\n");
 			seq_puts(s, "----------------------------------------------------------\n");
 		}
-		seq_printf(s,
-			"    0x%-14llx      %-10llu    %-30s\n",
-			userptr->addr, userptr->size, dma_dir[userptr->dir]);
+		seq_printf(s, " %-7d  0x%-14llx      %-10llu    %-30s\n",
+				userptr->pid, userptr->addr, userptr->size,
+				dma_dir[userptr->dir]);
 	}
 
 	spin_unlock(&dev_entry->userptr_spinlock);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index d40778eccb144b..b950b514061007 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1452,6 +1452,7 @@ struct hl_ctx_mgr {
  * @sgt: pointer to the scatter-gather table that holds the pages.
  * @dir: for DMA unmapping, the direction must be supplied, so save it.
  * @debugfs_list: node in debugfs list of command submissions.
+ * @pid: the pid of the user process owning the memory
  * @addr: user-space virtual address of the start of the memory area.
  * @size: size of the memory area to pin & map.
  * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
@@ -1464,6 +1465,7 @@ struct hl_userptr {
 	struct sg_table		*sgt;
 	enum dma_data_direction dir;
 	struct list_head	debugfs_list;
+	pid_t			pid;
 	u64			addr;
 	u64			size;
 	u8			dma_mapped;
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index a75207611b9bde..2f990d185cfd92 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -2276,6 +2276,7 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 		return -EINVAL;
 	}
 
+	userptr->pid = current->pid;
 	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
 	if (!userptr->sgt)
 		return -ENOMEM;

From 4f44a8a152023e034a6352f372c0c82875ff23aa Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Thu, 29 Jul 2021 11:54:50 +0300
Subject: [PATCH 43/55] habanalabs: fix mmu node address resolution in debugfs

The address resolution via debugfs was not taking into consideration the
page offset, resulting in a wrong address.

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 264424c96959d3..6a7df57883d955 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -349,7 +349,7 @@ static int mmu_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
-	phys_addr = hops_info.hop_info[hops_info.used_hops - 1].hop_pte_val;
+	hl_mmu_va_to_pa(ctx, virt_addr, &phys_addr);
 
 	if (hops_info.scrambled_vaddr &&
 		(dev_entry->mmu_addr != hops_info.scrambled_vaddr))

From 0760cbcc62b6ede403744829e7b0ad4860a4d352 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Sun, 1 Aug 2021 22:24:18 +0300
Subject: [PATCH 44/55] habanalabs/gaudi: minimize number of register reads

Because the register reads might be trapped by the hypervisor in
certain deployments, minimize the number of reads during runtime by
moving static initializations to functions that occur during device
initialization instead of context open.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c           | 15 +++++++++++----
 drivers/misc/habanalabs/gaudi/gaudi_coresight.c |  5 -----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index d8d296b66f9df1..1b98233ffc4b8f 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1596,6 +1596,11 @@ static int gaudi_late_init(struct hl_device *hdev)
 		goto disable_pci_access;
 	}
 
+	/* We only support a single ASID for the user, so for the sake of optimization, just
+	 * initialize the ASID one time during device initialization with the fixed value of 1
+	 */
+	gaudi_mmu_prepare(hdev, 1);
+
 	return 0;
 
 disable_pci_access:
@@ -6793,6 +6798,9 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
 				asid);
 	}
 
+	gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid);
+	gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid);
+
 	hdev->asic_funcs->set_clock_gating(hdev);
 
 	mutex_unlock(&gaudi->clk_gate_mutex);
@@ -6842,7 +6850,8 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
 
 	dma_offset = gaudi_dma_assignment[GAUDI_PCI_DMA_1] * DMA_CORE_OFFSET;
 
-	WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT));
+	WREG32(mmDMA0_CORE_PROT + dma_offset,
+			BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT) | BIT(DMA0_CORE_PROT_VAL_SHIFT));
 
 	rc = hl_hw_queue_send_cb_no_cmpl(hdev, GAUDI_QUEUE_ID_DMA_0_0,
 					job->job_cb_size, cb->bus_address);
@@ -6863,8 +6872,7 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
 	}
 
 free_fence_ptr:
-	WREG32_AND(mmDMA0_CORE_PROT + dma_offset,
-			~BIT(DMA0_CORE_PROT_VAL_SHIFT));
+	WREG32(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT));
 
 	hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr,
 					fence_dma_addr);
@@ -8653,7 +8661,6 @@ static int gaudi_ctx_init(struct hl_ctx *ctx)
 	if (ctx->asid == HL_KERNEL_ASID_ID)
 		return 0;
 
-	gaudi_mmu_prepare(ctx->hdev, ctx->asid);
 	return gaudi_internal_cb_pool_init(ctx->hdev, ctx);
 }
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
index c2a27ed1c4d11e..5349c1be13f988 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
@@ -622,11 +622,6 @@ static int gaudi_config_etr(struct hl_device *hdev,
 			return -EINVAL;
 		}
 
-		gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER,
-						hdev->compute_ctx->asid);
-		gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER,
-						hdev->compute_ctx->asid);
-
 		msb = upper_32_bits(input->buffer_address) >> 8;
 		msb &= PSOC_GLOBAL_CONF_TRACE_ADDR_MSB_MASK;
 		WREG32(mmPSOC_GLOBAL_CONF_TRACE_ADDR, msb);

From b95251e1ae7760db049704139877d402c941179c Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 3 Aug 2021 15:53:46 +0300
Subject: [PATCH 45/55] habanalabs: update to latest firmware headers

Add several new packets between driver and firmware.
Add matching compatibility bits for backward compatibility.
Add support for 4K event types.
Add information about pcie errors.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/include/common/cpucp_if.h | 92 +++++++++++++++++--
 .../habanalabs/include/common/hl_boot_if.h    | 23 +++++
 2 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 2d6f8ea35375e0..7afee76ae0b75b 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -98,6 +98,18 @@ struct hl_eq_fw_alive {
 	__u8 pad[7];
 };
 
+enum hl_pcie_addr_dec_cause {
+	PCIE_ADDR_DEC_HBW_ERR_RESP,
+	PCIE_ADDR_DEC_LBW_ERR_RESP,
+	PCIE_ADDR_DEC_TLP_BLOCKED_BY_RR
+};
+
+struct hl_eq_pcie_addr_dec_data {
+	/* enum hl_pcie_addr_dec_cause */
+	__u8 addr_dec_cause;
+	__u8 pad[7];
+};
+
 struct hl_eq_entry {
 	struct hl_eq_header hdr;
 	union {
@@ -106,6 +118,7 @@ struct hl_eq_entry {
 		struct hl_eq_sm_sei_data sm_sei_data;
 		struct cpucp_pkt_sync_err pkt_sync_err;
 		struct hl_eq_fw_alive fw_alive;
+		struct hl_eq_pcie_addr_dec_data pcie_addr_dec_data;
 		__le64 data[7];
 	};
 };
@@ -116,7 +129,7 @@ struct hl_eq_entry {
 #define EQ_CTL_READY_MASK		0x80000000
 
 #define EQ_CTL_EVENT_TYPE_SHIFT		16
-#define EQ_CTL_EVENT_TYPE_MASK		0x03FF0000
+#define EQ_CTL_EVENT_TYPE_MASK		0x0FFF0000
 
 #define EQ_CTL_INDEX_SHIFT		0
 #define EQ_CTL_INDEX_MASK		0x0000FFFF
@@ -300,7 +313,7 @@ enum pq_init_status {
  *       The packet's arguments specify the desired sensor and the field to
  *       set.
  *
- * CPUCP_PACKET_PCIE_THROUGHPUT_GET
+ * CPUCP_PACKET_PCIE_THROUGHPUT_GET -
  *       Get throughput of PCIe.
  *       The packet's arguments specify the transaction direction (TX/RX).
  *       The window measurement is 10[msec], and the return value is in KB/sec.
@@ -309,19 +322,19 @@ enum pq_init_status {
  *       Replay count measures number of "replay" events, which is basicly
  *       number of retries done by PCIe.
  *
- * CPUCP_PACKET_TOTAL_ENERGY_GET
+ * CPUCP_PACKET_TOTAL_ENERGY_GET -
  *       Total Energy is measurement of energy from the time FW Linux
  *       is loaded. It is calculated by multiplying the average power
  *       by time (passed from armcp start). The units are in MilliJouls.
  *
- * CPUCP_PACKET_PLL_INFO_GET
+ * CPUCP_PACKET_PLL_INFO_GET -
  *       Fetch frequencies of PLL from the required PLL IP.
  *       The packet's arguments specify the device PLL type
  *       Pll type is the PLL from device pll_index enum.
  *       The result is composed of 4 outputs, each is 16-bit
  *       frequency in MHz.
  *
- * CPUCP_PACKET_POWER_GET
+ * CPUCP_PACKET_POWER_GET -
  *       Fetch the present power consumption of the device (Current * Voltage).
  *
  * CPUCP_PACKET_NIC_PFC_SET -
@@ -345,6 +358,24 @@ enum pq_init_status {
  * CPUCP_PACKET_MSI_INFO_SET -
  *       set the index number for each supported msi type going from
  *       host to device
+ *
+ * CPUCP_PACKET_NIC_XPCS91_REGS_GET -
+ *       Fetch the un/correctable counters values from the NIC MAC.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_GET -
+ *       Fetch various NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_CLR -
+ *       Clear the various NIC MAC counters in the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET -
+ *       Fetch all NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_IS_IDLE_CHECK -
+ *       Check if the device is IDLE in regard to the DMA/compute engines
+ *       and QMANs. The f/w will return a bitmask where each bit represents
+ *       a different engine or QMAN according to enum cpucp_idle_mask.
+ *       The bit will be 1 if the engine is NOT idle.
  */
 
 enum cpucp_packet_id {
@@ -385,6 +416,11 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_NIC_LPBK_SET,		/* internal */
 	CPUCP_PACKET_NIC_MAC_CFG,		/* internal */
 	CPUCP_PACKET_MSI_INFO_SET,		/* internal */
+	CPUCP_PACKET_NIC_XPCS91_REGS_GET,	/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_GET,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
+	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
 };
 
 #define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
@@ -414,6 +450,11 @@ enum cpucp_packet_id {
 #define CPUCP_PKT_VAL_LPBK_IN2_SHIFT	1
 #define CPUCP_PKT_VAL_LPBK_IN2_MASK	0x000000000000001Eull
 
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK	0x00000000FFFFFFFEull
+
 /* heartbeat status bits */
 #define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT		0
 #define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK		0x00000001
@@ -467,7 +508,8 @@ struct cpucp_packet {
 		__le32 status_mask;
 	};
 
-	__le32 reserved;
+	/* For NIC requests */
+	__le32 port_index;
 };
 
 struct cpucp_unmask_irq_arr_packet {
@@ -476,6 +518,12 @@ struct cpucp_unmask_irq_arr_packet {
 	__le32 irqs[0];
 };
 
+struct cpucp_nic_status_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 data[0];
+};
+
 struct cpucp_array_data_packet {
 	struct cpucp_packet cpucp_pkt;
 	__le32 length;
@@ -721,4 +769,36 @@ struct cpucp_nic_info {
 	__u8 reserved[6];
 };
 
+/*
+ * struct cpucp_nic_status - describes the status of a NIC port.
+ * @port: NIC port index.
+ * @bad_format_cnt: e.g. CRC.
+ * @responder_out_of_sequence_psn_cnt: e.g. NAK.
+ * @high_ber_reinit: link reinit due to high BER.
+ * @correctable_err_cnt: e.g. bit-flip.
+ * @uncorrectable_err_cnt: e.g. MAC errors.
+ * @retraining_cnt: re-training counter.
+ * @up: is port up.
+ * @pcs_link: has PCS link.
+ * @phy_ready: is PHY ready.
+ * @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events.
+ * @high_ber_cnt: high BER events.
+ */
+struct cpucp_nic_status {
+	__le32 port;
+	__le32 bad_format_cnt;
+	__le32 responder_out_of_sequence_psn_cnt;
+	__le32 high_ber_reinit;
+	__le32 correctable_err_cnt;
+	__le32 uncorrectable_err_cnt;
+	__le32 retraining_cnt;
+	__u8 up;
+	__u8 pcs_link;
+	__u8 phy_ready;
+	__u8 auto_neg;
+	__le32 timeout_retransmission_cnt;
+	__le32 high_ber_cnt;
+};
+
 #endif /* CPUCP_IF_H */
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 1f296784fa2bcb..3099653234e4a3 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -210,6 +210,10 @@
  *					configured and is ready for use.
  *					Initialized in: ppboot
  *
+ * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN	NIC MAC channels init is done by FW and
+ *					any access to them is done via the FW.
+ *					Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_DYN_PLL_EN		Dynamic PLL configuration is enabled.
  *					FW sends to host a bitmap of supported
  *					PLLs.
@@ -233,6 +237,21 @@
  *					prevent IRQs overriding each other.
  *					Initialized in: linux
  *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN
+ *					NIC STAT and XPCS91 access is restricted
+ *					and is done via FW only.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN
+ *					NIC STAT get all is supported.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN
+ *					F/W checks if the device is idle by reading defined set
+ *					of registers. It returns a bitmask of all the engines,
+ *					where a bit is set if the engine is not idle.
+ *					Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
  *					This is a main indication that the
  *					running FW populates the device status
@@ -260,10 +279,14 @@
 #define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN			(1 << 15)
 #define CPU_BOOT_DEV_STS0_FW_LD_COM_EN			(1 << 16)
 #define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN		(1 << 17)
+#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN			(1 << 18)
 #define CPU_BOOT_DEV_STS0_DYN_PLL_EN			(1 << 19)
 #define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN		(1 << 20)
 #define CPU_BOOT_DEV_STS0_EQ_INDEX_EN			(1 << 21)
 #define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN		(1 << 22)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN		(1 << 23)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN		(1 << 24)
+#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN		(1 << 25)
 #define CPU_BOOT_DEV_STS0_ENABLED			(1 << 31)
 #define CPU_BOOT_DEV_STS1_ENABLED			(1 << 31)
 

From df205b38529a463c445fa2934b70b4232b98bff8 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 3 Aug 2021 13:22:06 +0300
Subject: [PATCH 46/55] habanalabs/gaudi: increase boot fit timeout

Various f/w versions have different timeouts, so increase the default
timeout to accommodate all the options.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 4 ++++
 drivers/misc/habanalabs/gaudi/gaudi.c        | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index c232d197b57ab5..8d2568c63f19e8 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -1062,6 +1062,10 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 		dev_err(hdev->dev,
 			"Device boot progress - Thermal Sensor initialization failed\n");
 		break;
+	case CPU_BOOT_STATUS_SECURITY_READY:
+		dev_err(hdev->dev,
+			"Device boot progress - Stuck in preboot after security initialization\n");
+		break;
 	default:
 		dev_err(hdev->dev,
 			"Device boot progress - Invalid status code %d\n",
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1b98233ffc4b8f..3fbcdb013a7e8b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -76,7 +76,7 @@
 #define GAUDI_PLDM_MMU_TIMEOUT_USEC	(MMU_CONFIG_TIMEOUT_USEC * 100)
 #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC	(HL_DEVICE_TIMEOUT_USEC * 30)
 #define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC	(HL_DEVICE_TIMEOUT_USEC * 30)
-#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC	1000000		/* 1s */
+#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC	4000000		/* 4s */
 #define GAUDI_MSG_TO_CPU_TIMEOUT_USEC	4000000		/* 4s */
 #define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC	15000000	/* 15s */
 

From a5dd6618bf2c41b212faf3146411b0ddc580ee60 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 3 Aug 2021 08:13:39 +0300
Subject: [PATCH 47/55] habanalabs/gaudi: restore user registers when context
 opens

Because we don't have multiple contexts in GAUDI, and to minimize
calls to is_idle function (which uses many register reads), move
the call to clear the user registers to the opening of the single
user context.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3fbcdb013a7e8b..e9a8ed96fe65a4 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6088,7 +6088,7 @@ static int gaudi_restore_user_registers(struct hl_device *hdev)
 
 static int gaudi_context_switch(struct hl_device *hdev, u32 asid)
 {
-	return gaudi_restore_user_registers(hdev);
+	return 0;
 }
 
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev)
@@ -8658,10 +8658,20 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
 
 static int gaudi_ctx_init(struct hl_ctx *ctx)
 {
+	int rc;
+
 	if (ctx->asid == HL_KERNEL_ASID_ID)
 		return 0;
 
-	return gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+	rc = gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+	if (rc)
+		return rc;
+
+	rc = gaudi_restore_user_registers(ctx->hdev);
+	if (rc)
+		gaudi_internal_cb_pool_fini(ctx->hdev, ctx);
+
+	return rc;
 }
 
 static void gaudi_ctx_fini(struct hl_ctx *ctx)

From e4273083be7f4e613bd8ecea7748c8a3b3b739ab Mon Sep 17 00:00:00 2001
From: Alon Mizrahi <amizrahi@habana.ai>
Date: Thu, 22 Jul 2021 18:12:18 +0300
Subject: [PATCH 48/55] habanalabs/gaudi: add monitored SOBs to state dump

The current "state dump" lacks the monitored SOB IDs. Add them for
convenience.

Signed-off-by: Alon Mizrahi <amizrahi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c  | 38 ++++++++++++++++++++++++--
 drivers/misc/habanalabs/gaudi/gaudiP.h |  1 +
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index e9a8ed96fe65a4..d18a1ab428978d 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -108,6 +108,8 @@
 
 #define BIN_REG_STRING_SIZE	sizeof("0b10101010101010101010101010101010")
 
+#define MONITOR_SOB_STRING_SIZE		256
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
 		"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
 		"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -9185,6 +9187,34 @@ static int gaudi_monitor_valid(struct hl_mon_state_dump *mon)
 		mon->status);
 }
 
+static void gaudi_fill_sobs_from_mon(char *sobs, struct hl_mon_state_dump *mon)
+{
+	const size_t max_write = 10;
+	u32 gid, mask, sob;
+	int i, offset;
+
+	/* Sync object ID is calculated as follows:
+	 * (8 * group_id + cleared bits in mask)
+	 */
+	gid = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+			mon->arm_data);
+	mask = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+			mon->arm_data);
+
+	for (i = 0, offset = 0; mask && offset < MONITOR_SOB_STRING_SIZE -
+		max_write; mask >>= 1, i++) {
+		if (!(mask & 1)) {
+			sob = gid * MONITOR_MAX_SOBS + i;
+
+			if (offset > 0)
+				offset += snprintf(sobs + offset, max_write,
+							", ");
+
+			offset += snprintf(sobs + offset, max_write, "%u", sob);
+		}
+	}
+}
+
 static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
 				struct hl_device *hdev,
 				struct hl_mon_state_dump *mon)
@@ -9192,14 +9222,17 @@ static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
 	const char *name;
 	char scratch_buf1[BIN_REG_STRING_SIZE],
 		scratch_buf2[BIN_REG_STRING_SIZE];
+	char monitored_sobs[MONITOR_SOB_STRING_SIZE] = {0};
 
 	name = hl_state_dump_get_monitor_name(hdev, mon);
 	if (!name)
 		name = "";
 
+	gaudi_fill_sobs_from_mon(monitored_sobs, mon);
+
 	return hl_snprintf_resize(
 		buf, size, offset,
-		"Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s",
+		"Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s. Means sync objects [%s] are being monitored.",
 		mon->id, name,
 		FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
 				mon->arm_data),
@@ -9216,7 +9249,8 @@ static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
 			scratch_buf2, sizeof(scratch_buf2),
 			FIELD_GET(
 				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK,
-				mon->status)));
+				mon->status)),
+		monitored_sobs);
 }
 
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 838e98b0d43d01..eacc5eadda97b3 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -117,6 +117,7 @@
 	(((mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511 - \
 	mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0) + 4) >> 2)
 
+#define MONITOR_MAX_SOBS	8
 
 /* DRAM Memory Map */
 

From a59aceb2d6e42cbc391691c2d7f6d07fd800f8b0 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Sun, 1 Aug 2021 23:02:07 +0300
Subject: [PATCH 49/55] habanalabs: modify multi-CS to wait on stream masters

During the integration, the multi-CS requirements were refined:
- The multi CS call shall wait on "per-ASIC" predefined stream masters
  instead of set of streams.
- Stream masters are set of QIDs used by the upper SW layers (synapse)
  for completion (must be an external/HW queue).

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../habanalabs/common/command_submission.c    | 50 ++++++++++++-------
 drivers/misc/habanalabs/common/habanalabs.h   | 22 ++++----
 drivers/misc/habanalabs/common/hw_queue.c     |  3 +-
 drivers/misc/habanalabs/gaudi/gaudi.c         | 22 +++++++-
 drivers/misc/habanalabs/gaudi/gaudiP.h        |  2 +
 drivers/misc/habanalabs/goya/goya.c           |  6 +++
 6 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index d71bd48cbc4448..3a67265312eeda 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -487,14 +487,15 @@ static void force_complete_multi_cs(struct hl_device *hdev)
  *
  * @hdev: pointer to habanalabs device structure
  * @cs: CS structure
- *
- * The function signals waiting entity that its waiting stream has common
- * stream with the completed CS.
+ * The function signals a waiting entity that has overlapping stream masters
+ * with the completed CS.
  * For example:
- * - a completed CS worked on streams 0 and 1, multi CS completion
- *   is actively waiting on stream 3. don't send signal as no common stream
- * - a completed CS worked on streams 0 and 1, multi CS completion
- *   is actively waiting on streams 1 and 3. send signal as stream 1 is common
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
+ *   common stream master QID
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 4. send signal as stream
+ *   master QID 4 is common
  */
 static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 {
@@ -518,10 +519,11 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 		 * complete if:
 		 * 1. still waiting for completion
 		 * 2. the completed CS has at least one overlapping stream
-		 *    with the streams in the completion
+		 *    master with the stream masters in the completion
 		 */
 		if (mcs_compl->used &&
-				(fence->stream_map & mcs_compl->stream_map)) {
+				(fence->stream_master_qid_map &
+					mcs_compl->stream_master_qid_map)) {
 			/* extract the timestamp only of first completed CS */
 			if (!mcs_compl->timestamp)
 				mcs_compl->timestamp =
@@ -1228,6 +1230,17 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
 	return 0;
 }
 
+static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
+{
+	int i;
+
+	for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
+		if (qid == hdev->stream_master_qid_arr[i])
+			return BIT(i);
+
+	return 0;
+}
+
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 				u32 num_chunks, u64 *cs_seq, u32 flags,
 				u32 encaps_signals_handle, u32 timeout)
@@ -1241,7 +1254,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs *cs;
 	struct hl_cb *cb;
 	u64 user_sequence;
-	u8 stream_map = 0;
+	u8 stream_master_qid_map = 0;
 	int rc, i;
 
 	cntr = &hdev->aggregated_cs_counters;
@@ -1310,7 +1323,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 			 * queues of this CS
 			 */
 			if (hdev->supports_wait_for_multi_cs)
-				stream_map |= BIT((chunk->queue_index % 4));
+				stream_master_qid_map |=
+					get_stream_master_qid_mask(hdev,
+							chunk->queue_index);
 		}
 
 		job = hl_cs_allocate_job(hdev, queue_type,
@@ -1378,7 +1393,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	 * fence object for multi-CS completion
 	 */
 	if (hdev->supports_wait_for_multi_cs)
-		cs->fence->stream_map = stream_map;
+		cs->fence->stream_master_qid_map = stream_master_qid_map;
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
@@ -2332,7 +2347,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 			break;
 		}
 
-		mcs_data->stream_map |= fence->stream_map;
+		mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
 
 		if (status == CS_WAIT_STATUS_BUSY)
 			continue;
@@ -2394,7 +2409,8 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  * hl_wait_multi_cs_completion_init - init completion structure
  *
  * @hdev: pointer to habanalabs device structure
- * @stream_map: stream map, set bit indicates stream to wait on
+ * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
+ *                        master QID to wait on
  *
  * @return valid completion struct pointer on success, otherwise error pointer
  *
@@ -2404,7 +2420,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  */
 static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
 							struct hl_device *hdev,
-							u8 stream_map)
+							u8 stream_master_bitmap)
 {
 	struct multi_cs_completion *mcs_compl;
 	int i;
@@ -2416,7 +2432,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
 		if (!mcs_compl->used) {
 			mcs_compl->used = 1;
 			mcs_compl->timestamp = 0;
-			mcs_compl->stream_map = stream_map;
+			mcs_compl->stream_master_qid_map = stream_master_bitmap;
 			reinit_completion(&mcs_compl->completion);
 			spin_unlock(&mcs_compl->lock);
 			break;
@@ -2464,7 +2480,7 @@ static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
 	long completion_rc;
 
 	mcs_compl = hl_wait_multi_cs_completion_init(hdev,
-							mcs_data->stream_map);
+					mcs_data->stream_master_qid_map);
 	if (IS_ERR(mcs_compl))
 		return PTR_ERR(mcs_compl);
 
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index b950b514061007..74c2a587d907d2 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -593,18 +593,18 @@ struct asic_fixed_properties {
  * @completion: fence is implemented using completion
  * @refcount: refcount for this fence
  * @cs_sequence: sequence of the corresponding command submission
+ * @stream_master_qid_map: stream masters QID bitmap to represent all stream
+ *                         master QIDs that multi cs is waiting on
  * @error: mark this fence with error
  * @timestamp: timestamp upon completion
- * @stream_map: streams bitmap to represent all streams that multi cs is
- *              waiting on
  */
 struct hl_fence {
 	struct completion	completion;
 	struct kref		refcount;
 	u64			cs_sequence;
+	u32			stream_master_qid_map;
 	int			error;
 	ktime_t			timestamp;
-	u8			stream_map;
 };
 
 /**
@@ -1161,6 +1161,7 @@ struct fw_load_mgr {
  * @state_dump_init: initialize constants required for state dump
  * @get_sob_addr: get SOB base address offset.
  * @set_pci_memory_regions: setting properties of PCI memory regions
+ * @get_stream_master_qid_arr: get pointer to stream masters QID array
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1290,6 +1291,7 @@ struct hl_asic_funcs {
 	void (*state_dump_init)(struct hl_device *hdev);
 	u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
 	void (*set_pci_memory_regions)(struct hl_device *hdev);
+	u32* (*get_stream_master_qid_arr)(void);
 };
 
 
@@ -2283,16 +2285,16 @@ struct hl_mmu_funcs {
  * @completion: completion of any of the CS in the list
  * @lock: spinlock for the completion structure
  * @timestamp: timestamp for the multi-CS completion
+ * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS
+ *                        is waiting
  * @used: 1 if in use, otherwise 0
- * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
- *              is waiting
  */
 struct multi_cs_completion {
 	struct completion	completion;
 	spinlock_t		lock;
 	s64			timestamp;
+	u32			stream_master_qid_map;
 	u8			used;
-	u8			stream_map;
 };
 
 /**
@@ -2304,9 +2306,9 @@ struct multi_cs_completion {
  * @timestamp: timestamp of first completed CS
  * @wait_status: wait for CS status
  * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
+ * @stream_master_qid_map: bitmap of all stream master QIDs on which the
+ *                         multi-CS is waiting
  * @arr_len: fence_arr and seq_arr array length
- * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
- *              is waiting
  * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
  * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
  */
@@ -2318,8 +2320,8 @@ struct multi_cs_data {
 	s64		timestamp;
 	long		wait_status;
 	u32		completion_bitmap;
+	u32		stream_master_qid_map;
 	u8		arr_len;
-	u8		stream_map;
 	u8		gone_cs;
 	u8		update_ts;
 };
@@ -2541,6 +2543,7 @@ struct hl_device {
 
 	struct multi_cs_completion	multi_cs_completion[
 							MULTI_CS_MAX_USER_CTX];
+	u32				*stream_master_qid_arr;
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
@@ -2592,6 +2595,7 @@ struct hl_device {
 	u8				skip_reset_on_timeout;
 	u8				device_cpu_is_halted;
 	u8				supports_wait_for_multi_cs;
+	u8				stream_master_qid_arr_size;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 6d3beccad91bfe..76b7de8f1406e2 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -721,7 +721,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
 		/* update stream map of the first CS */
 		if (hdev->supports_wait_for_multi_cs)
-			staged_cs->fence->stream_map |= cs->fence->stream_map;
+			staged_cs->fence->stream_master_qid_map |=
+					cs->fence->stream_master_qid_map;
 	}
 
 	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index d18a1ab428978d..2f6d019c79e103 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -110,6 +110,17 @@
 
 #define MONITOR_SOB_STRING_SIZE		256
 
+static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = {
+	GAUDI_QUEUE_ID_DMA_0_0,
+	GAUDI_QUEUE_ID_DMA_0_1,
+	GAUDI_QUEUE_ID_DMA_0_2,
+	GAUDI_QUEUE_ID_DMA_0_3,
+	GAUDI_QUEUE_ID_DMA_1_0,
+	GAUDI_QUEUE_ID_DMA_1_1,
+	GAUDI_QUEUE_ID_DMA_1_2,
+	GAUDI_QUEUE_ID_DMA_1_3
+};
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
 		"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
 		"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -1871,6 +1882,9 @@ static int gaudi_sw_init(struct hl_device *hdev)
 	hdev->supports_wait_for_multi_cs = true;
 
 	hdev->asic_funcs->set_pci_memory_regions(hdev);
+	hdev->stream_master_qid_arr =
+				hdev->asic_funcs->get_stream_master_qid_arr();
+	hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;
 
 	return 0;
 
@@ -9353,6 +9367,11 @@ static void gaudi_state_dump_init(struct hl_device *hdev)
 	sds->funcs = gaudi_state_dump_funcs;
 }
 
+static u32 *gaudi_get_stream_master_qid_arr(void)
+{
+	return gaudi_stream_master;
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
 	.early_init = gaudi_early_init,
 	.early_fini = gaudi_early_fini,
@@ -9441,7 +9460,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
 	.state_dump_init = gaudi_state_dump_init,
 	.get_sob_addr = gaudi_get_sob_addr,
-	.set_pci_memory_regions = gaudi_set_pci_memory_regions
+	.set_pci_memory_regions = gaudi_set_pci_memory_regions,
+	.get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
 };
 
 /**
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index eacc5eadda97b3..2f0928c0fa8fdf 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -36,6 +36,8 @@
 #define NUMBER_OF_INTERRUPTS		(NUMBER_OF_CMPLT_QUEUES + \
 						NUMBER_OF_CPU_HW_QUEUES)
 
+#define GAUDI_STREAM_MASTER_ARR_SIZE	8
+
 #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
 #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
 #endif
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index d54c700c31cd58..af3f84d8f7104a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5589,6 +5589,11 @@ static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id)
 	return 0;
 }
 
+static u32 *goya_get_stream_master_qid_arr(void)
+{
+	return NULL;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5678,6 +5683,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.state_dump_init = goya_state_dump_init,
 	.get_sob_addr = &goya_get_sob_addr,
 	.set_pci_memory_regions = goya_set_pci_memory_regions,
+	.get_stream_master_qid_arr = goya_get_stream_master_qid_arr,
 };
 
 /*

From ffd27f8349a5d17542403a870869fdaf31eb20a5 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Thu, 5 Aug 2021 17:36:24 +0300
Subject: [PATCH 50/55] habanalabs/gaudi: fetch TPC/MME ECC errors from F/W

In case F/W security is enabled, the driver cannot access the ECC
registers, hence the driver must fetch the ECC info from the F/W.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 2f6d019c79e103..6671d1aca8e143 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7458,6 +7458,11 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	bool extract_info_from_fw;
 	int rc;
 
+	if (hdev->asic_prop.fw_security_enabled) {
+		extract_info_from_fw = true;
+		goto extract_ecc_info;
+	}
+
 	switch (event_type) {
 	case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR:
 	case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR:
@@ -7530,6 +7535,7 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 		return;
 	}
 
+extract_ecc_info:
 	if (extract_info_from_fw) {
 		ecc_address = le64_to_cpu(ecc_data->ecc_address);
 		ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);

From 02bbd7a52b854f5f7a6ec8700a8df124b9fe9841 Mon Sep 17 00:00:00 2001
From: Yuri Nudelman <ynudelman@habana.ai>
Date: Thu, 29 Jul 2021 11:44:28 +0300
Subject: [PATCH 51/55] habanalabs: add userptr_lookup node in debugfs

It is useful to have the ability to see which user address was pinned
to which physical address during the initial mapping. We already have
all that info stored, but no means to search this data (which may be
quite large).

Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../ABI/testing/debugfs-driver-habanalabs     |  8 +++
 drivers/misc/habanalabs/common/debugfs.c      | 72 +++++++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   | 19 +++++
 drivers/misc/habanalabs/common/memory.c       | 21 +-----
 4 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index e29156511388e2..284e2dfa61cd28 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -241,6 +241,14 @@ Description:    Displays a list with information about the currently user
                 pointers (user virtual addresses) that are pinned and mapped
                 to DMA addresses
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/userptr_lookup
+Date:           Aug 2021
+KernelVersion:  5.15
+Contact:        ogabbay@kernel.org
+Description:    Allows searching for specific user pointers (user virtual
+                addresses) that are pinned and mapped to DMA addresses, and
+                seeing their resolution to the specific DMA address.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/vm
 Date:           Jan 2019
 KernelVersion:  5.1
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 6a7df57883d955..985f1f3dbd20f4 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -320,6 +320,77 @@ static int vm_show(struct seq_file *s, void *data)
 	return 0;
 }
 
+static int userptr_lookup_show(struct seq_file *s, void *data)
+{
+	struct hl_debugfs_entry *entry = s->private;
+	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+	struct scatterlist *sg;
+	struct hl_userptr *userptr;
+	bool first = true;
+	u64 total_npages, npages, sg_start, sg_end;
+	dma_addr_t dma_addr;
+	int i;
+
+	spin_lock(&dev_entry->userptr_spinlock);
+
+	list_for_each_entry(userptr, &dev_entry->userptr_list, debugfs_list) {
+		if (dev_entry->userptr_lookup >= userptr->addr &&
+		dev_entry->userptr_lookup < userptr->addr + userptr->size) {
+			total_npages = 0;
+			for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents,
+					i) {
+				npages = hl_get_sg_info(sg, &dma_addr);
+				sg_start = userptr->addr +
+					total_npages * PAGE_SIZE;
+				sg_end = userptr->addr +
+					(total_npages + npages) * PAGE_SIZE;
+
+				if (dev_entry->userptr_lookup >= sg_start &&
+				    dev_entry->userptr_lookup < sg_end) {
+					dma_addr += (dev_entry->userptr_lookup -
+							sg_start);
+					if (first) {
+						first = false;
+						seq_puts(s, "\n");
+						seq_puts(s, " user virtual address         dma address       pid        region start     region size\n");
+						seq_puts(s, "---------------------------------------------------------------------------------------\n");
+					}
+					seq_printf(s, " 0x%-18llx  0x%-16llx  %-8u  0x%-16llx %-12llu\n",
+						dev_entry->userptr_lookup,
+						(u64)dma_addr, userptr->pid,
+						userptr->addr, userptr->size);
+				}
+				total_npages += npages;
+			}
+		}
+	}
+
+	spin_unlock(&dev_entry->userptr_spinlock);
+
+	if (!first)
+		seq_puts(s, "\n");
+
+	return 0;
+}
+
+static ssize_t userptr_lookup_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *f_pos)
+{
+	struct seq_file *s = file->private_data;
+	struct hl_debugfs_entry *entry = s->private;
+	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+	ssize_t rc;
+	u64 value;
+
+	rc = kstrtoull_from_user(buf, count, 16, &value);
+	if (rc)
+		return rc;
+
+	dev_entry->userptr_lookup = value;
+
+	return count;
+}
+
 static int mmu_show(struct seq_file *s, void *data)
 {
 	struct hl_debugfs_entry *entry = s->private;
@@ -1175,6 +1246,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
 	{"command_submission_jobs", command_submission_jobs_show, NULL},
 	{"userptr", userptr_show, NULL},
 	{"vm", vm_show, NULL},
+	{"userptr_lookup", userptr_lookup_show, userptr_lookup_write},
 	{"mmu", mmu_show, mmu_asid_va_write},
 	{"engines", engines_show, NULL}
 };
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 74c2a587d907d2..465fd909a7b7b9 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1836,6 +1836,7 @@ struct hl_debugfs_entry {
  * @state_dump_sem: protects state_dump.
  * @addr: next address to read/write from/to in read/write32.
  * @mmu_addr: next virtual address to translate to physical address in mmu_show.
+ * @userptr_lookup: the target user pointer to look up on demand.
  * @mmu_asid: ASID to use while translating in mmu_show.
  * @state_dump_head: index of the latest state dump
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
@@ -1863,6 +1864,7 @@ struct hl_dbg_device_entry {
 	struct rw_semaphore		state_dump_sem;
 	u64				addr;
 	u64				mmu_addr;
+	u64				userptr_lookup;
 	u32				mmu_asid;
 	u32				state_dump_head;
 	u8				i2c_bus;
@@ -2669,6 +2671,23 @@ struct hl_ioctl_desc {
  * Kernel module functions that can be accessed by entire module
  */
 
+/**
+ * hl_get_sg_info() - get number of pages and the DMA address from SG list.
+ * @sg: the SG list.
+ * @dma_addr: pointer to DMA address to return.
+ *
+ * Calculate the number of consecutive pages described by the SG list. Take the
+ * offset of the address in the first page, add to it the length and round it up
+ * to the number of needed pages.
+ */
+static inline u32 hl_get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
+{
+	*dma_addr = sg_dma_address(sg);
+
+	return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
+			(PAGE_SIZE - 1)) >> PAGE_SHIFT;
+}
+
 /**
  * hl_mem_area_inside_range() - Checks whether address+size are inside a range.
  * @address: The start address of the area we want to validate.
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 2f990d185cfd92..72954abf20fe59 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -805,23 +805,6 @@ int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 	return rc;
 }
 
-/**
- * get_sg_info() - get number of pages and the DMA address from SG list.
- * @sg: the SG list.
- * @dma_addr: pointer to DMA address to return.
- *
- * Calculate the number of consecutive pages described by the SG list. Take the
- * offset of the address in the first page, add to it the length and round it up
- * to the number of needed pages.
- */
-static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
-{
-	*dma_addr = sg_dma_address(sg);
-
-	return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
-			(PAGE_SIZE - 1)) >> PAGE_SHIFT;
-}
-
 /**
  * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
  *                                    memory
@@ -872,7 +855,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 	 */
 	total_npages = 0;
 	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
-		npages = get_sg_info(sg, &dma_addr);
+		npages = hl_get_sg_info(sg, &dma_addr);
 
 		total_npages += npages;
 
@@ -901,7 +884,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 
 	j = 0;
 	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
-		npages = get_sg_info(sg, &dma_addr);
+		npages = hl_get_sg_info(sg, &dma_addr);
 
 		/* align down to physical page size and save the offset */
 		if (first) {

From 3d1a2c29bb0fbedb30353f2ae63979e735618c59 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Sun, 8 Aug 2021 20:36:48 +0300
Subject: [PATCH 52/55] habanalabs/gaudi: unmask out of bounds SLM access
 interrupt

The out of bounds SLM access TPC interrupt indicates a severe compiler
bug and must be reported to the user.
This interrupt is currently masked, so unmask it.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 6671d1aca8e143..c8bd76e306794d 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -2686,7 +2686,7 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
 				tpc_id < TPC_NUMBER_OF_ENGINES;
 				tpc_id++, tpc_offset += TPC_CFG_OFFSET) {
 		/* Mask all arithmetic interrupts from TPC */
-		WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFF);
+		WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFE);
 		/* Set 16 cache lines */
 		WREG32_FIELD(TPC0_CFG_MSS_CONFIG, tpc_offset,
 				ICACHE_FETCH_LINE_NUM, 2);

From 4b1b3b53982b6b713ea9f806ed5c9057099e46aa Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 10 Aug 2021 21:02:33 +0300
Subject: [PATCH 53/55] habanalabs/gaudi: define DC POWER for secured PMC

In secured mode, the CGM is disabled. Therefore, the DC power is
higher. Without taking it into consideration, the utilization is
12-15% at idle.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c  | 6 +++++-
 drivers/misc/habanalabs/gaudi/gaudiP.h | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index c8bd76e306794d..ba1fcdc16f5094 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -506,7 +506,11 @@ static inline void set_default_power_values(struct hl_device *hdev)
 
 	if (hdev->card_type == cpucp_card_type_pmc) {
 		prop->max_power_default = MAX_POWER_DEFAULT_PMC;
-		prop->dc_power_default = DC_POWER_DEFAULT_PMC;
+
+		if (prop->fw_security_enabled)
+			prop->dc_power_default = DC_POWER_DEFAULT_PMC_SEC;
+		else
+			prop->dc_power_default = DC_POWER_DEFAULT_PMC;
 	} else {
 		prop->max_power_default = MAX_POWER_DEFAULT_PCI;
 		prop->dc_power_default = DC_POWER_DEFAULT_PCI;
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 2f0928c0fa8fdf..7addb31e3eee3a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -52,6 +52,8 @@
 #define DC_POWER_DEFAULT_PCI		60000		/* 60W */
 #define DC_POWER_DEFAULT_PMC		60000		/* 60W */
 
+#define DC_POWER_DEFAULT_PMC_SEC	97000		/* 97W */
+
 #define GAUDI_CPU_TIMEOUT_USEC		30000000	/* 30s */
 
 #define TPC_ENABLED_MASK		0xFF

From aaa24035ec31121d965b86460dce2164a6ed30a5 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Wed, 11 Aug 2021 15:39:00 +0300
Subject: [PATCH 54/55] habanalabs/gaudi: size should be printed in decimal

It's more readable for the size to be in decimal.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index ba1fcdc16f5094..f244a1b8abca6a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7266,7 +7266,7 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream
 
 	cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
 	size = RREG32(cq_tsize);
-	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
+	dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
 							stream, cq_ptr, size);
 }
 
@@ -7322,7 +7322,7 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 
 		addr = le64_to_cpu(bd->ptr);
 
-		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
+		dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
 							stream, ci, addr, len);
 
 		/* get previous ci, wrap if needed */

From a3f369db34e33236f994d4ca3f13655109394d06 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Tue, 17 Aug 2021 13:36:06 +0300
Subject: [PATCH 55/55] habanalabs/gaudi: invalidate PMMU mem cache on init

This must be done to clear the internal mem cache so we won't get
ECC errors on the first invalidation.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f244a1b8abca6a..ca9c3b9f08a180 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3906,6 +3906,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
 	WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8);
 	WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40);
 
+	/* mem cache invalidation */
+	WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);
+
 	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0);
 
 	WREG32(mmMMU_UP_MMU_ENABLE, 1);