Skip to content

Commit

Permalink
habanalabs: add "in device creation" status
Browse files Browse the repository at this point in the history
On init, the disabled state is cleared right before hw_init, which
causes the device to report the "Operational" state before device
initialization is finished. Although the char device is not yet exposed
to the user at this stage, the sysfs entries are exposed.

This can cause errors in monitoring applications that use the sysfs
entries.

In order to avoid this, a new state "in device creation" is introduced
to be reported when the device is not disabled but is still in the init
flow.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
  • Loading branch information
oshpigelman authored and intel-lab-lkp committed Aug 20, 2021
1 parent 1bd35e7 commit c88e360
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 13 deletions.
3 changes: 3 additions & 0 deletions drivers/misc/habanalabs/common/device.c
Expand Up @@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
status = HL_DEVICE_STATUS_NEEDS_RESET;
else if (hdev->disabled)
status = HL_DEVICE_STATUS_MALFUNCTION;
else if (!hdev->init_done)
status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
else
status = HL_DEVICE_STATUS_OPERATIONAL;

Expand All @@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev,
case HL_DEVICE_STATUS_NEEDS_RESET:
return false;
case HL_DEVICE_STATUS_OPERATIONAL:
case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
default:
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/misc/habanalabs/common/habanalabs.h
Expand Up @@ -2010,7 +2010,7 @@ struct hl_state_dump_specs {

#define HL_STR_MAX 32

#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1)
#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1)

/* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
* x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
Expand Down
8 changes: 6 additions & 2 deletions drivers/misc/habanalabs/common/habanalabs_drv.c
Expand Up @@ -317,12 +317,16 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
hdev->asic_prop.fw_security_enabled = false;

/* Assign status description string */
strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
"disabled", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
"operational", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
"in reset", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
"disabled", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
"needs reset", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
"in device creation", HL_STR_MAX);

hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
Expand Down
14 changes: 5 additions & 9 deletions drivers/misc/habanalabs/common/sysfs.c
Expand Up @@ -285,16 +285,12 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
char *str;
char str[HL_STR_MAX];

if (atomic_read(&hdev->in_reset))
str = "In reset";
else if (hdev->disabled)
str = "Malfunction";
else if (hdev->needs_reset)
str = "Needs Reset";
else
str = "Operational";
strncpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX);

/* use uppercase for backward compatibility */
str[0] = 'A' + (str[0] - 'a');

return sprintf(buf, "%s\n", str);
}
Expand Down
4 changes: 3 additions & 1 deletion include/uapi/misc/habanalabs.h
Expand Up @@ -276,7 +276,9 @@ enum hl_device_status {
HL_DEVICE_STATUS_OPERATIONAL,
HL_DEVICE_STATUS_IN_RESET,
HL_DEVICE_STATUS_MALFUNCTION,
HL_DEVICE_STATUS_NEEDS_RESET
HL_DEVICE_STATUS_NEEDS_RESET,
HL_DEVICE_STATUS_IN_DEVICE_CREATION,
HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION
};

enum hl_server_type {
Expand Down

0 comments on commit c88e360

Please sign in to comment.