Skip to content

Commit

Permalink
pmem: add pmem_recovery_write() dax op
Browse files Browse the repository at this point in the history
pmem_recovery_write() consists of clearing poison via DSM,
clearing page HWPoison bit, re-state _PAGE_PRESENT bit,
cacheflush, write, and finally clearing bad-block record.

A competing pread thread is held off during recovery write
by the presence of bad-block record. A competing recovery_write
thread is serialized by a lock.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
  • Loading branch information
jchu314atgithub authored and intel-lab-lkp committed Jan 28, 2022
1 parent 6a98064 commit 7190481
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 6 deletions.
82 changes: 76 additions & 6 deletions drivers/nvdimm/pmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem,
}
}

static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector,
long cleared_blks)
{
badblocks_clear(&pmem->bb, sector, cleared_blks);
if (pmem->bb_state)
sysfs_notify_dirent(pmem->bb_state);
}

static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
phys_addr_t offset, unsigned int len)
{
Expand All @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
dev_dbg(dev, "%#llx clear %ld sector%s\n",
(unsigned long long) sector, cleared,
cleared > 1 ? "s" : "");
badblocks_clear(&pmem->bb, sector, cleared);
if (pmem->bb_state)
sysfs_notify_dirent(pmem->bb_state);
pmem_clear_badblocks(pmem, sector, cleared);
}

arch_invalidate_pmem(pmem->virt_addr + offset, len);
Expand Down Expand Up @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
bool bad_pmem;
bool do_recovery = false;
resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;

if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
PFN_PHYS(nr_pages))))
bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
PFN_PHYS(nr_pages));
if (bad_pmem && kaddr)
do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
if (bad_pmem && !do_recovery)
return -EIO;

if (kaddr)
Expand Down Expand Up @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
}

/*
* The recovery write thread started out as a normal pwrite thread and
* when the filesystem was told about potential media error in the
* range, filesystem turns the normal pwrite to a dax_recovery_write.
*
* The recovery write consists of clearing poison via DSM, clearing page
* HWPoison bit, reenable page-wide read-write permission, flush the
* caches and finally write. A competing pread thread needs to be held
* off during the recovery process since data read back might not be valid.
* And that's achieved by placing the badblock records clearing after
* the completion of the recovery write.
*
* Any competing recovery write thread needs to be serialized, and this is
* done via pmem device level lock .recovery_lock.
*/
static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
return 0;
size_t rc, len, off;
phys_addr_t pmem_off;
struct pmem_device *pmem = dax_get_private(dax_dev);
struct device *dev = pmem->bb.dev;
sector_t sector;
long cleared, cleared_blk;

mutex_lock(&pmem->recovery_lock);

/* If no poison found in the range, go ahead with write */
off = (unsigned long)addr & ~PAGE_MASK;
len = PFN_PHYS(PFN_UP(off + bytes));
if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
rc = _copy_from_iter_flushcache(addr, bytes, i);
goto write_done;
}

/* Not page-aligned range cannot be recovered */
if (off || !(PAGE_ALIGNED(bytes))) {
dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
addr, bytes);
rc = (size_t) -EIO;
goto write_done;
}

pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
sector = (pmem_off - pmem->data_offset) / 512;
cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len);
cleared_blk = cleared / 512;
if (cleared_blk > 0) {
hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared);
} else {
dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n",
cleared_blk);
rc = (size_t) -EIO;
goto write_done;
}
arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes);
rc = _copy_from_iter_flushcache(addr, bytes, i);
pmem_clear_badblocks(pmem, sector, cleared_blk);

write_done:
mutex_unlock(&pmem->recovery_lock);
return rc;
}

static const struct dax_operations pmem_dax_ops = {
Expand Down Expand Up @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev,
goto out_cleanup_dax;
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
set_dax_recovery(dax_dev);
mutex_init(&pmem->recovery_lock);
pmem->dax_dev = dax_dev;

rc = device_add_disk(dev, disk, pmem_attribute_groups);
Expand Down
1 change: 1 addition & 0 deletions drivers/nvdimm/pmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ struct pmem_device {
struct dax_device *dax_dev;
struct gendisk *disk;
struct dev_pagemap pgmap;
struct mutex recovery_lock;
};

long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
Expand Down

0 comments on commit 7190481

Please sign in to comment.