Skip to content

Commit

Permalink
dax: add DAX_RECOVERY flag and .recovery_write dev_pgmap_ops
Browse files Browse the repository at this point in the history
Introduce DAX_RECOVERY flag to dax_direct_access(). The flag is
not set by default in dax_direct_access() such that the helper
does not translate a pmem range to kernel virtual address if the
range contains uncorrectable errors.  When the flag is set,
the helper ignores the UEs and returns the kernel virtual address so
that the caller may get on with data recovery via write.

Also introduce a new dev_pagemap_ops .recovery_write function.
The function is applicable to FSDAX devices only. The device
page backend driver provides .recovery_write function if the
device has underlying mechanism to clear the uncorrectable
errors on the fly.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
  • Loading branch information
jchu314atgithub authored and intel-lab-lkp committed Mar 19, 2022
1 parent 291591f commit 203570f
Show file tree
Hide file tree
Showing 16 changed files with 107 additions and 33 deletions.
23 changes: 21 additions & 2 deletions drivers/dax/super.c
Expand Up @@ -29,6 +29,7 @@ struct dax_device {
void *private;
unsigned long flags;
const struct dax_operations *ops;
struct dev_pagemap *pgmap;
};

static dev_t dax_devt;
Expand Down Expand Up @@ -117,14 +118,15 @@ enum dax_device_flags {
* @dax_dev: a dax_device instance representing the logical memory range
* @pgoff: offset in pages from the start of the device to translate
* @nr_pages: number of consecutive pages caller can handle relative to @pfn
* @flags: by default 0, set to DAX_RECOVERY to kick start dax recovery
* @kaddr: output parameter that returns a virtual address mapping of pfn
* @pfn: output parameter that returns an absolute pfn translation of @pgoff
*
* Return: negative errno if an error occurs, otherwise the number of
* pages accessible at the device relative @pgoff.
*/
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
void **kaddr, pfn_t *pfn)
int flags, void **kaddr, pfn_t *pfn)
{
long avail;

Expand All @@ -137,7 +139,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
if (nr_pages < 0)
return -EINVAL;

avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, flags,
kaddr, pfn);
if (!avail)
return -ERANGE;
Expand Down Expand Up @@ -194,6 +196,18 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_recovery_write() - write data through to media with known poison
 * @dax_dev: dax device whose backing dev_pagemap may support recovery
 * @pgoff: page offset (relative to the dax device) being written
 * @addr: kernel virtual address previously obtained from
 *	dax_direct_access() with the DAX_RECOVERY flag set
 * @bytes: number of bytes to write
 * @iter: source data for the write
 *
 * Return: number of bytes successfully written, or -EIO (note: cast into
 * the unsigned size_t return type) when the backing device provides no
 * .recovery_write operation.
 */
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *iter)
{
	struct dev_pagemap *pgmap = dax_dev->pgmap;

	/*
	 * pgmap->ops itself may be NULL: pmem only installs pgmap ops in
	 * PFN_MODE_PMEM mode, while set_dax_pgmap() is called regardless,
	 * so check it before looking up .recovery_write.
	 */
	if (!pgmap || !pgmap->ops || !pgmap->ops->recovery_write)
		return -EIO;
	return pgmap->ops->recovery_write(pgmap, pgoff, addr, bytes,
			(void *)iter);
}
EXPORT_SYMBOL_GPL(dax_recovery_write);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
Expand Down Expand Up @@ -248,6 +262,11 @@ void set_dax_nomc(struct dax_device *dax_dev)
set_bit(DAXDEV_NOMC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(set_dax_nomc);
/*
 * set_dax_pgmap - associate a dev_pagemap with a dax_device.
 *
 * Stores @pgmap in the dax_device so dax_recovery_write() can later reach
 * the pgmap's .recovery_write operation.  No locking or lifetime handling
 * here; the caller (e.g. pmem_attach_disk) owns the pgmap.
 */
void set_dax_pgmap(struct dax_device *dax_dev, struct dev_pagemap *pgmap)
{
	dax_dev->pgmap = pgmap;
}
EXPORT_SYMBOL_GPL(set_dax_pgmap);

bool dax_alive(struct dax_device *dax_dev)
{
Expand Down
4 changes: 2 additions & 2 deletions drivers/md/dm-linear.c
Expand Up @@ -173,11 +173,11 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
}

/*
 * dm-linear .direct_access hook: remap @pgoff onto the underlying device
 * and delegate to the dax core, passing @flags straight through.
 */
static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
		long nr_pages, int flags, void **kaddr, pfn_t *pfn)
{
	struct dax_device *dax_dev;

	/* linear_dax_pgoff() adjusts @pgoff in place for the target offset */
	dax_dev = linear_dax_pgoff(ti, &pgoff);

	return dax_direct_access(dax_dev, pgoff, nr_pages, flags, kaddr, pfn);
}

static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
Expand Down
5 changes: 3 additions & 2 deletions drivers/md/dm-log-writes.c
Expand Up @@ -912,11 +912,12 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
}

/*
 * dm-log-writes .direct_access hook: translate @pgoff for the backing
 * device, then forward the request (including @flags) to the dax core.
 */
static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
		long nr_pages, int flags,
		void **kaddr, pfn_t *pfn)
{
	struct dax_device *dax_dev;

	/* log_writes_dax_pgoff() rewrites @pgoff relative to the data device */
	dax_dev = log_writes_dax_pgoff(ti, &pgoff);

	return dax_direct_access(dax_dev, pgoff, nr_pages, flags, kaddr, pfn);
}

static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
Expand Down
4 changes: 2 additions & 2 deletions drivers/md/dm-stripe.c
Expand Up @@ -317,11 +317,11 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
}

/*
 * dm-stripe .direct_access hook: resolve the stripe member holding @pgoff
 * and delegate to the dax core with @flags unchanged.
 */
static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
		long nr_pages, int flags, void **kaddr, pfn_t *pfn)
{
	struct dax_device *dax_dev;

	/* stripe_dax_pgoff() maps @pgoff onto the owning stripe device */
	dax_dev = stripe_dax_pgoff(ti, &pgoff);

	return dax_direct_access(dax_dev, pgoff, nr_pages, flags, kaddr, pfn);
}

static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
Expand Down
2 changes: 1 addition & 1 deletion drivers/md/dm-target.c
Expand Up @@ -142,7 +142,7 @@ static void io_err_release_clone_rq(struct request *clone,
}

/*
 * error target .direct_access hook: the io_err target fails every request
 * by design, so DAX access is unconditionally refused.
 */
static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
		long nr_pages, int flags, void **kaddr, pfn_t *pfn)
{
	return -EIO;
}
Expand Down
5 changes: 3 additions & 2 deletions drivers/md/dm-writecache.c
Expand Up @@ -286,7 +286,8 @@ static int persistent_memory_claim(struct dm_writecache *wc)

id = dax_read_lock();

da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, 0,
&wc->memory_map, &pfn);
if (da < 0) {
wc->memory_map = NULL;
r = da;
Expand All @@ -309,7 +310,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
do {
long daa;
daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
NULL, &pfn);
0, NULL, &pfn);
if (daa <= 0) {
r = daa ? daa : -EINVAL;
goto err3;
Expand Down
5 changes: 3 additions & 2 deletions drivers/md/dm.c
Expand Up @@ -1001,7 +1001,8 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
long nr_pages, int flags, void **kaddr,
pfn_t *pfn)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
Expand All @@ -1019,7 +1020,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (len < 1)
goto out;
nr_pages = min(len, nr_pages);
ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
ret = ti->type->direct_access(ti, pgoff, nr_pages, flags, kaddr, pfn);

out:
dm_put_live_table(md, srcu_idx);
Expand Down
27 changes: 23 additions & 4 deletions drivers/nvdimm/pmem.c
Expand Up @@ -238,11 +238,11 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,

/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
long nr_pages, int flags, void **kaddr, pfn_t *pfn)
{
resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;

if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
if (!flags && unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
PFN_PHYS(nr_pages))))
return -EIO;

Expand Down Expand Up @@ -277,18 +277,30 @@ static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
}

/*
 * pmem .direct_access hook: recover the pmem_device from the dax_device's
 * private data and forward everything to __pmem_direct_access().
 */
static long pmem_dax_direct_access(struct dax_device *dax_dev,
		pgoff_t pgoff, long nr_pages, int flags, void **kaddr,
		pfn_t *pfn)
{
	struct pmem_device *pmem;

	pmem = dax_get_private(dax_dev);

	return __pmem_direct_access(pmem, pgoff, nr_pages, flags, kaddr, pfn);
}

/*
 * DAX operations for pmem; the DAX_RECOVERY flag passed to .direct_access
 * is honored in __pmem_direct_access(), which skips the badblocks check
 * when the flag is set.
 */
static const struct dax_operations pmem_dax_ops = {
	.direct_access = pmem_dax_direct_access,
	.zero_page_range = pmem_dax_zero_page_range,
};

/*
 * pmem_recovery_write - dev_pagemap .recovery_write stub for pmem.
 *
 * Placeholder: always reports 0 bytes written.  The real implementation
 * (clearing poison and writing through) is to follow.
 *
 * NOTE(review): nothing in this patch assigns pgmap->owner for the pmem
 * pgmap, so @pgmap->owner may be NULL here — guard against dereferencing
 * it until the ownership wiring lands; confirm against pmem_attach_disk().
 */
static size_t pmem_recovery_write(struct dev_pagemap *pgmap, pgoff_t pgoff,
		void *addr, size_t bytes, void *iter)
{
	struct pmem_device *pmem = pgmap->owner;

	if (!pmem) {
		pr_warn("%s: pgmap has no pmem owner\n", __func__);
		return 0;
	}

	dev_warn(pmem->bb.dev, "%s: not yet implemented\n", __func__);

	/* XXX more later */
	return 0;
}

static ssize_t write_cache_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
Expand Down Expand Up @@ -349,6 +361,10 @@ static void pmem_release_disk(void *__pmem)
blk_cleanup_disk(pmem->disk);
}

/*
 * dev_pagemap ops installed only for PFN_MODE_PMEM namespaces (FSDAX);
 * .recovery_write is currently a stub (see pmem_recovery_write()).
 */
static const struct dev_pagemap_ops pmem_pgmap_ops = {
	.recovery_write = pmem_recovery_write,
};

static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
Expand Down Expand Up @@ -380,6 +396,8 @@ static int pmem_attach_disk(struct device *dev,
rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
if (rc)
return rc;
if (nd_pfn->mode == PFN_MODE_PMEM)
pmem->pgmap.ops = &pmem_pgmap_ops;
}

/* we're attaching a block device, disable raw namespace access */
Expand Down Expand Up @@ -464,6 +482,7 @@ static int pmem_attach_disk(struct device *dev,
}
set_dax_nocache(dax_dev);
set_dax_nomc(dax_dev);
set_dax_pgmap(dax_dev, &pmem->pgmap);
if (is_nvdimm_sync(nd_region))
set_dax_synchronous(dax_dev);
rc = dax_add_host(dax_dev, disk);
Expand Down
2 changes: 1 addition & 1 deletion drivers/nvdimm/pmem.h
Expand Up @@ -27,7 +27,7 @@ struct pmem_device {
};

long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
long nr_pages, int flag, void **kaddr, pfn_t *pfn);

#ifdef CONFIG_MEMORY_FAILURE
static inline bool test_and_clear_pmem_poison(struct page *page)
Expand Down
4 changes: 2 additions & 2 deletions drivers/s390/block/dcssblk.c
Expand Up @@ -32,7 +32,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
static void dcssblk_release(struct gendisk *disk, fmode_t mode);
static void dcssblk_submit_bio(struct bio *bio);
static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
long nr_pages, int flags, void **kaddr, pfn_t *pfn);

static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";

Expand Down Expand Up @@ -927,7 +927,7 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,

static long
dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
long nr_pages, int flags, void **kaddr, pfn_t *pfn)
{
struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);

Expand Down
32 changes: 26 additions & 6 deletions fs/dax.c
Expand Up @@ -722,7 +722,7 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
int id;

id = dax_read_lock();
rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, 0, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
Expand Down Expand Up @@ -1013,7 +1013,7 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
long length;

id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), 0,
NULL, pfnp);
if (length < 0) {
rc = length;
Expand Down Expand Up @@ -1123,7 +1123,7 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
void *kaddr;
long ret;

ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
ret = dax_direct_access(dax_dev, pgoff, 1, 0, &kaddr, NULL);
if (ret > 0) {
memset(kaddr + offset, 0, size);
dax_flush(dax_dev, kaddr + offset, size);
Expand Down Expand Up @@ -1240,15 +1240,27 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
int flags, recov;
void *kaddr;
long nrpg;

if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}

map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
&kaddr, NULL);
recov = 0;
flags = 0;
nrpg = PHYS_PFN(size);
map_len = dax_direct_access(dax_dev, pgoff, nrpg, flags,
&kaddr, NULL);
if ((map_len == -EIO) && (iov_iter_rw(iter) == WRITE)) {
flags |= DAX_RECOVERY;
map_len = dax_direct_access(dax_dev, pgoff, nrpg,
flags, &kaddr, NULL);
if (map_len > 0)
recov++;
}
if (map_len < 0) {
ret = map_len;
break;
Expand All @@ -1260,7 +1272,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (map_len > end - pos)
map_len = end - pos;

if (iov_iter_rw(iter) == WRITE)
if (recov)
xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
map_len, iter);
else if (iov_iter_rw(iter) == WRITE)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
Expand All @@ -1271,6 +1286,11 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
length -= xfer;
done += xfer;

if (recov && (xfer == (ssize_t) -EIO)) {
pr_warn("dax_recovery_write failed\n");
ret = -EIO;
break;
}
if (xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
Expand Down
4 changes: 2 additions & 2 deletions fs/fuse/dax.c
Expand Up @@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);

id = dax_read_lock();
nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
NULL);
nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), 0,
NULL, NULL);
dax_read_unlock(id);
if (nr_pages < 0) {
pr_debug("dax_direct_access() returned %ld\n", nr_pages);
Expand Down
12 changes: 9 additions & 3 deletions include/linux/dax.h
Expand Up @@ -14,14 +14,17 @@ struct iomap_ops;
struct iomap_iter;
struct iomap;

/* Flag to communicate for DAX recovery operation */
#define DAX_RECOVERY 0x1

struct dax_operations {
/*
* direct_access: translate a device-relative
* logical-page-offset into an absolute physical pfn. Return the
* number of pages available for DAX at that pfn.
*/
long (*direct_access)(struct dax_device *, pgoff_t, long,
void **, pfn_t *);
long (*direct_access)(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, int flags, void **kaddr, pfn_t *pfn);
/*
* Validate whether this device is usable as an fsdax backing
* device.
Expand All @@ -40,6 +43,8 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
/*
* Check if given mapping is supported by the file / underlying device.
*/
Expand Down Expand Up @@ -91,6 +96,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,

void set_dax_nocache(struct dax_device *dax_dev);
void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_pgmap(struct dax_device *dax_dev, struct dev_pagemap *pgmap);

struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
Expand Down Expand Up @@ -178,7 +184,7 @@ static inline void dax_read_unlock(int id)
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
void **kaddr, pfn_t *pfn);
int flags, void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
Expand Down

0 comments on commit 203570f

Please sign in to comment.