Skip to content

Commit

Permalink
mm, dmemfs: register and handle the dmem mce
Browse files Browse the repository at this point in the history
dmemfs registers the MCE handler and sends a signal to the processes
whose VMAs are mapped to the MCE pfn.

Signed-off-by: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
  • Loading branch information
Yulei Zhang authored and intel-lab-lkp committed Dec 7, 2020
1 parent 9c950a9 commit dc3ab3d
Show file tree
Hide file tree
Showing 5 changed files with 231 additions and 17 deletions.
141 changes: 141 additions & 0 deletions fs/dmemfs/inode.c
Expand Up @@ -36,6 +36,47 @@ MODULE_LICENSE("GPL v2");

static uint __read_mostly max_alloc_try_dpages = 1;

/*
 * Tracking entry that links a dmemfs inode into the global list so the
 * MCE handler can map a poisoned pfn back to the owning inode.
 */
struct dmemfs_inode {
	struct inode *inode;	/* backing VFS inode */
	struct list_head link;	/* entry in dmemfs_inode_list */
};

/* All live dmemfs inodes; both protected by dmemfs_inode_lock. */
static LIST_HEAD(dmemfs_inode_list);
static DEFINE_SPINLOCK(dmemfs_inode_lock);

static struct dmemfs_inode *
dmemfs_create_dmemfs_inode(struct inode *inode)
{
struct dmemfs_inode *dmemfs_inode;

spin_lock(&dmemfs_inode_lock);
dmemfs_inode = kmalloc(sizeof(struct dmemfs_inode), GFP_NOIO);
if (!dmemfs_inode) {
pr_err("DMEMFS: Out of memory while getting dmemfs inode\n");
goto out;
}
dmemfs_inode->inode = inode;
list_add_tail(&dmemfs_inode->link, &dmemfs_inode_list);
out:
spin_unlock(&dmemfs_inode_lock);
return dmemfs_inode;
}

/* Unlink and free the tracking entry for @inode, if one exists. */
static void dmemfs_delete_dmemfs_inode(struct inode *inode)
{
	struct dmemfs_inode *cur, *tmp;

	spin_lock(&dmemfs_inode_lock);
	list_for_each_entry_safe(cur, tmp, &dmemfs_inode_list, link) {
		if (cur->inode != inode)
			continue;
		list_del(&cur->link);
		kfree(cur);
		break;
	}
	spin_unlock(&dmemfs_inode_lock);
}

struct dmemfs_mount_opts {
unsigned long dpage_size;
};
Expand Down Expand Up @@ -218,6 +259,13 @@ static unsigned long dmem_pgoff_to_index(struct inode *inode, pgoff_t pgoff)
return pgoff >> (sb->s_blocksize_bits - PAGE_SHIFT);
}

/* Convert a dmem page index back to a page offset within @inode. */
static pgoff_t dmem_index_to_pgoff(struct inode *inode, unsigned long index)
{
	unsigned int shift = inode->i_sb->s_blocksize_bits - PAGE_SHIFT;

	return index << shift;
}

static void *dmem_addr_to_entry(struct inode *inode, phys_addr_t addr)
{
struct super_block *sb = inode->i_sb;
Expand Down Expand Up @@ -806,6 +854,23 @@ static void dmemfs_evict_inode(struct inode *inode)
clear_inode(inode);
}

static struct inode *dmemfs_alloc_inode(struct super_block *sb)
{
struct inode *inode;

inode = alloc_inode_nonrcu();
if (inode)
dmemfs_create_dmemfs_inode(inode);
return inode;
}

/*
 * Tear down an inode: drop its MCE tracking entry, then free it.
 *
 * The original guarded only the list removal on a NULL inode while
 * still passing NULL to free_inode_nonrcu(); bail out early instead so
 * NULL is handled consistently.
 */
static void dmemfs_destroy_inode(struct inode *inode)
{
	if (!inode)
		return;

	dmemfs_delete_dmemfs_inode(inode);
	free_inode_nonrcu(inode);
}

/*
* Display the mount options in /proc/mounts.
*/
Expand All @@ -819,9 +884,11 @@ static int dmemfs_show_options(struct seq_file *m, struct dentry *root)
}

/*
 * Superblock operations; alloc_inode/destroy_inode additionally keep the
 * dmemfs inode list (used by the MCE handler) in sync.
 */
static const struct super_operations dmemfs_ops = {
	.alloc_inode = dmemfs_alloc_inode,
	.statfs = dmemfs_statfs,
	.evict_inode = dmemfs_evict_inode,
	.drop_inode = generic_delete_inode,
	.destroy_inode = dmemfs_destroy_inode,
	.show_options = dmemfs_show_options,
};

Expand Down Expand Up @@ -901,17 +968,91 @@ static struct file_system_type dmemfs_fs_type = {
.kill_sb = dmemfs_kill_sb,
};

/*
 * Find the dmemfs inode whose page cache holds the dmem entry for
 * physical address @addr.
 *
 * On success, returns the inode and sets *@pgoff to the page offset of
 * the matching entry inside that inode; returns NULL if no tracked
 * inode maps @addr.
 *
 * Caller must hold dmemfs_inode_lock so the inode list stays stable.
 * (The unused @slot local from the original has been dropped.)
 */
static struct inode *
dmemfs_find_inode_by_addr(phys_addr_t addr, pgoff_t *pgoff)
{
	struct dmemfs_inode *di;
	struct inode *inode;
	struct address_space *mapping;
	void *entry;
	void *mce_entry;

	list_for_each_entry(di, &dmemfs_inode_list, link) {
		inode = di->inode;
		mapping = inode->i_mapping;
		mce_entry = dmem_addr_to_entry(inode, addr);
		XA_STATE(xas, &mapping->i_pages, 0);
		rcu_read_lock();

		xas_for_each(&xas, entry, ULONG_MAX) {
			if (xas_retry(&xas, entry))
				continue;

			/* Entry changed under us: restart this walk. */
			if (unlikely(entry != xas_reload(&xas)))
				goto retry;

			if (mce_entry != entry)
				continue;
			*pgoff = dmem_index_to_pgoff(inode, xas.xa_index);
			rcu_read_unlock();
			return inode;
retry:
			xas_reset(&xas);
		}
		rcu_read_unlock();
	}
	return NULL;
}

/*
 * MCE notifier callback: when a poisoned dmem pfn belongs to a tracked
 * dmemfs inode that is still in use, signal the processes mapping it.
 */
static int dmemfs_mce_handler(struct notifier_block *this, unsigned long pfn,
		       void *v)
{
	struct dmem_mce_notifier_info *info = v;
	phys_addr_t paddr = __pfn_to_phys(pfn);
	struct inode *inode;
	pgoff_t pgoff;

	spin_lock(&dmemfs_inode_lock);
	inode = dmemfs_find_inode_by_addr(paddr, &pgoff);
	if (inode && atomic_read(&inode->i_count))
		collect_procs_and_signal_inode(inode, pgoff, pfn, info->flags);
	spin_unlock(&dmemfs_inode_lock);

	return 0;
}

/* Receives MCE events from the dmem notifier chain. */
static struct notifier_block dmemfs_mce_notifier = {
	.notifier_call = dmemfs_mce_handler,
};

/*
 * Module init: register the filesystem, then hook into the dmem MCE
 * notifier chain.  Unwinds in reverse order on failure.
 */
static int __init dmemfs_init(void)
{
	int ret;

	ret = register_filesystem(&dmemfs_fs_type);
	if (ret)
		goto reg_fs_fail;

	ret = dmem_register_mce_notifier(&dmemfs_mce_notifier);
	if (ret)
		goto reg_notifier_fail;

	/* Log success only after every registration has succeeded. */
	pr_info("dmemfs initialized\n");
	return 0;

reg_notifier_fail:
	unregister_filesystem(&dmemfs_fs_type);
reg_fs_fail:
	return ret;
}

/* Module exit: tear down in reverse order of dmemfs_init(). */
static void __exit dmemfs_uninit(void)
{
	dmem_unregister_mce_notifier(&dmemfs_mce_notifier);
	unregister_filesystem(&dmemfs_fs_type);
}

Expand Down
7 changes: 7 additions & 0 deletions include/linux/dmem.h
Expand Up @@ -23,6 +23,13 @@ bool is_dmem_pfn(unsigned long pfn);
#define dmem_free_page(addr) dmem_free_pages(addr, 1)

bool dmem_memory_failure(unsigned long pfn, int flags);

/* Payload handed to dmem MCE notifier callbacks. */
struct dmem_mce_notifier_info {
	int flags;	/* flags forwarded from dmem_memory_failure() */
};

/* Register/unregister a callback invoked when a used dmem page fails. */
int dmem_register_mce_notifier(struct notifier_block *nb);
int dmem_unregister_mce_notifier(struct notifier_block *nb);
#else
static inline int dmem_reserve_init(void)
{
Expand Down
2 changes: 2 additions & 0 deletions include/linux/mm.h
Expand Up @@ -3041,6 +3041,8 @@ extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern void collect_procs_and_signal_inode(struct inode *inode, pgoff_t pgoff,
unsigned long pfn, int flags);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
Expand Down
34 changes: 34 additions & 0 deletions mm/dmem.c
Expand Up @@ -70,6 +70,7 @@ struct dmem_node {

struct dmem_pool {
struct mutex lock;
struct raw_notifier_head mce_notifier_chain;

unsigned long region_num;
unsigned long registered_pages;
Expand All @@ -92,6 +93,7 @@ struct dmem_pool {

/* The single global dmem pool; mce_notifier_chain is walked on memory failure. */
static struct dmem_pool dmem_pool = {
	.lock = __MUTEX_INITIALIZER(dmem_pool.lock),
	.mce_notifier_chain = RAW_NOTIFIER_INIT(dmem_pool.mce_notifier_chain),
};

#define DMEM_PAGE_SIZE (1UL << dmem_pool.dpage_shift)
Expand Down Expand Up @@ -121,6 +123,35 @@ static struct dmem_pool dmem_pool = {
#define for_each_dmem_region(_dnode, _dregion) \
list_for_each_entry(_dregion, &(_dnode)->regions, node)

/*
 * Add @nb to the dmem MCE notifier chain.  Registration is serialized
 * against other (un)registrations by dmem_pool.lock.
 */
int dmem_register_mce_notifier(struct notifier_block *nb)
{
	int err;

	mutex_lock(&dmem_pool.lock);
	err = raw_notifier_chain_register(&dmem_pool.mce_notifier_chain, nb);
	mutex_unlock(&dmem_pool.lock);

	return err;
}
EXPORT_SYMBOL(dmem_register_mce_notifier);

/*
 * Remove @nb from the dmem MCE notifier chain, under the same
 * dmem_pool.lock serialization as registration.
 */
int dmem_unregister_mce_notifier(struct notifier_block *nb)
{
	int err;

	mutex_lock(&dmem_pool.lock);
	err = raw_notifier_chain_unregister(&dmem_pool.mce_notifier_chain, nb);
	mutex_unlock(&dmem_pool.lock);

	return err;
}
EXPORT_SYMBOL(dmem_unregister_mce_notifier);

/*
 * Walk the MCE notifier chain for @pfn.
 *
 * NOTE(review): raw notifier chains provide no locking of their own and
 * dmem_pool.lock is NOT taken here, while registration IS mutex-protected.
 * Presumably this runs from a context where sleeping is not allowed and
 * relies on notifiers being registered before failures can be reported —
 * confirm this against the callers of dmem_memory_failure().
 */
static int dmem_mce_notify(unsigned long pfn,
		struct dmem_mce_notifier_info *info)
{
	return raw_notifier_call_chain(&dmem_pool.mce_notifier_chain,
			pfn, info);
}

static inline int *dmem_nodelist(int nid)
{
return nid_to_dnode(nid)->nodelist;
Expand Down Expand Up @@ -1003,6 +1034,7 @@ bool dmem_memory_failure(unsigned long pfn, int flags)
u64 pos;
phys_addr_t addr = __pfn_to_phys(pfn);
bool used = false;
struct dmem_mce_notifier_info info;

dregion = find_dmem_region(addr, &pdnode);
if (!dregion)
Expand All @@ -1022,6 +1054,8 @@ bool dmem_memory_failure(unsigned long pfn, int flags)
pos = phys_to_dpage(addr) - dregion->dpage_start_pfn;
if (__test_and_set_bit(pos, dregion->bitmap)) {
used = true;
info.flags = flags;
dmem_mce_notify(pfn, &info);
} else {
pr_info("MCE: free dpage, mark %#lx disabled in dmem\n", pfn);
dnode_count_free_dpages(pdnode, -1);
Expand Down

0 comments on commit dc3ab3d

Please sign in to comment.