drm/amdkfd: deregister svm range

When an application explicitly calls unmap, or the mapping is removed by
mmput as the application exits, the driver receives an MMU_NOTIFY_UNMAP
event. The event handler first removes the svm range from the process svms
object tree and list, then unmaps it from the GPUs (in the following patch).

Split svm ranges to handle partial unmapping. To
avoid deadlocks, updating MMU notifiers, range lists and interval trees
is done in a deferred worker. New child ranges are attached to their
parent range's child_list until the worker can update the
svm_range_list. svm_range_set_attr flushes deferred work and takes the
mmap_write_lock to guarantee that it has an up-to-date svm_range_list.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
PhilipYangA authored and intel-lab-lkp committed Apr 1, 2021
1 parent df82ede commit 545d800
Showing 3 changed files with 305 additions and 1 deletion.
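
Before the hunks, a miniature of the deferred-update pattern described in the commit message may help orient the reader. This is an editor's sketch with invented mini_* names, not code from the patch; the real driver additionally serializes against the mmap write lock and MMU notifiers before applying each op.

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Writers queue items under a spinlock and schedule a worker; the
 * worker drains the list and applies each op once it is safe to take
 * sleeping locks.
 */
struct mini_item {
        struct list_head node;
        int op;                         /* e.g. unmap vs. re-add */
};

struct mini_list {
        struct list_head deferred;      /* pending mini_item entries */
        spinlock_t lock;                /* protects 'deferred' */
        struct work_struct work;        /* drains 'deferred' */
};

static void mini_work_fn(struct work_struct *work)
{
        struct mini_list *ml = container_of(work, struct mini_list, work);
        struct mini_item *item;

        spin_lock(&ml->lock);
        while (!list_empty(&ml->deferred)) {
                item = list_first_entry(&ml->deferred,
                                        struct mini_item, node);
                list_del_init(&item->node);
                spin_unlock(&ml->lock);
                /* take sleeping locks and handle item->op here */
                spin_lock(&ml->lock);
        }
        spin_unlock(&ml->lock);
}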
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -735,6 +735,9 @@ struct svm_range_list {
        struct mutex lock;
        struct rb_root_cached objects;
        struct list_head list;
        struct work_struct deferred_list_work;
        struct list_head deferred_range_list;
        spinlock_t deferred_list_lock;
};

/* Process data */
285 changes: 284 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -136,6 +136,8 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
        INIT_LIST_HEAD(&prange->update_list);
        INIT_LIST_HEAD(&prange->remove_list);
        INIT_LIST_HEAD(&prange->insert_list);
        INIT_LIST_HEAD(&prange->deferred_list);
        INIT_LIST_HEAD(&prange->child_list);
        mutex_init(&prange->lock);
        svm_range_set_default_attributes(&prange->preferred_loc,
                                         &prange->prefetch_loc,
@@ -412,6 +414,17 @@ svm_range_split_head(struct svm_range *prange, struct svm_range *new,
        return r;
}

void svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
                         struct svm_range *pchild, enum svm_work_list_ops op)
{
        pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
                 pchild, pchild->start, pchild->last, prange, op);

        pchild->work_item.mm = mm;
        pchild->work_item.op = op;
        list_add_tail(&pchild->child_list, &prange->child_list);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
@@ -471,6 +484,30 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
        return r;
}

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
 */
static void
svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
                                   struct mm_struct *mm)
{
retry_flush_work:
        flush_work(&svms->deferred_list_work);
        mmap_write_lock(mm);

        if (list_empty(&svms->deferred_range_list))
                return;
        mmap_write_unlock(mm);
        pr_debug("retry flush\n");
        goto retry_flush_work;
}
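
Callers use this helper in place of a bare mmap_write_lock() when they need a stable svm_range_list, as svm_range_set_attr does later in this patch. A hedged caller sketch, with the actual update elided:

        svm_range_list_lock_and_flush_work(svms, mm);
        /* deferred work flushed, mmap write lock held; svms is stable */
        /* ... update range list, interval tree and notifiers ... */
        mmap_write_unlock(mm);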

struct svm_range *svm_range_clone(struct svm_range *old)
{
        struct svm_range *new;
@@ -611,15 +648,255 @@ svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new,
        return r;
}

static void
svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
                                            struct svm_range *prange)
{
        unsigned long start;
        unsigned long last;

        start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
        last = prange->notifier.interval_tree.last >> PAGE_SHIFT;

        if (prange->start == start && prange->last == last)
                return;

        pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
                 prange->svms, prange, start, last, prange->start,
                 prange->last);

        if (start != 0 && last != 0) {
                interval_tree_remove(&prange->it_node, &prange->svms->objects);
                svm_range_remove_notifier(prange);
        }
        prange->it_node.start = prange->start;
        prange->it_node.last = prange->last;

        interval_tree_insert(&prange->it_node, &prange->svms->objects);
        svm_range_add_notifier_locked(mm, prange);
}

static void
svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
{
        struct mm_struct *mm = prange->work_item.mm;

        switch (prange->work_item.op) {
        case SVM_OP_NULL:
                pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
                         svms, prange, prange->start, prange->last);
                break;
        case SVM_OP_UNMAP_RANGE:
                pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
                         svms, prange, prange->start, prange->last);
                svm_range_unlink(prange);
                svm_range_remove_notifier(prange);
                svm_range_free(prange);
                break;
        case SVM_OP_UPDATE_RANGE_NOTIFIER:
                pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
                         svms, prange, prange->start, prange->last);
                svm_range_update_notifier_and_interval_tree(mm, prange);
                break;
        case SVM_OP_ADD_RANGE:
                pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
                         prange->start, prange->last);
                svm_range_add_to_svms(prange);
                svm_range_add_notifier_locked(mm, prange);
                break;
        default:
                WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
                          prange->work_item.op);
        }
}

static void svm_range_deferred_list_work(struct work_struct *work)
{
        struct svm_range_list *svms;
        struct svm_range *prange;
        struct mm_struct *mm;

        svms = container_of(work, struct svm_range_list, deferred_list_work);
        pr_debug("enter svms 0x%p\n", svms);

        spin_lock(&svms->deferred_list_lock);
        while (!list_empty(&svms->deferred_range_list)) {
                prange = list_first_entry(&svms->deferred_range_list,
                                          struct svm_range, deferred_list);
                spin_unlock(&svms->deferred_list_lock);
                pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
                         prange->start, prange->last, prange->work_item.op);

                mm = prange->work_item.mm;
                mmap_write_lock(mm);
                mutex_lock(&svms->lock);

                /* Removal from deferred_list must happen under the mmap
                 * write lock. Otherwise svm_range_list_lock_and_flush_work
                 * may take the mmap write lock and proceed because
                 * deferred_list is already empty, while this handler is
                 * still blocked waiting for the same lock.
                 */
                spin_lock(&svms->deferred_list_lock);
                list_del_init(&prange->deferred_list);
                spin_unlock(&svms->deferred_list_lock);

                while (!list_empty(&prange->child_list)) {
                        struct svm_range *pchild;

                        pchild = list_first_entry(&prange->child_list,
                                                  struct svm_range, child_list);
                        pr_debug("child prange 0x%p op %d\n", pchild,
                                 pchild->work_item.op);
                        list_del_init(&pchild->child_list);
                        svm_range_handle_list_op(svms, pchild);
                }

                svm_range_handle_list_op(svms, prange);
                mutex_unlock(&svms->lock);
                mmap_write_unlock(mm);

                spin_lock(&svms->deferred_list_lock);
        }
        spin_unlock(&svms->deferred_list_lock);

        pr_debug("exit svms 0x%p\n", svms);
}

void
svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
                        struct mm_struct *mm, enum svm_work_list_ops op)
{
        spin_lock(&svms->deferred_list_lock);
        /* if prange is already on the deferred list, only update its op */
        if (!list_empty(&prange->deferred_list)) {
                pr_debug("update exist prange 0x%p work op %d\n", prange, op);
                WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n");
                if (op != SVM_OP_NULL &&
                    prange->work_item.op != SVM_OP_UNMAP_RANGE)
                        prange->work_item.op = op;
        } else {
                prange->work_item.op = op;
                prange->work_item.mm = mm;
                list_add_tail(&prange->deferred_list,
                              &prange->svms->deferred_range_list);
                pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
                         prange, prange->start, prange->last, op);
        }
        spin_unlock(&svms->deferred_list_lock);
}
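
Note the update rule above: if a range is already queued, a later request replaces the pending op unless the new op is SVM_OP_NULL or the pending op is SVM_OP_UNMAP_RANGE. For example, a range queued for SVM_OP_UNMAP_RANGE keeps that op even if SVM_OP_UPDATE_RANGE_NOTIFIER is requested afterwards; a pending unmap is never downgraded.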

void schedule_deferred_list_work(struct svm_range_list *svms)
{
        spin_lock(&svms->deferred_list_lock);
        if (!list_empty(&svms->deferred_range_list))
                schedule_work(&svms->deferred_list_work);
        spin_unlock(&svms->deferred_list_lock);
}

static void
svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
                      struct svm_range *prange, unsigned long start,
                      unsigned long last)
{
        struct svm_range *head;
        struct svm_range *tail;

        if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
                pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
                         prange->start, prange->last);
                return;
        }
        if (start > prange->last || last < prange->start)
                return;

        head = tail = prange;
        if (start > prange->start)
                svm_range_split(prange, prange->start, start - 1, &tail);
        if (last < tail->last)
                svm_range_split(tail, last + 1, tail->last, &head);

        if (head != prange && tail != prange) {
                svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
                svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
        } else if (tail != prange) {
                svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
        } else if (head != prange) {
                svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
        } else if (parent != prange) {
                prange->work_item.op = SVM_OP_UNMAP_RANGE;
        }
}
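
To make the split cases concrete, here is a worked example (editor's illustration with invented page numbers). Suppose prange covers pages [0x1000, 0x10ff]:

- Unmap the middle, [0x1040, 0x10bf]: prange is trimmed to [0x1000, 0x103f]; head [0x1040, 0x10bf] is queued as SVM_OP_UNMAP_RANGE and tail [0x10c0, 0x10ff] as SVM_OP_ADD_RANGE.
- Unmap the tail, [0x1080, 0x10ff]: prange is trimmed to [0x1000, 0x107f]; tail [0x1080, 0x10ff] is queued as SVM_OP_UNMAP_RANGE.
- Unmap the head, [0x1000, 0x107f]: prange is trimmed to [0x1080, 0x10ff]; head [0x1000, 0x107f] is queued as SVM_OP_UNMAP_RANGE.
- Unmap all of a child range (parent != prange): the child is simply marked SVM_OP_UNMAP_RANGE in place.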

static void
svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
                         unsigned long start, unsigned long last)
{
        struct svm_range_list *svms;
        struct svm_range *pchild;
        struct kfd_process *p;
        bool unmap_parent;

        p = kfd_lookup_process_by_mm(mm);
        if (!p)
                return;
        svms = &p->svms;

        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
                 prange, prange->start, prange->last, start, last);

        unmap_parent = start <= prange->start && last >= prange->last;

        list_for_each_entry(pchild, &prange->child_list, child_list)
                svm_range_unmap_split(mm, prange, pchild, start, last);
        svm_range_unmap_split(mm, prange, prange, start, last);

        if (unmap_parent)
                svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
        else
                svm_range_add_list_work(svms, prange, mm,
                                        SVM_OP_UPDATE_RANGE_NOTIFIER);
        schedule_deferred_list_work(svms);

        kfd_unref_process(p);
}

/**
 * svm_range_cpu_invalidate_pagetables - interval notifier callback
 *
 * MMU range unmap notifier to remove svm ranges
 */
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
                                    const struct mmu_notifier_range *range,
                                    unsigned long cur_seq)
{
        struct svm_range *prange;
        unsigned long start;
        unsigned long last;

        if (range->event == MMU_NOTIFY_RELEASE)
                return true;

        start = mni->interval_tree.start;
        last = mni->interval_tree.last;
        start = (start > range->start ? start : range->start) >> PAGE_SHIFT;
        last = (last < (range->end - 1) ? last : range->end - 1) >> PAGE_SHIFT;
        pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
                 start, last, range->start >> PAGE_SHIFT,
                 (range->end - 1) >> PAGE_SHIFT,
                 mni->interval_tree.start >> PAGE_SHIFT,
                 mni->interval_tree.last >> PAGE_SHIFT, range->event);

        prange = container_of(mni, struct svm_range, notifier);

        switch (range->event) {
        case MMU_NOTIFY_UNMAP:
                svm_range_unmap_from_cpu(mni->mm, prange, start, last);
                break;
        default:
                break;
        }

        return true;
}
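
The start/last arithmetic above clamps the invalidated byte range to the notifier's interval before converting to page numbers. As a worked example with invented addresses: if the notifier covers bytes [0x200000, 0x3fffff] and the unmap spans [0x300000, 0x500000), the intersection is [0x300000, 0x3fffff], which with a 4K PAGE_SHIFT of 12 yields pages [0x300, 0x3ff].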

@@ -628,6 +905,9 @@ void svm_range_list_fini(struct kfd_process *p)
        mutex_destroy(&p->svms.lock);

        pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);

        /* Ensure list work is finished before process is destroyed */
        flush_work(&p->svms.deferred_list_work);
}

int svm_range_list_init(struct kfd_process *p)
@@ -637,6 +917,9 @@ int svm_range_list_init(struct kfd_process *p)
        svms->objects = RB_ROOT_CACHED;
        mutex_init(&svms->lock);
        INIT_LIST_HEAD(&svms->list);
        INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
        INIT_LIST_HEAD(&svms->deferred_range_list);
        spin_lock_init(&svms->deferred_list_lock);

        return 0;
}
@@ -754,7 +1037,7 @@ svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,

        mutex_lock(&process_info->lock);

-       mmap_write_lock(mm);
+       svm_range_list_lock_and_flush_work(svms, mm);

        if (!svm_range_is_valid(mm, start, size)) {
                pr_debug("invalid range\n");
18 changes: 18 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -32,6 +32,18 @@
#include "amdgpu.h"
#include "kfd_priv.h"

enum svm_work_list_ops {
        SVM_OP_NULL,
        SVM_OP_UNMAP_RANGE,
        SVM_OP_UPDATE_RANGE_NOTIFIER,
        SVM_OP_ADD_RANGE
};

struct svm_work_list_item {
        enum svm_work_list_ops op;
        struct mm_struct *mm;
};

/**
 * struct svm_range - shared virtual memory range
 *
@@ -53,6 +65,9 @@
 * @actual_loc: the actual location, 0 for CPU, or GPU id
 * @granularity: migration granularity, log2 num pages
 * @notifier: register mmu interval notifier
 * @work_item: deferred work item information
 * @deferred_list: list header used to add range to deferred list
 * @child_list: list header for split ranges which are not added to svms yet
 * @bitmap_access: index bitmap of GPUs which can access the range
 * @bitmap_aip: index bitmap of GPUs which can access the range in place
 *
@@ -78,6 +93,9 @@ struct svm_range {
        uint32_t actual_loc;
        uint8_t granularity;
        struct mmu_interval_notifier notifier;
        struct svm_work_list_item work_item;
        struct list_head deferred_list;
        struct list_head child_list;
        DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
        DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
};
