mm/pte_ref: add support for user PTE page table page allocation
When a PTE page table page is allocated and installed into the pmd
entry, it needs to take an initial reference count to prevent the
PTE page table page from being released by other threads, and the
caller of pte_alloc() (or one of its friends) then needs to drop
this reference count.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Qi Zheng authored and intel-lab-lkp committed Nov 10, 2021
1 parent e034040 commit 6e3cc5b
Showing 9 changed files with 83 additions and 21 deletions.
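
Before the per-file hunks, a minimal sketch of the acquire/release
discipline the message above describes. This is illustrative only and
not part of the patch: pte_try_get(), pte_put(), __pte_alloc() and the
TRYGET_FAILED_* values are taken from the hunks below, while the
enclosing function, its locals and the trimmed error handling are
assumptions made for the example.

#include <linux/mm.h>	/* pulls in linux/pte_ref.h after this patch */

/* Hypothetical caller, for illustration only. */
static int example_pte_walk(struct mm_struct *mm, pmd_t *pmd,
			    unsigned long addr)
{
	enum pte_tryget_type ret;
	spinlock_t *ptl;
	pte_t *pte;

retry:
	ret = pte_try_get(pmd);
	if (ret == TRYGET_FAILED_NONE) {
		/*
		 * No PTE page yet: __pte_alloc() installs one and leaves
		 * the initial reference for this caller to drop.
		 */
		if (__pte_alloc(mm, pmd))
			return -ENOMEM;
	} else if (ret == TRYGET_FAILED_HUGE_PMD) {
		/* A huge pmd was installed instead; nothing to walk here. */
		return 0;
	} else if (ret == TRYGET_FAILED_ZERO) {
		/* Raced with a concurrent free of the PTE page; look again. */
		goto retry;
	}

	/* A reference is now held, so the PTE page cannot be freed under us. */
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* ... read or modify the PTE entry for addr ... */
	pte_unmap_unlock(pte, ptl);

	pte_put(mm, pmd, addr);		/* drop the reference taken above */
	return 0;
}

The per-file changes below apply exactly this pairing to the existing
callers.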
7 changes: 5 additions & 2 deletions include/linux/mm.h
@@ -26,6 +26,7 @@
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/pte_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
@@ -2313,9 +2314,11 @@ enum pmd_installed_type {

static inline int pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
if (unlikely(pmd_none(*(pmd))))
enum pte_tryget_type ret = pte_try_get(pmd);

if (ret == TRYGET_FAILED_NONE || ret == TRYGET_FAILED_ZERO)
return __pte_alloc(mm, pmd);
if (unlikely(is_huge_pmd(*pmd)))
else if (ret == TRYGET_FAILED_HUGE_PMD)
return INSTALLED_HUGE_PMD;

return INSTALLED_PTE;
1 change: 1 addition & 0 deletions mm/debug_vm_pgtable.c
@@ -1048,6 +1048,7 @@ static void __init destroy_args(struct pgtable_debug_args *args)

/* Free page table entries */
if (args->start_ptep) {
pte_put(args->mm, args->start_pmdp, args->vaddr);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
8 changes: 6 additions & 2 deletions mm/filemap.c
@@ -3217,6 +3217,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
}
}

retry:
if (pmd_none(*vmf->pmd)) {
int ret = pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);

@@ -3225,6 +3226,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
/* See comment in handle_pte_fault() */
goto out;
} else if (pte_try_get(vmf->pmd) == TRYGET_FAILED_ZERO) {
goto retry;
}

return false;
@@ -3301,7 +3304,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long addr;
unsigned long addr, start;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
@@ -3317,7 +3320,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}

addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
start = addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
do {
page = find_subpage(head, xas.xa_index);
@@ -3348,6 +3351,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
put_page(head);
} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
pte_unmap_unlock(vmf->pte, vmf->ptl);
pte_put(vma->vm_mm, vmf->pmd, start);
out:
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
10 changes: 7 additions & 3 deletions mm/gup.c
@@ -694,16 +694,20 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
spin_unlock(ptl);
ret = 0;
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
if (pte_try_get(pmd) == TRYGET_FAILED_HUGE_PMD)
ret = -EBUSY;
} else {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
ret = pte_alloc(mm, pmd) < 0 ? -ENOMEM : 0;
}

return ret ? ERR_PTR(ret) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if (ret)
return ERR_PTR(ret);

page = follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
pte_put(mm, pmd, address);
return page;
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
51 changes: 41 additions & 10 deletions mm/memory.c
@@ -441,10 +441,13 @@ enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
pgtable_t *pte)
{
int ret = INSTALLED_PTE;
spinlock_t *ptl = pmd_lock(mm, pmd);
spinlock_t *ptl;

retry:
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
pte_ref_init(*pte, pmd, 1);
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
@@ -464,6 +467,9 @@ enum pmd_installed_type pmd_install(struct mm_struct *mm, pmd_t *pmd,
} else if (is_huge_pmd(*pmd)) {
/* See comment in handle_pte_fault() */
ret = INSTALLED_HUGE_PMD;
} else if (!pte_get_unless_zero(pmd)) {
spin_unlock(ptl);
goto retry;
}
spin_unlock(ptl);

@@ -1028,6 +1034,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct page *prealloc = NULL;
unsigned long start = addr;

again:
progress = 0;
@@ -1108,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
pte_put(dst_mm, dst_pmd, start);
cond_resched();

if (ret == -EIO) {
@@ -1778,6 +1786,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
pte_put(mm, pte_to_pmd(pte), addr);
out:
return retval;
}
@@ -1810,6 +1819,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long remaining_pages_total = *num;
unsigned long pages_to_write_in_pmd;
int ret;
unsigned long start = addr;
more:
ret = -EFAULT;
pmd = walk_to_pmd(mm, addr);
@@ -1836,7 +1846,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
goto put;
}
addr += PAGE_SIZE;
++curr_page_idx;
@@ -1845,9 +1855,13 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
if (remaining_pages_total)
if (remaining_pages_total) {
pte_put(mm, pmd, start);
goto more;
}
ret = 0;
put:
pte_put(mm, pmd, start);
out:
*num = remaining_pages_total;
return ret;
@@ -2075,6 +2089,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,

out_unlock:
pte_unmap_unlock(pte, ptl);
pte_put(mm, pte_to_pmd(pte), addr);
return VM_FAULT_NOPAGE;
}

@@ -2275,6 +2290,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
unsigned long start = addr;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
@@ -2294,6 +2310,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(mapped_pte, ptl);
pte_put(mm, pmd, start);
return err;
}

@@ -2503,6 +2520,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
unsigned long start = addr;
pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
@@ -2536,8 +2554,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

arch_leave_lazy_mmu_mode();

if (mm != &init_mm)
if (mm != &init_mm) {
pte_unmap_unlock(mapped_pte, ptl);
if (create)
pte_put(mm, pmd, start);
}
return err;
}

@@ -3761,7 +3782,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
goto put;
}
goto setpte;
}
@@ -3804,7 +3826,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
goto put;
}

inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
@@ -3817,14 +3840,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
goto put;
release:
put_page(page);
goto unlock;
oom_free_page:
put_page(page);
oom:
return VM_FAULT_OOM;
ret = VM_FAULT_OOM;
put:
pte_put(vma->vm_mm, vmf->pmd, vmf->address);
return ret;
}

/*
@@ -4031,7 +4057,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}

if (pmd_none(*vmf->pmd)) {
retry:
ret = pte_try_get(vmf->pmd);
if (ret == TRYGET_FAILED_NONE) {
int alloc_ret;

if (PageTransCompound(page)) {
@@ -4047,9 +4075,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)

if (unlikely(alloc_ret != INSTALLED_PTE))
return alloc_ret < 0 ? VM_FAULT_OOM : 0;
} else if (pmd_devmap_trans_unstable(vmf->pmd)) {
} else if (ret == TRYGET_FAILED_HUGE_PMD) {
/* See comment in handle_pte_fault() */
return 0;
} else if (ret == TRYGET_FAILED_ZERO) {
goto retry;
}

vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
@@ -4063,6 +4093,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)

update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
pte_put(vma->vm_mm, vmf->pmd, vmf->address);
return ret;
}

9 changes: 6 additions & 3 deletions mm/migrate.c
@@ -2736,9 +2736,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
goto abort;

if (unlikely(anon_vma_prepare(vma)))
goto abort;
goto put;
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
goto abort;
goto put;

/*
* The memory barrier inside __SetPageUptodate makes sure that
@@ -2764,7 +2764,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
* device memory.
*/
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
goto abort;
goto put;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
@@ -2811,11 +2811,14 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}

pte_unmap_unlock(ptep, ptl);
pte_put(mm, pmdp, addr);
*src = MIGRATE_PFN_MIGRATE;
return;

unlock_abort:
pte_unmap_unlock(ptep, ptl);
put:
pte_put(mm, pmdp, addr);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
1 change: 1 addition & 0 deletions mm/mlock.c
@@ -398,6 +398,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
break;
}
pte_unmap_unlock(pte, ptl);
pte_put(vma->vm_mm, pte_to_pmd(pte), start);
return start;
}

1 change: 1 addition & 0 deletions mm/mremap.c
@@ -555,6 +555,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks);
pte_put(new_vma->vm_mm, new_pmd, new_addr);
}

mmu_notifier_invalidate_range_end(&range);
16 changes: 15 additions & 1 deletion mm/userfaultfd.c
@@ -574,6 +574,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,

while (src_addr < src_start + len) {
pmd_t dst_pmdval;
enum pte_tryget_type tryget_type;

BUG_ON(dst_addr >= dst_start + len);

@@ -583,6 +584,14 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
break;
}

again:
/*
* After the management of the PTE page is switched to refcount
* mode, the PTE page may be released by another thread (freed via
* RCU), so take the RCU read lock here to prevent the PTE page
* from being released.
*/
rcu_read_lock();
dst_pmdval = pmd_read_atomic(dst_pmd);
/*
* If the dst_pmd is mapped as THP don't
@@ -593,7 +602,9 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
break;
}

if (unlikely(pmd_none(dst_pmdval))) {
tryget_type = pte_try_get(&dst_pmdval);
rcu_read_unlock();
if (unlikely(tryget_type == TRYGET_FAILED_NONE)) {
int ret = __pte_alloc(dst_mm, dst_pmd);

/*
@@ -607,13 +618,16 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
err = -EFAULT;
break;
}
} else if (unlikely(tryget_type == TRYGET_FAILED_ZERO)) {
goto again;
}

BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));

err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
src_addr, &page, mcopy_mode, wp_copy);
pte_put(dst_mm, dst_pmd, dst_addr);
cond_resched();

if (unlikely(err == -ENOENT)) {
