
Commit f394576

iommufd: PFN handling for iopt_pages
The top of the data structure provides an IO Address Space (IOAS) that is similar to a VFIO container. The IOAS allows map/unmap of memory into ranges of IOVA called iopt_areas. Multiple IOMMU domains (IO page tables) and in-kernel accesses (like VFIO mdevs) can be attached to the IOAS to access the PFNs that those IOVA areas cover.

The IO Address Space (IOAS) data structure is composed of:
 - struct io_pagetable holding the IOVA map
 - struct iopt_areas representing populated portions of IOVA
 - struct iopt_pages representing the storage of PFNs
 - struct iommu_domain representing each IO page table in the system IOMMU
 - struct iopt_pages_access representing in-kernel accesses of PFNs (i.e. VFIO mdevs)
 - struct xarray pinned_pfns holding a list of pages pinned by in-kernel accesses

This patch introduces the lowest part of the data structure - the movement of PFNs in a tiered storage scheme:
 1) iopt_pages::pinned_pfns xarray
 2) Multiple iommu_domains
 3) The origin of the PFNs, i.e. the userspace pointer

PFNs have to be copied between all combinations of tiers, depending on the configuration.

The interface is an iterator called a 'pfn_reader' which determines in which tier each PFN is stored and loads it into a list of PFNs held in a struct pfn_batch.

Each step of the iterator will fill up the pfn_batch, then the caller can use the pfn_batch to send the PFNs to the required destination. Repeating this loop will read all the PFNs in an IOVA range.

The pfn_reader and pfn_batch also keep track of the pinned page accounting.

While PFNs are always stored and accessed as full PAGE_SIZE units, the iommu_domain tier can store with a sub-page offset/length to support IOMMUs with a smaller IOPTE size than PAGE_SIZE.

Link: https://lore.kernel.org/r/8-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
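To illustrate the iterator pattern described above, here is a minimal sketch of how a caller might walk a range of PFNs with the pfn_reader/pfn_batch pair. The helper names and signatures are assumptions based on the commit message; the real implementation lives in drivers/iommu/iommufd/pages.c, which is not shown in this excerpt.

/* Sketch only: assumed pfn_reader helpers roughly matching the commit
 * message; treat the names and signatures as illustrative, not authoritative. */
static int example_fill_destination(struct iopt_pages *pages,
				    unsigned long start_index,
				    unsigned long last_index)
{
	struct pfn_reader pfns;
	int rc;

	rc = pfn_reader_first(&pfns, pages, start_index, last_index);
	if (rc)
		return rc;

	/* Each step loads a pfn_batch from whichever tier currently holds
	 * the PFNs: the pinned_pfns xarray, another iommu_domain, or the
	 * userspace pointer. */
	while (!pfn_reader_done(&pfns)) {
		/* ... send the current batch to the required destination ... */
		rc = pfn_reader_next(&pfns);
		if (rc)
			break;
	}
	pfn_reader_destroy(&pfns);
	return rc;
}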
1 parent ce5a23c commit f394576

File tree

7 files changed: +1262 -1 lines changed

.clang-format

Lines changed: 1 addition & 0 deletions
@@ -440,6 +440,7 @@ ForEachMacros:
   - 'inet_lhash2_for_each_icsk'
   - 'inet_lhash2_for_each_icsk_continue'
   - 'inet_lhash2_for_each_icsk_rcu'
+  - 'interval_tree_for_each_double_span'
   - 'interval_tree_for_each_span'
   - 'intlist__for_each_entry'
   - 'intlist__for_each_entry_safe'

drivers/iommu/iommufd/Makefile

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
-	main.o
+	main.o \
+	pages.o

 obj-$(CONFIG_IOMMUFD) += iommufd.o

drivers/iommu/iommufd/double_span.h

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
 */
#ifndef __IOMMUFD_DOUBLE_SPAN_H
#define __IOMMUFD_DOUBLE_SPAN_H

#include <linux/interval_tree.h>

/*
 * This is a variation of the general interval_tree_span_iter that computes the
 * spans over the union of two different interval trees. Used ranges are broken
 * up and reported based on the tree that provides the interval. The first span
 * always takes priority. Like interval_tree_span_iter it is greedy and the same
 * value of is_used will not repeat on two iteration cycles.
 */
struct interval_tree_double_span_iter {
	struct rb_root_cached *itrees[2];
	struct interval_tree_span_iter spans[2];
	union {
		unsigned long start_hole;
		unsigned long start_used;
	};
	union {
		unsigned long last_hole;
		unsigned long last_used;
	};
	/* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */
	int is_used;
};

void interval_tree_double_span_iter_update(
	struct interval_tree_double_span_iter *iter);
void interval_tree_double_span_iter_first(
	struct interval_tree_double_span_iter *iter,
	struct rb_root_cached *itree1, struct rb_root_cached *itree2,
	unsigned long first_index, unsigned long last_index);
void interval_tree_double_span_iter_next(
	struct interval_tree_double_span_iter *iter);

static inline bool
interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state)
{
	return state->is_used == -1;
}

#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \
					   last_index)                         \
	for (interval_tree_double_span_iter_first(span, itree1, itree2,       \
						  first_index, last_index);    \
	     !interval_tree_double_span_iter_done(span);                       \
	     interval_tree_double_span_iter_next(span))

#endif
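As a usage illustration (not taken from this patch), a caller can walk the union of two interval trees with the new macro and react to whether each reported span is a hole or belongs to one of the trees. The function below is a hypothetical example that only depends on the header above.

/* Illustrative only: count how many indexes in 0..last_index are covered
 * by either of two interval trees. */
static unsigned long example_count_used(struct rb_root_cached *itree1,
					struct rb_root_cached *itree2,
					unsigned long last_index)
{
	struct interval_tree_double_span_iter span;
	unsigned long used = 0;

	interval_tree_for_each_double_span(&span, itree1, itree2, 0,
					   last_index) {
		/* is_used is 0 for a hole, 1 or 2 for the owning tree */
		if (span.is_used)
			used += span.last_used - span.start_used + 1;
	}
	return used;
}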

drivers/iommu/iommufd/io_pagetable.h

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 */
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H

#include <linux/interval_tree.h>
#include <linux/mutex.h>
#include <linux/kref.h>
#include <linux/xarray.h>

#include "iommufd_private.h"

struct iommu_domain;

/*
 * Each io_pagetable is composed of intervals of areas which cover regions of
 * the iova that are backed by something. iova not covered by areas is not
 * populated in the page table. Each area is fully populated with pages.
 *
 * iovas are in byte units, but must be iopt->iova_alignment aligned.
 *
 * pages can be NULL, this means some other thread is still working on setting
 * up or tearing down the area. When observed under the write side of the
 * domain_rwsem a NULL pages must mean the area is still being setup and no
 * domains are filled.
 *
 * storage_domain points at an arbitrary iommu_domain that is holding the PFNs
 * for this area. It is locked by the pages->mutex. This simplifies the locking
 * as the pages code can rely on the storage_domain without having to get the
 * iopt->domains_rwsem.
 *
 * The io_pagetable::iova_rwsem protects node
 * The iopt_pages::mutex protects pages_node
 * iopt and iommu_prot are immutable
 * The pages::mutex protects num_accesses
 */
struct iopt_area {
	struct interval_tree_node node;
	struct interval_tree_node pages_node;
	struct io_pagetable *iopt;
	struct iopt_pages *pages;
	struct iommu_domain *storage_domain;
	/* How many bytes into the first page the area starts */
	unsigned int page_offset;
	/* IOMMU_READ, IOMMU_WRITE, etc */
	int iommu_prot;
	unsigned int num_accesses;
};

static inline unsigned long iopt_area_index(struct iopt_area *area)
{
	return area->pages_node.start;
}

static inline unsigned long iopt_area_last_index(struct iopt_area *area)
{
	return area->pages_node.last;
}

static inline unsigned long iopt_area_iova(struct iopt_area *area)
{
	return area->node.start;
}

static inline unsigned long iopt_area_last_iova(struct iopt_area *area)
{
	return area->node.last;
}

enum {
	IOPT_PAGES_ACCOUNT_NONE = 0,
	IOPT_PAGES_ACCOUNT_USER = 1,
	IOPT_PAGES_ACCOUNT_MM = 2,
};

/*
 * This holds a pinned page list for multiple areas of IO address space. The
 * pages always originate from a linear chunk of userspace VA. Multiple
 * io_pagetable's, through their iopt_area's, can share a single iopt_pages
 * which avoids multi-pinning and double accounting of page consumption.
 *
 * indexes in this structure are measured in PAGE_SIZE units, are 0 based from
 * the start of the uptr and extend to npages. pages are pinned dynamically
 * according to the intervals in the access_itree and domains_itree, npinned
 * records the current number of pages pinned.
 */
struct iopt_pages {
	struct kref kref;
	struct mutex mutex;
	size_t npages;
	size_t npinned;
	size_t last_npinned;
	struct task_struct *source_task;
	struct mm_struct *source_mm;
	struct user_struct *source_user;
	void __user *uptr;
	bool writable:1;
	u8 account_mode;

	struct xarray pinned_pfns;
	/* Of iopt_pages_access::node */
	struct rb_root_cached access_itree;
	/* Of iopt_area::pages_node */
	struct rb_root_cached domains_itree;
};

#endif
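The index helpers above operate in PAGE_SIZE units while the iova helpers return byte addresses, and page_offset records how far into the first page the area starts. A hedged sketch of how a caller might translate a byte IOVA inside an area into an index into its iopt_pages follows; the helper name is illustrative and not necessarily the one used elsewhere in the driver.

/* Illustrative only: translate a byte IOVA inside an area to the
 * PAGE_SIZE-unit index of the backing iopt_pages, accounting for the
 * sub-page start of the area. */
static inline unsigned long
example_area_iova_to_index(struct iopt_area *area, unsigned long iova)
{
	return iopt_area_index(area) +
	       (iova - iopt_area_iova(area) + area->page_offset) / PAGE_SIZE;
}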

drivers/iommu/iommufd/iommufd_private.h

Lines changed: 24 additions & 0 deletions
@@ -14,6 +14,30 @@ struct iommufd_ctx {
 	struct xarray objects;
 };

+/*
+ * The IOVA to PFN map. The map automatically copies the PFNs into multiple
+ * domains and permits sharing of PFNs between io_pagetable instances. This
+ * supports both a design where IOAS's are 1:1 with a domain (eg because the
+ * domain is HW customized), or where the IOAS is 1:N with multiple generic
+ * domains. The io_pagetable holds an interval tree of iopt_areas which point
+ * to shared iopt_pages which hold the pfns mapped to the page table.
+ *
+ * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex
+ */
+struct io_pagetable {
+	struct rw_semaphore domains_rwsem;
+	struct xarray domains;
+	unsigned int next_domain_id;
+
+	struct rw_semaphore iova_rwsem;
+	struct rb_root_cached area_itree;
+	/* IOVA that cannot become reserved, struct iopt_allowed */
+	struct rb_root_cached allowed_itree;
+	/* IOVA that cannot be allocated, struct iopt_reserved */
+	struct rb_root_cached reserved_itree;
+	u8 disable_large_pages;
+};
+
 struct iommufd_ucmd {
 	struct iommufd_ctx *ictx;
 	void __user *ubuffer;
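The io_pagetable comment above documents the locking order domains_rwsem -> iova_rwsem -> pages::mutex. A minimal sketch of honoring that order while walking the areas follows; the function itself is hypothetical and only uses fields and kernel APIs shown or well known.

/* Illustrative only: visit every area while respecting the documented
 * locking order domains_rwsem -> iova_rwsem -> pages::mutex. */
static void example_for_each_area(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	down_read(&iopt->domains_rwsem);
	down_read(&iopt->iova_rwsem);
	for (node = interval_tree_iter_first(&iopt->area_itree, 0, ULONG_MAX);
	     node; node = interval_tree_iter_next(node, 0, ULONG_MAX)) {
		struct iopt_area *area =
			container_of(node, struct iopt_area, node);

		/* NULL pages means the area is still being set up or torn down */
		if (!area->pages)
			continue;
		mutex_lock(&area->pages->mutex);
		/* ... operate on area->pages ... */
		mutex_unlock(&area->pages->mutex);
	}
	up_read(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
}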
