From c1029d896706bc3c9bd219b051ea6e2e563021a6 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 17 Apr 2020 00:32:29 +0300
Subject: [PATCH 2/2] x86/mm/pat: Restore large pages after fragmentation

Changing the attributes of pages may lead to fragmentation of the
direct mapping over time and, as a result, to performance degradation.

With the current code it is a one-way road: the kernel tries to avoid
splitting large pages, but it never restores them, even if the page
attributes become compatible again.

Any change to the mapping may potentially allow a large page to be
restored.

Hook into the cpa_flush() path to check whether any pages can be
recovered in the PUD_SIZE range around the pages we have just touched.

CPUs don't like[1] having TLB entries of different sizes for the same
memory, but this appears to be okay as long as the entries have
matching attributes[2]. It is therefore critical to flush the TLB
before any subsequent change to the mapping.

Note that we already allow multiple TLB entries of different sizes for
the same memory in the split_large_page() path, so this is not a new
situation.

set_memory_4k() provides a way to use 4k pages on purpose. The kernel
must not remap such pages as large. Re-use one of the software PTE
bits to mark such pages.
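
For illustration, a hypothetical caller that needs a range to stay at
4k granularity (the buffer name and page count below are made up, not
taken from this patch) would mark it explicitly, so the recovery path
leaves it alone:

	/* Sets _PAGE_KERNEL_4K; restore_pmd_page() skips such PTEs. */
	err = set_memory_4k((unsigned long)buf, nr_pages);
	if (err)
		return err;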

[1] See Erratum 383 of AMD Family 10h Processors
[2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@amd.com/

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/pgtable_types.h |   2 +
 arch/x86/mm/pat/set_memory.c         | 191 ++++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 65c2ecd73..eb06df8f3 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -33,6 +33,7 @@
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -55,6 +56,7 @@
 #define _PAGE_PAT_LARGE	(_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K	(_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 #define _PAGE_PKEY_BIT0	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
 #define _PAGE_PKEY_BIT1	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c4aedd00c..04b3e9257 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -340,22 +340,57 @@ static void __cpa_flush_tlb(void *data)
 		__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
 }
 
+static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_restore_large_pages(struct cpa_data *cpa,
+				    struct list_head *pgtables)
+{
+	unsigned long start, addr, end;
+	int i;
+
+	if (cpa->flags & CPA_PAGES_ARRAY) {
+		for (i = 0; i < cpa->numpages; i++)
+			restore_large_pages(__cpa_addr(cpa, i), pgtables);
+		return;
+	}
+
+	start = __cpa_addr(cpa, 0);
+	end = start + PAGE_SIZE * cpa->numpages;
+
+	for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
+		restore_large_pages(addr, pgtables);
+}
+
 static void cpa_flush(struct cpa_data *data, int cache)
 {
+	LIST_HEAD(pgtables);
+	struct page *page, *tmp;
 	struct cpa_data *cpa = data;
 	unsigned int i;
 
 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
+	cpa_restore_large_pages(data, &pgtables);
+
 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		cpa_flush_all(cache);
+		list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+			list_del(&page->lru);
+			__free_page(page);
+		}
 		return;
 	}
 
-	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
+	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling
+			|| !list_empty(&pgtables))
 		flush_tlb_all();
 	else
 		on_each_cpu(__cpa_flush_tlb, cpa, 1);
 
+	list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+	}
+
 	if (!cache)
 		return;
@@ -1071,6 +1105,153 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 	return 0;
 }
 
+static void restore_pmd_page(pmd_t *pmd, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	pgprot_t pgprot;
+	pmd_t _pmd, old_pmd;
+	pte_t *pte, first;
+	int i = 0;
+
+	pte = pte_offset_kernel(pmd, addr);
+	first = *pte;
+
+	/* Make sure alignment is suitable */
+	if (PFN_PHYS(pte_pfn(first)) & ~PMD_MASK)
+		return;
+
+	/* The page is 4k intentionally */
+	if (pte_flags(first) & _PAGE_KERNEL_4K)
+		return;
+
+	/* Check that the rest of the PTEs are compatible with the first one */
+	for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+		pte_t entry = *pte;
+		if (!pte_present(entry))
+			return;
+		if (pte_flags(entry) != pte_flags(first))
+			return;
+		if (pte_pfn(entry) - pte_pfn(first) != i)
+			return;
+	}
+
+	old_pmd = *pmd;
+
+	/* Success: set up a large page */
+	pgprot = pgprot_4k_2_large(pte_pgprot(first));
+	pgprot_val(pgprot) |= _PAGE_PSE;
+	_pmd = pfn_pmd(pte_pfn(first), pgprot);
+	set_pmd(pmd, _pmd);
+
+	/* Queue the page table to be freed after TLB flush */
+	list_add(&pmd_page(old_pmd)->lru, pgtables);
+
+	if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		/* Update all PGD tables to use the same large page */
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+			p4d_t *p4d = p4d_offset(pgd, addr);
+			pud_t *pud = pud_offset(p4d, addr);
+			pmd_t *pmd = pmd_offset(pud, addr);
+			/* Something is wrong if the entries don't match */
+			if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+				continue;
+			set_pmd(pmd, _pmd);
+		}
+	}
+	pr_debug("2M restored at %#lx\n", addr);
+}
+
+static void restore_pud_page(pud_t *pud, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	bool restore_pud = direct_gbpages;
+	pmd_t *pmd, first;
+	int i;
+
+	pmd = pmd_offset(pud, addr);
+	first = *pmd;
+
+	/* Try to restore large page if possible */
+	if (pmd_present(first) && !pmd_large(first)) {
+		restore_pmd_page(pmd, addr, pgtables);
+		first = *pmd;
+	}
+
+	/*
+	 * To restore a PUD page, all PMD entries must be large and
+	 * have suitable alignment.
+	 */
+	if (!pmd_large(first) || (PFN_PHYS(pmd_pfn(first)) & ~PUD_MASK))
+		restore_pud = false;
+
+	/*
+	 * Restore all PMD large pages when possible and track whether we can
+	 * restore the PUD page.
+	 *
+	 * To restore the PUD page, all following PMDs must be compatible with
+	 * the first one.
+	 */
+	for (i = 1, pmd++, addr += PMD_SIZE; i < PTRS_PER_PMD; i++, pmd++, addr += PMD_SIZE) {
+		pmd_t entry = *pmd;
+		if (!pmd_present(entry)) {
+			restore_pud = false;
+			continue;
+		}
+		if (!pmd_large(entry)) {
+			restore_pmd_page(pmd, addr, pgtables);
+			entry = *pmd;
+		}
+		if (!pmd_large(entry))
+			restore_pud = false;
+		if (pmd_flags(entry) != pmd_flags(first))
+			restore_pud = false;
+		if (pmd_pfn(entry) - pmd_pfn(first) != i * PTRS_PER_PTE)
+			restore_pud = false;
+	}
+
+	/* Restore PUD page and queue page table to be freed after TLB flush */
+	if (restore_pud) {
+		list_add(&pud_page(*pud)->lru, pgtables);
+		set_pud(pud, pfn_pud(pmd_pfn(first), pmd_pgprot(first)));
+		pr_debug("1G restored at %#lx\n", addr - PUD_SIZE);
+	}
+}
+
+/*
+ * Restore PMD and PUD pages in the kernel mapping around the address, where
+ * possible.
+ *
+ * The caller must flush the TLB and free the page tables queued on the list
+ * before touching the new entries. The CPU must not see TLB entries of
+ * different sizes with different attributes.
+ */
+static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	addr &= PUD_MASK;
+
+	spin_lock(&pgd_lock);
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		goto out;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		goto out;
+	pud = pud_offset(p4d, addr);
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto out;
+
+	restore_pud_page(pud, addr, pgtables);
+out:
+	spin_unlock(&pgd_lock);
+}
+
 static bool try_to_free_pte_page(pte_t *pte)
 {
 	int i;
@@ -1931,8 +2112,8 @@ int set_memory_np_noalias(unsigned long addr, int numpages)
 
 int set_memory_4k(unsigned long addr, int numpages)
 {
-	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
-					__pgprot(0), 1, 0, NULL);
+	return change_page_attr_set_clr(&addr, numpages,
+					__pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL);
 }
 
 int set_memory_nonglobal(unsigned long addr, int numpages)
--
2.26.1.107.gefe3874640