/
swap_pager.c
2574 lines (2346 loc) · 69.4 KB
/
swap_pager.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* (MPSAFE)
*
* Copyright (c) 1998-2010 The DragonFly Project. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Matthew Dillon <dillon@backplane.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of The DragonFly Project nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific, prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* New Swap System
* Matthew Dillon
*
* Radix Bitmap 'blists'.
*
* - The new swapper uses the new radix bitmap code. This should scale
* to arbitrarily small or arbitrarily large swap spaces and an almost
* arbitrary degree of fragmentation.
*
* Features:
*
* - on the fly reallocation of swap during putpages. The new system
* does not try to keep previously allocated swap blocks for dirty
* pages.
*
* - on the fly deallocation of swap
*
* - No more garbage collection required. Unnecessarily allocated swap
* blocks only exist for dirty vm_page_t's now and these are already
* cycled (in a high-load system) by the pager. We also do on-the-fly
* removal of invalidated swap blocks when a page is destroyed
* or renamed.
*
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
* $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/blist.h>
#include <sys/lock.h>
#include <sys/kcollect.h>
#include <unistd.h>
#include "opt_swap.h"
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>
#include <vm/vnode_pager.h>
#include <sys/thread2.h>
#include <sys/buf2.h>
#include <vm/vm_page2.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER SWB_NPAGES
#endif
#define SWM_FREE 0x02 /* free, period */
#define SWM_POP 0x04 /* pop out */
#define SWBIO_READ 0x01
#define SWBIO_WRITE 0x02
#define SWBIO_SYNC 0x04
#define SWBIO_TTC 0x08 /* for VM_PAGER_TRY_TO_CACHE */
struct swfreeinfo {
vm_object_t object;
vm_pindex_t basei;
vm_pindex_t begi;
vm_pindex_t endi; /* inclusive */
};
struct swswapoffinfo {
vm_object_t object;
int devidx;
int shared;
};
/*
* vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks
* in the old system.
*/
int swap_pager_full; /* swap space exhaustion (task killing) */
int swap_fail_ticks; /* when we became exhausted */
int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
swblk_t vm_swap_cache_use;
swblk_t vm_swap_anon_use;
static int vm_report_swap_allocs;
static int nsw_rcount; /* free read buffers */
static int nsw_wcount_sync; /* limit write buffers / synchronous */
static int nsw_wcount_async; /* limit write buffers / asynchronous */
static int nsw_wcount_async_max;/* assigned maximum */
static int nsw_cluster_max; /* maximum VOP I/O allowed */
struct blist *swapblist;
static int swap_async_max = 4; /* maximum in-progress async I/O's */
static int swap_burst_read = 0; /* allow burst reading */
static swblk_t swapiterator; /* linearize allocations */
int swap_user_async = 0; /* user swap pager operation can be async */
static struct spinlock swapbp_spin = SPINLOCK_INITIALIZER(&swapbp_spin, "swapbp_spin");
/* from vm_swap.c */
extern struct vnode *swapdev_vp;
extern struct swdevt *swdevt;
extern int nswdev;
#define BLK2DEVIDX(blk) (nswdev > 1 ? blk / SWB_DMMAX % nswdev : 0)
SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
/* Fix typo in the sysctl description string ("uuser" -> "user"). */
SYSCTL_INT(_vm, OID_AUTO, swap_user_async,
	CTLFLAG_RW, &swap_user_async, 0, "Allow async user swap write I/O");
#if SWBLK_BITS == 64
SYSCTL_LONG(_vm, OID_AUTO, swap_cache_use,
CTLFLAG_RD, &vm_swap_cache_use, 0, "");
SYSCTL_LONG(_vm, OID_AUTO, swap_anon_use,
CTLFLAG_RD, &vm_swap_anon_use, 0, "");
SYSCTL_LONG(_vm, OID_AUTO, swap_size,
CTLFLAG_RD, &vm_swap_size, 0, "");
#else
SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
CTLFLAG_RD, &vm_swap_cache_use, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
CTLFLAG_RD, &vm_swap_anon_use, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_size,
CTLFLAG_RD, &vm_swap_size, 0, "");
#endif
SYSCTL_INT(_vm, OID_AUTO, report_swap_allocs,
CTLFLAG_RW, &vm_report_swap_allocs, 0, "");
vm_zone_t swap_zone;
/*
* Red-Black tree for swblock entries
*
* The caller must hold vm_token
*/
RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
vm_pindex_t, swb_index);
/*
 * Three-way comparator for the swblock red-black tree, ordering nodes
 * by their starting page index (swb_index).
 *
 * Returns -1, 0, or 1 as swb1 sorts before, equal to, or after swb2.
 */
int
rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
{
	if (swb1->swb_index == swb2->swb_index)
		return(0);
	return((swb1->swb_index < swb2->swb_index) ? -1 : 1);
}
/*
 * Range comparator used by RB_SCAN: classify a swblock against the
 * [basei, endi] window described by the swfreeinfo cookie.
 *
 * Returns -1 below the window, 1 above it, 0 inside it.
 */
static
int
rb_swblock_scancmp(struct swblock *swb, void *data)
{
	struct swfreeinfo *range = data;

	if (swb->swb_index > range->endi)
		return(1);
	return((swb->swb_index < range->basei) ? -1 : 0);
}
static
int
rb_swblock_condcmp(struct swblock *swb, void *data)
{
struct swfreeinfo *info = data;
if (swb->swb_index < info->basei)
return(-1);
return(0);
}
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
* calls hooked from other parts of the VM system and do not appear here.
* (see vm/swap_pager.h).
*/
static void swap_pager_dealloc (vm_object_t object);
static int swap_pager_getpage (vm_object_t, vm_page_t *, int);
static void swap_chain_iodone(struct bio *biox);
struct pagerops swappagerops = {
swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
swap_pager_getpage, /* pagein */
swap_pager_putpages, /* pageout */
swap_pager_haspage /* get backing store status for page */
};
/*
* SWB_DMMAX is in page-sized chunks with the new swap system. It was
* dev-bsized chunks in the old. SWB_DMMAX is always a power of 2.
*
* swap_*() routines are externally accessible. swp_*() routines are
* internal.
*/
int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
static __inline void swp_sizecheck (void);
static void swp_pager_async_iodone (struct bio *bio);
/*
* Swap bitmap functions
*/
static __inline void swp_pager_freeswapspace(vm_object_t object,
swblk_t blk, int npages);
static __inline swblk_t swp_pager_getswapspace(vm_object_t object, int npages);
/*
* Metadata functions
*/
static void swp_pager_meta_convert(vm_object_t);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
static void swp_pager_meta_free_all(vm_object_t);
static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
* update the swap_pager_almost_full indication and warn when we are
* about to run out of swap space, using lowat/hiwat hysteresis.
*
* Clear swap_pager_full ( task killing ) indication when lowat is met.
*
* No restrictions on call
* This routine may not block.
* SMP races are ok.
*/
/*
 * Recalculate the almost-full/full hysteresis state from the current
 * vm_swap_size.  Warns once when free swap first drops below the low
 * water mark; clears the hard-full indication whenever we are at or
 * above it, and the almost-full indication above the high water mark.
 */
static __inline void
swp_sizecheck(void)
{
	if (vm_swap_size >= nswap_lowat) {
		swap_pager_full = 0;
		if (vm_swap_size > nswap_hiwat)
			swap_pager_almost_full = 0;
	} else if (swap_pager_almost_full == 0) {
		kprintf("swap_pager: out of swap space\n");
		swap_pager_almost_full = 1;
		swap_fail_ticks = ticks;
	}
}
/*
* Long-term data collection on 10-second interval. Return the value
* for KCOLLECT_SWAPPCT and set the values for SWAPANO and SWAPCCAC.
*
* Return total swap in the scale field. This can change if swap is
* regularly added or removed and may cause some historical confusion
* in that case, but SWAPPCT will always be historically accurate.
*/
#define PTOB(value) ((uint64_t)(value) << PAGE_SHIFT)
/*
 * kcollect callback: publish the anonymous and swapcache usage values
 * (in bytes) and their scales, then return overall swap utilization in
 * units of 0.01% (rounded).
 */
static uint64_t
collect_swap_callback(int n)
{
	uint64_t anon = vm_swap_anon_use;
	uint64_t cache = vm_swap_cache_use;
	uint64_t total = vm_swap_max;

	if (total == 0)		/* guard against divide by zero */
		total = 1;

	kcollect_setvalue(KCOLLECT_SWAPANO, PTOB(anon));
	kcollect_setvalue(KCOLLECT_SWAPCAC, PTOB(cache));
	kcollect_setscale(KCOLLECT_SWAPANO,
			  KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, PTOB(total)));
	kcollect_setscale(KCOLLECT_SWAPCAC,
			  KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, PTOB(total)));

	return (((anon + cache) * 10000 + (total >> 1)) / total);
}
/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
* before much else so be careful what you depend on. Most of the VM
* system has yet to be initialized at this point.
*
* Called from the low level boot code only.
*/
static void
swap_pager_init(void *arg __unused)
{
	/*
	 * Register the long-term kcollect statistics: the overall swap
	 * usage percentage (collect_swap_callback computes it on each
	 * 10-second sample), plus the anonymous and swapcache breakdowns
	 * whose values are pushed by that same callback (hence the NULL
	 * callback pointers here).
	 */
	kcollect_register(KCOLLECT_SWAPPCT, "swapuse", collect_swap_callback,
			  KCOLLECT_SCALE(KCOLLECT_SWAPPCT_FORMAT, 0));
	kcollect_register(KCOLLECT_SWAPANO, "swapano", NULL,
			  KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, 0));
	kcollect_register(KCOLLECT_SWAPCAC, "swapcac", NULL,
			  KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, 0));
}
SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL);
/*
* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
*
* Expected to be started from pageout process once, prior to entering
* its main loop.
*
* Called from the low level boot code only.
*/
void
swap_pager_swap_init(void)
{
	int n, n2;

	/*
	 * Number of in-transit swap bp operations.  Don't
	 * exhaust the pbufs completely.  Make sure we
	 * initialize workable values (0 will work for hysteresis
	 * but it isn't very efficient).
	 *
	 * The nsw_cluster_max is constrained by the number of pages an XIO
	 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
	 * MAX_PAGEOUT_CLUSTER.  Also be aware that swap ops are
	 * constrained by the swap device interleave stripe size.
	 *
	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
	 * designed to prevent other I/O from having high latencies due to
	 * our pageout I/O.  The value 4 works well for one or two active swap
	 * devices but is probably a little low if you have more.  Even so,
	 * a higher value would probably generate only a limited improvement
	 * with three or four active swap devices since the system does not
	 * typically have to pageout at extreme bandwidths.  We will want
	 * at least 2 per swap devices, and 4 is a pretty good value if you
	 * have one NFS swap device due to the command/ack latency over NFS.
	 * So it all works out pretty well.
	 */
	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);

	nsw_rcount = (nswbuf_kva + 1) / 2;
	nsw_wcount_sync = (nswbuf_kva + 3) / 4;
	nsw_wcount_async = 4;
	nsw_wcount_async_max = nsw_wcount_async;

	/*
	 * The zone is dynamically allocated so generally size it to
	 * maxswzone (32MB to 256GB of KVM).  Set a minimum size based
	 * on physical memory of around 8x (each swblock can hold 16 pages).
	 *
	 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
	 * has increased dramatically.
	 */
	n = vmstats.v_page_count / 2;
	if (maxswzone && n < maxswzone / sizeof(struct swblock))
		n = maxswzone / sizeof(struct swblock);
	n2 = n;		/* remember the initial target for the report below */

	/*
	 * Try to create the zone; on failure shrink the request by one
	 * third each pass until it succeeds or we give up entirely.
	 */
	do {
		swap_zone = zinit(
			"SWAPMETA",
			sizeof(struct swblock),
			n,
			ZONE_INTERRUPT);
		if (swap_zone != NULL)
			break;
		/*
		 * if the allocation failed, try a zone two thirds the
		 * size of the previous attempt.
		 */
		n -= ((n + 2) / 3);
	} while (n > 0);

	if (swap_zone == NULL)
		panic("swap_pager_swap_init: swap_zone == NULL");
	if (n2 != n)
		kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
}
/*
* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
* its metadata structures.
*
* This routine is called from the mmap and fork code to create a new
* OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
* and then converting it with swp_pager_meta_convert().
*
* We only support unnamed objects.
*
* No restrictions.
*/
/*
 * Allocate an anonymous (unnamed) swap-backed VM object covering
 * (offset + size) bytes, rounded up to whole pages.  The object is
 * created as OBJT_DEFAULT and immediately converted to OBJT_SWAP.
 *
 * Returns the new object with its temporary hold already dropped.
 */
vm_object_t
swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
{
	vm_object_t obj;

	KKASSERT(handle == NULL);	/* named objects are not supported */
	obj = vm_object_allocate_hold(OBJT_DEFAULT,
				      OFF_TO_IDX(offset + PAGE_MASK + size));
	swp_pager_meta_convert(obj);
	vm_object_drop(obj);

	return (obj);
}
/*
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
*
* The swap backing for the object is destroyed. The code is
* designed such that we can reinstantiate it later, but this
* routine is typically called only when the entire object is
* about to be destroyed.
*
* The object must be locked or unreferenceable.
* No other requirements.
*/
static void
swap_pager_dealloc(vm_object_t object)
{
	vm_object_hold(object);
	/* let any in-flight paging operations on the object drain first */
	vm_object_pip_wait(object, "swpdea");

	/*
	 * Free all remaining metadata.  We only bother to free it from
	 * the swap meta data.  We do not attempt to free swapblk's still
	 * associated with vm_page_t's for this object.  We do not care
	 * if paging is still in progress on some objects.
	 */
	swp_pager_meta_free_all(object);
	vm_object_drop(object);
}
/************************************************************************
* SWAP PAGER BITMAP ROUTINES *
************************************************************************/
/*
* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
*
* Allocate swap for the requested number of pages. The starting
* swap block number (a page index) is returned or SWAPBLK_NONE
* if the allocation failed.
*
* Also has the side effect of advising that somebody made a mistake
* when they configured swap and didn't configure enough.
*
* The caller must hold the object.
* This routine may not block.
*/
static __inline swblk_t
swp_pager_getswapspace(vm_object_t object, int npages)
{
	swblk_t blk;

	lwkt_gettoken(&vm_token);
	/*
	 * First try allocating from the iterator hint (to linearize
	 * allocations), then retry from the beginning of the bitmap.
	 */
	blk = blist_allocat(swapblist, npages, swapiterator);
	if (blk == SWAPBLK_NONE)
		blk = blist_allocat(swapblist, npages, 0);
	if (blk == SWAPBLK_NONE) {
		/*
		 * Out of swap.  Complain only on the first failure
		 * (swap_pager_full == 2 suppresses repeats) and raise
		 * the almost-full/full indications for the pageout code.
		 */
		if (swap_pager_full != 2) {
			if (vm_swap_max == 0)
				kprintf("Warning: The system would like to "
					"page to swap but no swap space "
					"is configured!\n");
			else
				kprintf("swap_pager_getswapspace: "
					"swap full allocating %d pages\n",
					npages);
			swap_pager_full = 2;
			if (swap_pager_almost_full == 0)
				swap_fail_ticks = ticks;
			swap_pager_almost_full = 1;
		}
	} else {
		/* swapiterator = blk; disable for now, doesn't work well */
		/* charge the space to the device and per-type counters */
		swapacctspace(blk, -npages);
		if (object->type == OBJT_SWAP)
			vm_swap_anon_use += npages;
		else
			vm_swap_cache_use += npages;
		swp_sizecheck();
	}
	lwkt_reltoken(&vm_token);
	return(blk);
}
/*
* SWP_PAGER_FREESWAPSPACE() - free raw swap space
*
* This routine returns the specified swap blocks back to the bitmap.
*
* Note: This routine may not block (it could in the old swap code),
* and through the use of the new blist routines it does not block.
*
* This routine may not block.
*/
static __inline void
swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
{
	/* device the blocks belong to, derived from the interleave stripe */
	struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];

	lwkt_gettoken(&vm_token);
	sp->sw_nused -= npages;
	if (object->type == OBJT_SWAP)
		vm_swap_anon_use -= npages;
	else
		vm_swap_cache_use -= npages;

	/*
	 * If the device is being taken offline do not return the blocks
	 * to the bitmap; only the usage accounting above is adjusted.
	 */
	if (sp->sw_flags & SW_CLOSING) {
		lwkt_reltoken(&vm_token);
		return;
	}

	blist_free(swapblist, blk, npages);
	vm_swap_size += npages;
	swp_sizecheck();
	lwkt_reltoken(&vm_token);
}
/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
* This is a globally accessible routine.
*
* This routine removes swapblk assignments from swap metadata.
*
* The external callers of this routine typically have already destroyed
* or renamed vm_page_t's associated with this range in the object so
* we should be ok.
*
* No requirements.
*/
void
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
{
	/* hold the object to interlock against concurrent metadata ops */
	vm_object_hold(object);
	swp_pager_meta_free(object, start, size);
	vm_object_drop(object);
}
/*
* No requirements.
*/
void
swap_pager_freespace_all(vm_object_t object)
{
	/* hold the object to interlock against concurrent metadata ops */
	vm_object_hold(object);
	swp_pager_meta_free_all(object);
	vm_object_drop(object);
}
/*
* This function conditionally frees swap cache swap starting at
* (*basei) in the object. (count) swap blocks will be nominally freed.
* The actual number of blocks freed can be more or less than the
* requested number.
*
* This function nominally returns the number of blocks freed. However,
* the actual number of blocks freed may be less then the returned value.
* If the function is unable to exhaust the object or if it is able to
* free (approximately) the requested number of blocks it returns
* a value n > count.
*
* If we exhaust the object we will return a value n <= count.
*
* The caller must hold the object.
*
* WARNING! If count == 0 then -1 can be returned as a degenerate case,
* callers should always pass a count value > 0.
*/
static int swap_pager_condfree_callback(struct swblock *swap, void *data);
/*
 * Conditionally free swapcache swap in the object starting at *basei,
 * advancing *basei past the scanned region.  Nominally frees (count)
 * swap pages; the scan also stops after count*8 swblocks.
 *
 * Returns the larger of (pages freed) and (swblocks scanned), with a
 * minimum of 1, so the caller can meter its progress.
 */
int
swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
{
	struct swfreeinfo info;
	int npages;
	int nblocks;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	info.object = object;
	info.basei = *basei;		/* skip up to this page index */
	info.begi = count;		/* max swap pages to destroy */
	info.endi = count * 8;		/* max swblocks to scan */

	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
				swap_pager_condfree_callback, &info);
	*basei = info.basei;

	/*
	 * Take the higher difference swblocks vs pages
	 */
	npages = count - (int)info.begi;
	nblocks = count * 8 - (int)info.endi;
	if (npages < nblocks)
		npages = nblocks;
	if (npages < 1)
		npages = 1;
	return(npages);
}
/*
* The idea is to free whole meta-block to avoid fragmenting
* the swap space or disk I/O. We only do this if NO VM pages
* are present.
*
* We do not have to deal with clearing PG_SWAPPED in related VM
* pages because there are no related VM pages.
*
* The caller must hold the object.
*/
static int
swap_pager_condfree_callback(struct swblock *swap, void *data)
{
struct swfreeinfo *info = data;
vm_object_t object = info->object;
int i;
for (i = 0; i < SWAP_META_PAGES; ++i) {
if (vm_page_lookup(object, swap->swb_index + i))
break;
}
info->basei = swap->swb_index + SWAP_META_PAGES;
if (i == SWAP_META_PAGES) {
info->begi -= swap->swb_count;
swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
}
--info->endi;
if ((int)info->begi < 0 || (int)info->endi < 0)
return(-1);
lwkt_yield();
return(0);
}
/*
* Called by vm_page_alloc() when a new VM page is inserted
* into a VM object. Checks whether swap has been assigned to
* the page and sets PG_SWAPPED as necessary.
*
* (m) must be busied by caller and remains busied on return.
*/
/*
 * Called when a new VM page is inserted into an object.  If the object
 * has swap metadata and swap is assigned to the page's index, mark the
 * page PG_SWAPPED.
 *
 * (m) must be busied by caller and remains busied on return.
 */
void
swap_pager_page_inserted(vm_page_t m)
{
	if (m->object->swblock_count == 0)
		return;
	vm_object_hold(m->object);
	if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
		vm_page_flag_set(m, PG_SWAPPED);
	vm_object_drop(m->object);
}
/*
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
*
* Assigns swap blocks to the specified range within the object. The
* swap blocks are not zerod. Any previous swap assignment is destroyed.
*
* Returns 0 on success, -1 on failure.
*
* The caller is responsible for avoiding races in the specified range.
* No other requirements.
*/
int
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
	int n = 0;			/* blocks remaining in current run */
	swblk_t blk = SWAPBLK_NONE;	/* next swap block in current run */
	vm_pindex_t beg = start;	/* save start index */

	vm_object_hold(object);
	while (size) {
		if (n == 0) {
			/*
			 * Grab a new contiguous run of swap, halving the
			 * request on each failure.  If we cannot get even
			 * one block, undo everything assigned so far and
			 * fail.
			 */
			n = BLIST_MAX_ALLOC;
			while ((blk = swp_pager_getswapspace(object, n)) ==
			       SWAPBLK_NONE)
			{
				n >>= 1;
				if (n == 0) {
					swp_pager_meta_free(object, beg,
							    start - beg);
					vm_object_drop(object);
					return(-1);
				}
			}
		}
		/* assign the next block of the run to the next page index */
		swp_pager_meta_build(object, start, blk);
		--size;
		++start;
		++blk;
		--n;
	}
	/* release the unused tail of the final run */
	swp_pager_meta_free(object, start, n);
	vm_object_drop(object);
	return(0);
}
/*
* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
* and destroy the source.
*
* Copy any valid swapblks from the source to the destination. In
* cases where both the source and destination have a valid swapblk,
* we keep the destination's.
*
* This routine is allowed to block. It may block allocating metadata
* indirectly through swp_pager_meta_build() or if paging is still in
* progress on the source.
*
* XXX vm_page_collapse() kinda expects us not to block because we
* supposedly do not need to allocate memory, but for the moment we
* *may* have to get a little memory from the zone allocator, but
* it is taken from the interrupt memory. We should be ok.
*
* The source object contains no vm_page_t's (which is just as well)
* The source object is of type OBJT_SWAP.
*
* The source and destination objects must be held by the caller.
*/
void
swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
		vm_pindex_t base_index, int destroysource)
{
	vm_pindex_t i;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(srcobject));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(dstobject));

	/*
	 * transfer source to destination.  Source index (base_index + i)
	 * maps to destination index i.
	 */
	for (i = 0; i < dstobject->size; ++i) {
		swblk_t dstaddr;

		/*
		 * Locate (without changing) the swapblk on the destination,
		 * unless it is invalid in which case free it silently, or
		 * if the destination is a resident page, in which case the
		 * source is thrown away.
		 */
		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);

		if (dstaddr == SWAPBLK_NONE) {
			/*
			 * Destination has no swapblk and is not resident,
			 * copy source.  SWM_POP removes the block from the
			 * source metadata without freeing it so ownership
			 * transfers to the destination.
			 */
			swblk_t srcaddr;

			srcaddr = swp_pager_meta_ctl(srcobject,
						     base_index + i, SWM_POP);

			if (srcaddr != SWAPBLK_NONE)
				swp_pager_meta_build(dstobject, i, srcaddr);
		} else {
			/*
			 * Destination has valid swapblk or it is represented
			 * by a resident page.  We destroy the sourceblock.
			 */
			swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
		}
	}

	/*
	 * Free left over swap blocks in source.
	 *
	 * We have to revert the type to OBJT_DEFAULT so we do not accidently
	 * double-remove the object from the swap queues.
	 */
	if (destroysource) {
		/*
		 * Reverting the type is not necessary, the caller is going
		 * to destroy srcobject directly, but I'm doing it here
		 * for consistency since we've removed the object from its
		 * queues.
		 */
		swp_pager_meta_free_all(srcobject);
		if (srcobject->type == OBJT_SWAP)
			srcobject->type = OBJT_DEFAULT;
	}
}
/*
* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
* the requested page.
*
* We determine whether good backing store exists for the requested
* page and return TRUE if it does, FALSE if it doesn't.
*
* If TRUE, we also try to determine how much valid, contiguous backing
* store exists before and after the requested page within a reasonable
* distance. We do not try to restrict it to the swap device stripe
* (that is handled in getpages/putpages). It probably isn't worth
* doing here.
*
* No requirements.
*/
/*
 * Report whether good swap backing store exists for the page at
 * (pindex) in (object).  Returns TRUE if a swap block is assigned,
 * FALSE otherwise.
 */
boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
{
	boolean_t res;

	vm_object_hold(object);
	res = (swp_pager_meta_ctl(object, pindex, 0) != SWAPBLK_NONE);
	vm_object_drop(object);

	return (res);
}
/*
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
*
* This removes any associated swap backing store, whether valid or
* not, from the page. This operates on any VM object, not just OBJT_SWAP
* objects.
*
* This routine is typically called when a page is made dirty, at
* which point any associated swap can be freed. MADV_FREE also
* calls us in a special-case situation
*
* NOTE!!! If the page is clean and the swap was valid, the caller
* should make the page dirty before calling this routine.
* This routine does NOT change the m->dirty status of the page.
* Also: MADV_FREE depends on it.
*
* The page must be busied.
* The caller can hold the object to avoid blocking, else we might block.
* No other requirements.
*/
/*
 * Remove any swap backing store associated with the page, valid or
 * not, and clear PG_SWAPPED.  Does not touch m->dirty (MADV_FREE
 * depends on that).
 *
 * The page must be busied.
 */
void
swap_pager_unswapped(vm_page_t m)
{
	if ((m->flags & PG_SWAPPED) == 0)
		return;
	vm_object_hold(m->object);
	KKASSERT(m->flags & PG_SWAPPED);
	swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
	vm_page_flag_clear(m, PG_SWAPPED);
	vm_object_drop(m->object);
}
/*
* SWAP_PAGER_STRATEGY() - read, write, free blocks
*
* This implements a VM OBJECT strategy function using swap backing store.
* This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
* types. Only BUF_CMD_{READ,WRITE,FREEBLKS} is supported, any other
* requests will return EINVAL.
*
* This is intended to be a cacheless interface (i.e. caching occurs at
* higher levels), and is also used as a swap-based SSD cache for vnode
* and device objects.
*
* All I/O goes directly to and from the swap device.
*
* We currently attempt to run I/O synchronously or asynchronously as
* the caller requests. This isn't perfect because we loose error
* sequencing when we run multiple ops in parallel to satisfy a request.
* But this is swap, so we let it all hang out.
*
* NOTE: This function supports the KVABIO API wherein bp->b_data might
* not be synchronized to the current cpu.
*
* No requirements.
*/
void
swap_pager_strategy(vm_object_t object, struct bio *bio)
{
struct buf *bp = bio->bio_buf;
struct bio *nbio;
vm_pindex_t start;
vm_pindex_t biox_blkno = 0;
int count;
char *data;
struct bio *biox;
struct buf *bufx;
#if 0
struct bio_track *track;
#endif
#if 0
/*
* tracking for swapdev vnode I/Os
*/
if (bp->b_cmd == BUF_CMD_READ)
track = &swapdev_vp->v_track_read;
else
track = &swapdev_vp->v_track_write;
#endif
/*
* Only supported commands
*/
if (bp->b_cmd != BUF_CMD_FREEBLKS &&
bp->b_cmd != BUF_CMD_READ &&
bp->b_cmd != BUF_CMD_WRITE) {
bp->b_error = EINVAL;
bp->b_flags |= B_ERROR | B_INVAL;
biodone(bio);
return;
}
/*
* bcount must be an integral number of pages.
*/
if (bp->b_bcount & PAGE_MASK) {
bp->b_error = EINVAL;
bp->b_flags |= B_ERROR | B_INVAL;
biodone(bio);
kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
"not page bounded\n",