/*
* Copyright (c) 2012 The DragonFly Project. All rights reserved.
* Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Matthew Dillon <dillon@backplane.com>,
* by Mihai Carabas <mihai.carabas@gmail.com>
* and many others.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of The DragonFly Project nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific, prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>
#include <sys/ktr.h>
#include <machine/cpu.h>
#include <machine/smp.h>
/*
* Priorities. Note that with 32 run queues per scheduler each queue
* represents four priority levels.
*/
int dfly_rebalanced;
#define MAXPRI 128
#define PRIMASK (MAXPRI - 1)
#define PRIBASE_REALTIME 0
#define PRIBASE_NORMAL MAXPRI
#define PRIBASE_IDLE (MAXPRI * 2)
#define PRIBASE_THREAD (MAXPRI * 3)
#define PRIBASE_NULL (MAXPRI * 4)
#define NQS 32 /* 32 run queues. */
#define PPQ (MAXPRI / NQS) /* priorities per queue */
#define PPQMASK (PPQ - 1)
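/*
 * Worked example: with MAXPRI = 128 and NQS = 32, PPQ = 4, so a
 * normal-class priority p selects run queue (p & PRIMASK) / PPQ;
 * priorities 0-3 share queue 0, 4-7 share queue 1, and so on.
 */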
/*
* NICEPPQ - number of nice units per priority queue
* ESTCPUPPQ - number of estcpu units per priority queue
* ESTCPUMAX - number of estcpu units
*/
#define NICEPPQ 2
#define ESTCPUPPQ 512
#define ESTCPUMAX (ESTCPUPPQ * NQS)
#define BATCHMAX (ESTCPUFREQ * 30)
#define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1)
#define ESTCPULIM(v) min((v), ESTCPUMAX)
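/*
 * e.g. ESTCPUMAX = ESTCPUPPQ * NQS = 512 * 32 = 16384 estcpu units,
 * clamped by ESTCPULIM(); accumulating ESTCPUPPQ units of estcpu
 * corresponds to one priority queue's worth of dynamic priority.
 */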
TAILQ_HEAD(rq, lwp);
#define lwp_priority lwp_usdata.dfly.priority
#define lwp_forked lwp_usdata.dfly.forked
#define lwp_rqindex lwp_usdata.dfly.rqindex
#define lwp_estcpu lwp_usdata.dfly.estcpu
#define lwp_estfast lwp_usdata.dfly.estfast
#define lwp_uload lwp_usdata.dfly.uload
#define lwp_rqtype lwp_usdata.dfly.rqtype
#define lwp_qcpu lwp_usdata.dfly.qcpu
#define lwp_rrcount lwp_usdata.dfly.rrcount
struct usched_dfly_pcpu {
	struct spinlock spin;		/* protects this per-cpu structure */
	struct thread	helper_thread;	/* scheduler helper for this cpu */
	short		unused01;	/* explicit pad */
	short		upri;		/* priority of designated user lwp */
	int		uload;		/* aggregate load of runnable lwps */
	int		ucount;		/* number of accounted runnable lwps */
	struct lwp	*uschedcp;	/* currently designated user lwp */
	struct rq	queues[NQS];	/* normal-class run queues */
	struct rq	rtqueues[NQS];	/* realtime-class run queues */
	struct rq	idqueues[NQS];	/* idle-class run queues */
	u_int32_t	queuebits;	/* bitmask of non-empty queues */
	u_int32_t	rtqueuebits;
	u_int32_t	idqueuebits;
	int		runqcount;	/* threads on the run queues */
	int		cpuid;
	cpumask_t	cpumask;
#ifdef SMP
	cpu_node_t	*cpunode;	/* cpu topology node */
#endif
};
typedef struct usched_dfly_pcpu *dfly_pcpu_t;
static void dfly_acquire_curproc(struct lwp *lp);
static void dfly_release_curproc(struct lwp *lp);
static void dfly_select_curproc(globaldata_t gd);
static void dfly_setrunqueue(struct lwp *lp);
static void dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp);
static void dfly_schedulerclock(struct lwp *lp, sysclock_t period,
sysclock_t cpstamp);
static void dfly_recalculate_estcpu(struct lwp *lp);
static void dfly_resetpriority(struct lwp *lp);
static void dfly_forking(struct lwp *plp, struct lwp *lp);
static void dfly_exiting(struct lwp *lp, struct proc *);
static void dfly_uload_update(struct lwp *lp);
static void dfly_yield(struct lwp *lp);
#ifdef SMP
static void dfly_changeqcpu_locked(struct lwp *lp,
dfly_pcpu_t dd, dfly_pcpu_t rdd);
static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
#endif
#ifdef SMP
static void dfly_need_user_resched_remote(void *dummy);
#endif
static struct lwp *dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
struct lwp *chklp, int worst);
static void dfly_remrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
static void dfly_setrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
struct usched usched_dfly = {
{ NULL },
"dfly", "Original DragonFly Scheduler",
NULL, /* default registration */
NULL, /* default deregistration */
dfly_acquire_curproc,
dfly_release_curproc,
dfly_setrunqueue,
dfly_schedulerclock,
dfly_recalculate_estcpu,
dfly_resetpriority,
dfly_forking,
dfly_exiting,
dfly_uload_update,
NULL, /* setcpumask not supported */
dfly_yield
};
/*
* We have NQS (32) run queues per scheduling class. For the normal
* class, there are 128 priorities scaled onto these 32 queues. New
* processes are added to the last entry in each queue, and processes
* are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
* index. When a queue has something in it, the corresponding bit is
* set in the queuebits variable, allowing a single read to determine
* the state of all 32 queues and then a ffs() to find the first busy
* queue.
*/
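/*
 * A minimal sketch of the bitmask lookup described above, where ffs()
 * locates the first busy queue (illustrative only; the real selection
 * logic, including cross-class priority comparisons, lives in
 * dfly_chooseproc_locked()):
 *
 *	if (dd->queuebits) {
 *		rqi = ffs(dd->queuebits) - 1;
 *		lp = TAILQ_FIRST(&dd->queues[rqi]);
 *	}
 */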
static cpumask_t dfly_curprocmask = -1; /* currently running a user process */
static cpumask_t dfly_rdyprocmask; /* ready to accept a user process */
#ifdef SMP
static volatile int dfly_scancpu;
#endif
static volatile int dfly_ucount; /* total running on whole system */
static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
static struct sysctl_oid *usched_dfly_sysctl_tree;
/* Debug info exposed through debug.* sysctl */
static int usched_dfly_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, dfly_scdebug, CTLFLAG_RW,
&usched_dfly_debug, 0,
"Print debug information for this pid");
static int usched_dfly_pid_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, dfly_pid_debug, CTLFLAG_RW,
&usched_dfly_pid_debug, 0,
"Print KTR debug information for this pid");
static int usched_dfly_chooser = 0;
SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
&usched_dfly_chooser, 0,
"Print KTR debug information for this pid");
/*
 * Tuning usched_dfly - configurable through kern.usched_dfly.
*
* weight1 - Tries to keep threads on their current cpu. If you
* make this value too large the scheduler will not be
* able to load-balance large loads.
*
* weight2 - If non-zero, detects thread pairs undergoing synchronous
* communications and tries to move them closer together.
* Behavior is adjusted by bit 4 of features (0x10).
*
* WARNING! Weight2 is a ridiculously sensitive parameter,
* a small value is recommended.
*
* weight3 - Weighting based on the number of recently runnable threads
* on the userland scheduling queue (ignoring their loads).
* A nominal value here prevents high-priority (low-load)
* threads from accumulating on one cpu core when other
* cores are available.
*
* This value should be left fairly small relative to weight1
* and weight4.
*
* weight4 - Weighting based on other cpu queues being available
* or running processes with higher lwp_priority's.
*
* This allows a thread to migrate to another nearby cpu if it
* is unable to run on the current cpu based on the other cpu
* being idle or running a lower priority (higher lwp_priority)
 *	     thread.  This value should be large enough to override weight1.
*
* features - These flags can be set or cleared to enable or disable various
* features.
*
* 0x01 Enable idle-cpu pulling (default)
* 0x02 Enable proactive pushing (default)
* 0x04 Enable rebalancing rover (default)
* 0x08 Enable more proactive pushing (default)
* 0x10 (flip weight2 limit on same cpu) (default)
* 0x20 choose best cpu for forked process
* 0x40 choose current cpu for forked process
* 0x80 choose random cpu for forked process (default)
*/
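/*
 * Example usage from a root shell (values illustrative only, not
 * recommendations; node names assume the kern.usched_dfly.* sysctls
 * registered later in this file):
 *
 *	sysctl kern.usched_dfly.weight2=120
 *	sysctl kern.usched_dfly.features=0x8f
 */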
#ifdef SMP
static int usched_dfly_smt = 0;
static int usched_dfly_cache_coherent = 0;
static int usched_dfly_weight1 = 200; /* keep thread on current cpu */
static int usched_dfly_weight2 = 180; /* synchronous peer's current cpu */
static int usched_dfly_weight3 = 40; /* number of threads on queue */
static int usched_dfly_weight4 = 160; /* availability of idle cores */
static int usched_dfly_fast_resched = 0; /* delta priority / resched */
static int usched_dfly_features = 0x8F; /* allow pulls */
static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */
#endif
static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
static int usched_dfly_decay = 8;
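/*
 * rrinterval is ESTCPUFREQ/10 rounded up, i.e. one forced round-robin
 * roughly every 1/10 second (see dfly_schedulerclock()); decay feeds
 * the estcpu moving average in dfly_recalculate_estcpu().
 */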
/* KTR debug printings */
KTR_INFO_MASTER(usched);
#if !defined(KTR_USCHED_DFLY)
#define KTR_USCHED_DFLY KTR_ALL
#endif
KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
"USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
pid_t pid, int old_cpuid, int curr);
/*
* This function is called when the kernel intends to return to userland.
* It is responsible for making the thread the current designated userland
* thread for this cpu, blocking if necessary.
*
* The kernel will not depress our LWKT priority until after we return,
* in case we have to shove over to another cpu.
*
* We must determine our thread's disposition before we switch away. This
* is very sensitive code.
*
* WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
* TO ANOTHER CPU! Because most of the kernel assumes that no migration will
* occur, this function is called only under very controlled circumstances.
*/
static void
dfly_acquire_curproc(struct lwp *lp)
{
globaldata_t gd;
dfly_pcpu_t dd;
#ifdef SMP
dfly_pcpu_t rdd;
#endif
thread_t td;
int force_resched;
/*
* Make sure we aren't sitting on a tsleep queue.
*/
td = lp->lwp_thread;
crit_enter_quick(td);
if (td->td_flags & TDF_TSLEEPQ)
tsleep_remove(td);
dfly_recalculate_estcpu(lp);
gd = mycpu;
dd = &dfly_pcpu[gd->gd_cpuid];
/*
* Process any pending interrupts/ipi's, then handle reschedule
* requests. dfly_release_curproc() will try to assign a new
* uschedcp that isn't us and otherwise NULL it out.
*/
force_resched = 0;
if ((td->td_mpflags & TDF_MP_BATCH_DEMARC) &&
lp->lwp_rrcount >= usched_dfly_rrinterval / 2) {
force_resched = 1;
}
if (user_resched_wanted()) {
if (dd->uschedcp == lp)
force_resched = 1;
clear_user_resched();
dfly_release_curproc(lp);
}
/*
* Loop until we are the current user thread.
*
* NOTE: dd spinlock not held at top of loop.
*/
if (dd->uschedcp == lp)
lwkt_yield_quick();
while (dd->uschedcp != lp) {
lwkt_yield_quick();
spin_lock(&dd->spin);
/*
* We are not or are no longer the current lwp and a forced
* reschedule was requested. Figure out the best cpu to
* run on (our current cpu will be given significant weight).
*
* (if a reschedule was not requested we want to move this
* step after the uschedcp tests).
*/
#ifdef SMP
if (force_resched &&
(usched_dfly_features & 0x08) &&
(rdd = dfly_choose_best_queue(lp)) != dd) {
dfly_changeqcpu_locked(lp, dd, rdd);
spin_unlock(&dd->spin);
lwkt_deschedule(lp->lwp_thread);
dfly_setrunqueue_dd(rdd, lp);
lwkt_switch();
gd = mycpu;
dd = &dfly_pcpu[gd->gd_cpuid];
continue;
}
#endif
/*
* Either no reschedule was requested or the best queue was
* dd, and no current process has been selected. We can
* trivially become the current lwp on the current cpu.
*/
if (dd->uschedcp == NULL) {
atomic_set_cpumask(&dfly_curprocmask, gd->gd_cpumask);
dd->uschedcp = lp;
dd->upri = lp->lwp_priority;
KKASSERT(lp->lwp_qcpu == dd->cpuid);
spin_unlock(&dd->spin);
break;
}
/*
* Can we steal the current designated user thread?
*
* If we do the other thread will stall when it tries to
* return to userland, possibly rescheduling elsewhere.
*
* It is important to do a masked test to avoid the edge
* case where two near-equal-priority threads are constantly
* interrupting each other.
*
* In the exact match case another thread has already gained
* uschedcp and lowered its priority, if we steal it the
* other thread will stay stuck on the LWKT runq and not
* push to another cpu. So don't steal on equal-priority even
* though it might appear to be more beneficial due to not
* having to switch back to the other thread's context.
*
* usched_dfly_fast_resched requires that two threads be
* significantly far apart in priority in order to interrupt.
*
* If better but not sufficiently far apart, the current
* uschedcp will be interrupted at the next scheduler clock.
*/
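		/*
		 * (Note that the ~PPQMASK masking compares in PPQ-wide
		 * bands: with PPQ = 4, priorities 96-99 all compare
		 * equal, so we must be at least one whole queue better,
		 * plus the fast_resched delta, before we steal uschedcp.)
		 */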
if (dd->uschedcp &&
(dd->upri & ~PPQMASK) >
(lp->lwp_priority & ~PPQMASK) + usched_dfly_fast_resched) {
dd->uschedcp = lp;
dd->upri = lp->lwp_priority;
KKASSERT(lp->lwp_qcpu == dd->cpuid);
spin_unlock(&dd->spin);
break;
}
#ifdef SMP
/*
* We are not the current lwp, figure out the best cpu
* to run on (our current cpu will be given significant
* weight). Loop on cpu change.
*/
if ((usched_dfly_features & 0x02) &&
force_resched == 0 &&
(rdd = dfly_choose_best_queue(lp)) != dd) {
dfly_changeqcpu_locked(lp, dd, rdd);
spin_unlock(&dd->spin);
lwkt_deschedule(lp->lwp_thread);
dfly_setrunqueue_dd(rdd, lp);
lwkt_switch();
gd = mycpu;
dd = &dfly_pcpu[gd->gd_cpuid];
continue;
}
#endif
/*
* We cannot become the current lwp, place the lp on the
* run-queue of this or another cpu and deschedule ourselves.
*
* When we are reactivated we will have another chance.
*
* Reload after a switch or setrunqueue/switch possibly
* moved us to another cpu.
*/
spin_unlock(&dd->spin);
lwkt_deschedule(lp->lwp_thread);
dfly_setrunqueue_dd(dd, lp);
lwkt_switch();
gd = mycpu;
dd = &dfly_pcpu[gd->gd_cpuid];
}
/*
* Make sure upri is synchronized, then yield to LWKT threads as
* needed before returning. This could result in another reschedule.
* XXX
*/
crit_exit_quick(td);
KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}
/*
* DFLY_RELEASE_CURPROC
*
* This routine detaches the current thread from the userland scheduler,
* usually because the thread needs to run or block in the kernel (at
* kernel priority) for a while.
*
* This routine is also responsible for selecting a new thread to
* make the current thread.
*
* NOTE: This implementation differs from the dummy example in that
* dfly_select_curproc() is able to select the current process, whereas
* dummy_select_curproc() is not able to select the current process.
* This means we have to NULL out uschedcp.
*
* Additionally, note that we may already be on a run queue if releasing
* via the lwkt_switch() in dfly_setrunqueue().
*/
static void
dfly_release_curproc(struct lwp *lp)
{
globaldata_t gd = mycpu;
dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
/*
* Make sure td_wakefromcpu is defaulted. This will be overwritten
* by wakeup().
 */
	lp->lwp_thread->td_wakefromcpu = gd->gd_cpuid;

	if (dd->uschedcp == lp) {
KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
spin_lock(&dd->spin);
if (dd->uschedcp == lp) {
dd->uschedcp = NULL; /* don't let lp be selected */
dd->upri = PRIBASE_NULL;
atomic_clear_cpumask(&dfly_curprocmask, gd->gd_cpumask);
spin_unlock(&dd->spin);
dfly_select_curproc(gd);
} else {
spin_unlock(&dd->spin);
}
}
}
/*
* DFLY_SELECT_CURPROC
*
* Select a new current process for this cpu and clear any pending user
* reschedule request. The cpu currently has no current process.
*
* This routine is also responsible for equal-priority round-robining,
* typically triggered from dfly_schedulerclock(). In our dummy example
* all the 'user' threads are LWKT scheduled all at once and we just
* call lwkt_switch().
*
* The calling process is not on the queue and cannot be selected.
*/
static
void
dfly_select_curproc(globaldata_t gd)
{
dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
struct lwp *nlp;
int cpuid = gd->gd_cpuid;
crit_enter_gd(gd);
spin_lock(&dd->spin);
nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
if (nlp) {
atomic_set_cpumask(&dfly_curprocmask, CPUMASK(cpuid));
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
#if 0
dd->rrcount = 0; /* reset round robin */
#endif
spin_unlock(&dd->spin);
#ifdef SMP
lwkt_acquire(nlp->lwp_thread);
#endif
lwkt_schedule(nlp->lwp_thread);
} else {
spin_unlock(&dd->spin);
}
crit_exit_gd(gd);
}
/*
* Place the specified lwp on the user scheduler's run queue. This routine
* must be called with the thread descheduled. The lwp must be runnable.
* It must not be possible for anyone else to explicitly schedule this thread.
*
* The thread may be the current thread as a special case.
*/
static void
dfly_setrunqueue(struct lwp *lp)
{
dfly_pcpu_t dd;
dfly_pcpu_t rdd;
/*
* First validate the process LWKT state.
*/
KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
/*
* NOTE: dd/rdd do not necessarily represent the current cpu.
* Instead they may represent the cpu the thread was last
* scheduled on or inherited by its parent.
*/
dd = &dfly_pcpu[lp->lwp_qcpu];
rdd = dd;
/*
* This process is not supposed to be scheduled anywhere or assigned
* as the current process anywhere. Assert the condition.
*/
KKASSERT(rdd->uschedcp != lp);
#ifndef SMP
/*
* If we are not SMP we do not have a scheduler helper to kick
* and must directly activate the process if none are scheduled.
*
* This is really only an issue when bootstrapping init since
* the caller in all other cases will be a user process, and
* even if released (rdd->uschedcp == NULL), that process will
* kickstart the scheduler when it returns to user mode from
* the kernel.
*
* NOTE: On SMP we can't just set some other cpu's uschedcp.
*/
if (rdd->uschedcp == NULL) {
spin_lock(&rdd->spin);
if (rdd->uschedcp == NULL) {
atomic_set_cpumask(&dfly_curprocmask, 1);
rdd->uschedcp = lp;
rdd->upri = lp->lwp_priority;
spin_unlock(&rdd->spin);
lwkt_schedule(lp->lwp_thread);
return;
}
spin_unlock(&rdd->spin);
}
#endif
#ifdef SMP
/*
* Ok, we have to setrunqueue some target cpu and request a reschedule
* if necessary.
*
* We have to choose the best target cpu. It might not be the current
* target even if the current cpu has no running user thread (for
* example, because the current cpu might be a hyperthread and its
* sibling has a thread assigned).
*
* If we just forked it is most optimal to run the child on the same
* cpu just in case the parent decides to wait for it (thus getting
* off that cpu). As long as there is nothing else runnable on the
* cpu, that is. If we did this unconditionally a parent forking
* multiple children before waiting (e.g. make -j N) leaves other
* cpus idle that could be working.
*/
if (lp->lwp_forked) {
lp->lwp_forked = 0;
if (usched_dfly_features & 0x20)
rdd = dfly_choose_best_queue(lp);
else if (usched_dfly_features & 0x40)
rdd = &dfly_pcpu[lp->lwp_qcpu];
else if (usched_dfly_features & 0x80)
rdd = dfly_choose_queue_simple(rdd, lp);
else if (dfly_pcpu[lp->lwp_qcpu].runqcount)
rdd = dfly_choose_best_queue(lp);
else
rdd = &dfly_pcpu[lp->lwp_qcpu];
} else {
rdd = dfly_choose_best_queue(lp);
/* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
}
if (lp->lwp_qcpu != rdd->cpuid) {
spin_lock(&dd->spin);
dfly_changeqcpu_locked(lp, dd, rdd);
spin_unlock(&dd->spin);
}
#endif
dfly_setrunqueue_dd(rdd, lp);
}
#ifdef SMP
/*
 * Change qcpu to rdd->cpuid.  The dd the lp is CURRENTLY on must be
 * spin-locked on call.  rdd does not have to be.
*/
static void
dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
{
if (lp->lwp_qcpu != rdd->cpuid) {
if (lp->lwp_mpflags & LWP_MP_ULOAD) {
atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
atomic_add_int(&dd->uload, -lp->lwp_uload);
atomic_add_int(&dd->ucount, -1);
atomic_add_int(&dfly_ucount, -1);
}
lp->lwp_qcpu = rdd->cpuid;
}
}
#endif
/*
* Place lp on rdd's runqueue. Nothing is locked on call. This function
* also performs all necessary ancillary notification actions.
*/
static void
dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
{
#ifdef SMP
globaldata_t rgd;
/*
* We might be moving the lp to another cpu's run queue, and once
* on the runqueue (even if it is our cpu's), another cpu can rip
* it away from us.
*
* TDF_MIGRATING might already be set if this is part of a
* remrunqueue+setrunqueue sequence.
*/
if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
lwkt_giveaway(lp->lwp_thread);
rgd = globaldata_find(rdd->cpuid);
/*
* We lose control of the lp the moment we release the spinlock
* after having placed it on the queue. i.e. another cpu could pick
* it up, or it could exit, or its priority could be further
* adjusted, or something like that.
*
* WARNING! rdd can point to a foreign cpu!
*/
spin_lock(&rdd->spin);
dfly_setrunqueue_locked(rdd, lp);
/*
* Potentially interrupt the currently-running thread
*/
if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK)) {
/*
* Currently running thread is better or same, do not
* interrupt.
*/
spin_unlock(&rdd->spin);
} else if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK) +
usched_dfly_fast_resched) {
/*
* Currently running thread is not better, but not so bad
* that we need to interrupt it. Let it run for one more
* scheduler tick.
*/
if (rdd->uschedcp &&
rdd->uschedcp->lwp_rrcount < usched_dfly_rrinterval) {
rdd->uschedcp->lwp_rrcount = usched_dfly_rrinterval - 1;
}
spin_unlock(&rdd->spin);
} else if (rgd == mycpu) {
/*
* We should interrupt the currently running thread, which
* is on the current cpu.
*/
spin_unlock(&rdd->spin);
if (rdd->uschedcp == NULL) {
wakeup_mycpu(&rdd->helper_thread); /* XXX */
need_user_resched();
} else {
need_user_resched();
}
} else {
/*
* We should interrupt the currently running thread, which
* is on a different cpu.
*/
spin_unlock(&rdd->spin);
lwkt_send_ipiq(rgd, dfly_need_user_resched_remote, NULL);
}
#else
/*
* Request a reschedule if appropriate.
*/
spin_lock(&rdd->spin);
dfly_setrunqueue_locked(rdd, lp);
spin_unlock(&rdd->spin);
if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
need_user_resched();
}
#endif
}
/*
* This routine is called from a systimer IPI. It MUST be MP-safe and
* the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on
* each cpu.
*/
static
void
dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
globaldata_t gd = mycpu;
dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
/*
* Spinlocks also hold a critical section so there should not be
* any active.
*/
KKASSERT(gd->gd_spinlocks == 0);
if (lp == NULL)
return;
/*
* Do we need to round-robin? We round-robin 10 times a second.
* This should only occur for cpu-bound batch processes.
*/
if (++lp->lwp_rrcount >= usched_dfly_rrinterval) {
lp->lwp_thread->td_wakefromcpu = -1;
need_user_resched();
}
/*
* Adjust estcpu upward using a real time equivalent calculation,
* and recalculate lp's priority.
*/
lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
dfly_resetpriority(lp);
/*
* Rebalance two cpus every 8 ticks, pulling the worst thread
* from the worst cpu's queue into a rotating cpu number.
*
* This mechanic is needed because the push algorithms can
	 * steady-state in a non-optimal configuration.  We need to mix it
* up a little, even if it means breaking up a paired thread, so
* the push algorithms can rebalance the degenerate conditions.
* This portion of the algorithm exists to ensure stability at the
* selected weightings.
*
* Because we might be breaking up optimal conditions we do not want
* to execute this too quickly, hence we only rebalance approximately
	 * ~7-8 times per second.  The pushes, on the other hand, are capable
	 * of moving threads to other cpus at a much higher rate.
*
* We choose the most heavily loaded thread from the worst queue
* in order to ensure that multiple heavy-weight threads on the same
* queue get broken up, and also because these threads are the most
* likely to be able to remain in place. Hopefully then any pairings,
* if applicable, migrate to where these threads are.
*/
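	/*
	 * (The test below fires on every 8th sched_tick, and the
	 * "/ 8 % ncpus" term rotates which cpu performs the pull,
	 * so each cpu takes the rebalancing role once per 8 * ncpus
	 * ticks.)
	 */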
#ifdef SMP
if ((usched_dfly_features & 0x04) &&
((u_int)sched_ticks & 7) == 0 &&
(u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) {
/*
* Our cpu is up.
*/
struct lwp *nlp;
dfly_pcpu_t rdd;
rdd = dfly_choose_worst_queue(dd);
if (rdd) {
spin_lock(&dd->spin);
if (spin_trylock(&rdd->spin)) {
nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
spin_unlock(&rdd->spin);
if (nlp == NULL)
spin_unlock(&dd->spin);
} else {
spin_unlock(&dd->spin);
nlp = NULL;
}
} else {
nlp = NULL;
}
/* dd->spin held if nlp != NULL */
/*
* Either schedule it or add it to our queue.
*/
if (nlp &&
(nlp->lwp_priority & ~PPQMASK) < (dd->upri & ~PPQMASK)) {
atomic_set_cpumask(&dfly_curprocmask, dd->cpumask);
dd->upri = nlp->lwp_priority;
dd->uschedcp = nlp;
#if 0
dd->rrcount = 0; /* reset round robin */
#endif
spin_unlock(&dd->spin);
lwkt_acquire(nlp->lwp_thread);
lwkt_schedule(nlp->lwp_thread);
} else if (nlp) {
dfly_setrunqueue_locked(dd, nlp);
spin_unlock(&dd->spin);
}
}
#endif
}
/*
* Called from acquire and from kern_synch's one-second timer (one of the
* callout helper threads) with a critical section held.
*
* Adjust p_estcpu based on our single-cpu load, p_nice, and compensate for
* overall system load.
*
* Note that no recalculation occurs for a process which sleeps and wakes
* up in the same tick. That is, a system doing thousands of context
* switches per second will still only do serious estcpu calculations
* ESTCPUFREQ times per second.
*/
static
void
dfly_recalculate_estcpu(struct lwp *lp)
{
globaldata_t gd = mycpu;
sysclock_t cpbase;
sysclock_t ttlticks;
int estcpu;
int decay_factor;
int ucount;
/*
* We have to subtract periodic to get the last schedclock
* timeout time, otherwise we would get the upcoming timeout.
* Keep in mind that a process can migrate between cpus and
* while the scheduler clock should be very close, boundary
* conditions could lead to a small negative delta.
*/
cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
if (lp->lwp_slptime > 1) {
/*
* Too much time has passed, do a coarse correction.
*/
lp->lwp_estcpu = lp->lwp_estcpu >> 1;
dfly_resetpriority(lp);
lp->lwp_cpbase = cpbase;
lp->lwp_cpticks = 0;
lp->lwp_estfast = 0;
} else if (lp->lwp_cpbase != cpbase) {
/*
* Adjust estcpu if we are in a different tick. Don't waste
* time if we are in the same tick.
*
* First calculate the number of ticks in the measurement
* interval. The ttlticks calculation can wind up 0 due to
* a bug in the handling of lwp_slptime (as yet not found),
* so make sure we do not get a divide by 0 panic.
*/
ttlticks = (cpbase - lp->lwp_cpbase) /
gd->gd_schedclock.periodic;
if (ttlticks < 0) {
ttlticks = 0;
lp->lwp_cpbase = cpbase;
}
if (ttlticks == 0)
return;
updatepcpu(lp, lp->lwp_cpticks, ttlticks);
/*
* Calculate the percentage of one cpu being used then
* compensate for any system load in excess of ncpus.
*
* For example, if we have 8 cores and 16 running cpu-bound
* processes then all things being equal each process will
* get 50% of one cpu. We need to pump this value back
* up to 100% so the estcpu calculation properly adjusts
* the process's dynamic priority.
*
* estcpu is scaled by ESTCPUMAX, pctcpu is scaled by FSCALE.
*/
estcpu = (lp->lwp_pctcpu * ESTCPUMAX) >> FSHIFT;
ucount = dfly_ucount;
if (ucount > ncpus) {
estcpu += estcpu * (ucount - ncpus) / ncpus;
}
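		/*
		 * Worked example: 16 cpu-bound threads on 8 cores each
		 * show pctcpu of ~50%, so estcpu starts at ~ESTCPUMAX/2;
		 * the correction above adds estcpu * (16 - 8) / 8,
		 * doubling it back to ~ESTCPUMAX (100% of one cpu).
		 */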
if (usched_dfly_debug == lp->lwp_proc->p_pid) {
kprintf("pid %d lwp %p estcpu %3d %3d cp %d/%d",
lp->lwp_proc->p_pid, lp,
estcpu, lp->lwp_estcpu,
lp->lwp_cpticks, ttlticks);
}
/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
* quickly lwp_estcpu collapses to its realtime calculation.
* A slower collapse gives us a more accurate number over
* the long term but can create problems with bursty threads
* or threads which become cpu hogs.
*
* To solve this problem, newly started lwps and lwps which
* are restarting after having been asleep for a while are
* given a much, much faster decay in order to quickly
* detect whether they become cpu-bound.
*
* NOTE: p_nice is accounted for in dfly_resetpriority(),
* and not here, but we must still ensure that a
* cpu-bound nice -20 process does not completely
* override a cpu-bound nice +20 process.
*
* NOTE: We must use ESTCPULIM() here to deal with any
* overshoot.
*/
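		/*
		 * The updates below are exponential moving averages: each
		 * pass moves lwp_estcpu 1/(N + 1) of the way toward the
		 * instantaneous estcpu, where N is lwp_estfast while the
		 * thread is ramping up and decay_factor (default 8, so
		 * ~11% per pass) thereafter.
		 */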
decay_factor = usched_dfly_decay;
if (decay_factor < 1)
decay_factor = 1;
if (decay_factor > 1024)
decay_factor = 1024;
if (lp->lwp_estfast < usched_dfly_decay) {
++lp->lwp_estfast;
lp->lwp_estcpu = ESTCPULIM(
(lp->lwp_estcpu * lp->lwp_estfast + estcpu) /
(lp->lwp_estfast + 1));
} else {
lp->lwp_estcpu = ESTCPULIM(
(lp->lwp_estcpu * decay_factor + estcpu) /
(decay_factor + 1));
}
if (usched_dfly_debug == lp->lwp_proc->p_pid)
kprintf(" finalestcpu %d\n", lp->lwp_estcpu);