1
- //! Performance
1
+ //! Sort an array of strings using MSD radix sort.
2
2
//!
3
- //! The running time of MSD string sort depends on the data.
3
+ //! # Performance
4
4
//!
5
- //! For random inputs, MSD string sort examines just enough
5
+ //! - The running time of MSD string sort depends on the data;
6
+ //! key length does not play a role. Indeed, the random string
7
+ //! model allows key length to approach infinity.
8
+ //! - For random inputs, MSD string sort examines just enough
6
9
//! characters to distinguish among the keys, and the running
7
10
//! time is sublinear in the number of characters in the data
8
11
//! (it examines a small fraction of the input characters).
50
53
//! will not be many duplicates and the random model will apply;
51
54
//! for a week’s worth of data on a local road, there will be numerous
52
55
//! duplicates and performance will be closer to the worst case.
56
+ //!
57
+ //! The main challenge in getting maximum efficiency from MSD string
58
+ //! sort on keys that are long strings is to deal with lack of randomness
59
+ //! in the data. Typically, keys may have long stretches of equal data,
60
+ //! or parts of them might fall in only a narrow range. For example,
61
+ //! an information-processing application for student data might have
62
+ //! keys that include graduation year (4 bytes, but one of four different
63
+ //! values), state names (perhaps 10 bytes, but one of 50 different values),
64
+ //! and gender (1 byte with one of two given values), as well as a person’s
65
+ //! name (more similar to random strings, but probably not short, with
66
+ //! nonuniform letter distributions, and with trailing blanks in a fixed-length
67
+ //! field). Restrictions like these lead to large numbers of empty subarrays
68
+ //! during the MSD string sort.
53
69
54
70
#![ allow( clippy:: many_single_char_names) ]
55
- use std:: cmp:: Ordering ;
71
+ use crate :: sort;
72
+ use crate :: strings:: util;
56
73
use std:: marker:: PhantomData ;
57
74
58
75
const R : usize = 256 ; // extended ASCII alphabet size
59
76
const CUTOFF : usize = 15 ; // cutoff to insertion sort
60
77
78
+ /// The `MSD` type provides associated functions for sorting an
79
+ /// array of extended ASCII strings using MSD radix
80
+ /// sort.
61
81
pub struct MSD < T > {
62
82
_marker : PhantomData < T > ,
63
83
}
@@ -66,6 +86,7 @@ impl<T> MSD<T>
66
86
where
67
87
T : AsRef < str > + Copy ,
68
88
{
89
+ /// Rearranges the array of extended ASCII strings in ascending order.
69
90
pub fn sort ( a : & mut [ T ] ) {
70
91
let n = a. len ( ) ;
71
92
if n > 0 {
@@ -74,17 +95,18 @@ where
74
95
}
75
96
}
76
97
98
+ /// Sorts `a[lo..=hi]` (both bounds inclusive), starting at the d-th character.
77
99
fn do_sort ( a : & mut [ T ] , lo : usize , hi : usize , d : usize , aux : & mut [ T ] ) {
78
100
// cutoff to insertion sort for small subarrays
79
101
if hi <= lo + CUTOFF {
80
- Self :: insertion ( a, lo, hi, d) ;
102
+ sort :: insert :: sort_dth ( a, lo, hi, d) ;
81
103
return ;
82
104
}
83
105
84
106
// compute frequency counts
85
107
let mut count = [ 0 ; R + 2 ] ;
86
108
for it in a. iter ( ) . take ( hi + 1 ) . skip ( lo) {
87
- let c = char_at ( it. as_ref ( ) , d) ;
109
+ let c = util :: char_at ( it. as_ref ( ) , d) ;
88
110
count[ ( c + 2 ) as usize ] += 1 ;
89
111
}
90
112
95
117
96
118
// distribute
97
119
for it in a. iter ( ) . take ( hi + 1 ) . skip ( lo) {
98
- let c = char_at ( it. as_ref ( ) , d) ;
120
+ let c = util :: char_at ( it. as_ref ( ) , d) ;
99
121
aux[ count[ ( c + 1 ) as usize ] ] = * it;
100
122
count[ ( c + 1 ) as usize ] += 1 ;
101
123
}
@@ -114,49 +136,4 @@ where
114
136
}
115
137
}
116
138
}
117
-
118
- fn insertion ( a : & mut [ T ] , lo : usize , hi : usize , d : usize ) {
119
- for i in lo..=hi {
120
- let mut j = i;
121
- while j > lo && less ( a[ j] . as_ref ( ) , a[ j - 1 ] . as_ref ( ) , d) {
122
- a. swap ( j, j - 1 ) ;
123
- j -= 1 ;
124
- }
125
- }
126
- }
127
- }
128
-
129
- fn less ( v : & str , w : & str , d : usize ) -> bool {
130
- for ( a, b) in v. bytes ( ) . zip ( w. bytes ( ) ) . skip ( d) {
131
- match a. cmp ( & b) {
132
- Ordering :: Less => return true ,
133
- Ordering :: Equal => ( ) ,
134
- Ordering :: Greater => return false ,
135
- }
136
- }
137
- v. as_bytes ( ) . len ( ) < w. as_bytes ( ) . len ( )
138
- }
139
-
140
- fn char_at ( s : & str , d : usize ) -> i32 {
141
- let len = s. as_bytes ( ) . len ( ) ;
142
- if d >= len {
143
- -1
144
- } else {
145
- s. as_bytes ( ) [ d] as i32
146
- }
147
- }
148
-
149
- #[ test]
150
- fn t_less ( ) {
151
- assert ! ( less( "aaa" , "aaaa" , 0 ) ) ; // len less
152
- assert ! ( less( "aaa" , "aaaa" , 1 ) ) ; // len less
153
- assert ! ( less( "aaa" , "abaa" , 1 ) ) ; // 'a' < 'b'
154
- }
155
-
156
- #[ test]
157
- fn t_char_at ( ) {
158
- assert_eq ! ( b'a' as i32 , char_at( "abc" , 0 ) ) ;
159
- assert_eq ! ( b'b' as i32 , char_at( "abc" , 1 ) ) ;
160
- assert_eq ! ( b'c' as i32 , char_at( "abc" , 2 ) ) ;
161
- assert_eq ! ( -1 , char_at( "abc" , 3 ) ) ;
162
139
}
0 commit comments