-
Notifications
You must be signed in to change notification settings - Fork 836
/
StringUtil.cpp
291 lines (232 loc) · 7.71 KB
/
StringUtil.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/* This file is part of the Spring engine (GPL v2 or later), see LICENSE.html */
#include "System/StringUtil.h"
#include "System/bitops.h"
#include <zlib.h>
#if defined(_MSC_VER) && (_MSC_VER >= 1310)
#include <intrin.h>
#endif
#include <cstring>
#include <cinttypes>
std::string StringReplace(const std::string& text,
const std::string& from,
const std::string& to)
{
std::string working = text;
std::string::size_type pos = 0;
while (true) {
if ((pos = working.find(from, pos)) == std::string::npos)
break;
std::string tmp = working.substr(0, pos);
tmp += to;
tmp += working.substr(pos + from.size(), std::string::npos);
pos += to.size();
working = tmp;
}
return working;
}
std::string StringStrip(const std::string& str, const std::string& chars)
{
std::string ret;
ret.reserve(str.size());
for (const char c: str) {
if (chars.find(c) != std::string::npos)
continue;
ret.push_back(c);
}
return ret;
}
/// @see http://www.codeproject.com/KB/stl/stdstringtrim.aspx
void StringTrimInPlace(std::string& str, const std::string& ws)
{
std::string::size_type pos = str.find_last_not_of(ws);
if (pos != std::string::npos) {
str.erase(pos + 1);
if ((pos = str.find_first_not_of(ws)) != std::string::npos)
str.erase(0, pos);
return;
}
str.erase(str.begin(), str.end());
}
std::string StringTrim(const std::string& str, const std::string& ws)
{
std::string copy(str);
StringTrimInPlace(copy, ws);
return copy;
}
bool StringToBool(std::string str)
{
StringTrimInPlace(str);
StringToLowerInPlace(str);
// regex would probably be more appropriate,
// but it is better not to rely on any external lib here
if (str.empty()) return false;
if (str == "0") return false;
if (str == "n") return false;
if (str == "no") return false;
if (str == "f") return false;
if (str == "false") return false;
if (str == "off") return false;
return true;
}
bool StringStartsWith(const std::string& str, const char* prefix)
{
if ((prefix == nullptr) || (str.size() < strlen(prefix)))
return false;
return (str.compare(0, strlen(prefix), prefix) == 0);
}
bool StringEndsWith(const std::string& str, const char* postfix)
{
if ((postfix == nullptr) || (str.size() < strlen(postfix)))
return false;
return (str.compare(str.size() - strlen(postfix), str.size(), postfix) == 0);
}
void InverseOrSetBool(bool& b, const std::string& argValue, const bool inverseArg)
{
if (argValue.empty()) {
// toggle
b = !b;
} else {
// set
b = inverseArg? (!StringToBool(argValue)) : (StringToBool(argValue));
}
}
char32_t utf8::GetNextChar(const std::string& text, int& pos)
{
// UTF8 looks like this
// 1Byte == ASCII: 0xxxxxxxxx
// 2Bytes encoded char: 110xxxxxxx 10xxxxxx
// 3Bytes encoded char: 1110xxxxxx 10xxxxxx 10xxxxxx
// 4Bytes encoded char: 11110xxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// Originaly there were 5&6 byte versions too, but they were dropped in RFC 3629.
// So UTF8 maps to UTF16 range only.
static constexpr auto UTF8_CONT_MASK = 0xC0; // 11xxxxxx
static constexpr auto UTF8_CONT_OKAY = 0x80; // 10xxxxxx
union UTF8_4Byte {
std::uint32_t i;
std::uint8_t c[4];
};
// read next 4bytes and check if it is an utf8 sequence
UTF8_4Byte utf8 = { 0 };
const int remainingChars = text.length() - pos;
if (remainingChars >= 4) {
// we need to use memcpy cause text[pos] isn't memory aligned as ints need to be
memcpy(&utf8.i, &text[pos], sizeof(std::uint32_t));
} else {
// read ahead of end of string
if (remainingChars <= 0)
return 0;
// end of string reached, only read till end
switch (remainingChars) {
case 3: utf8.c[2] = std::uint8_t(text[pos + 2]);
case 2: utf8.c[1] = std::uint8_t(text[pos + 1]);
case 1: utf8.c[0] = std::uint8_t(text[pos ]);
default: {}
};
}
// how many bytes are requested for our multi-byte utf8 sequence
unsigned clo = count_leading_ones(utf8.c[0]);
if (clo>4 || clo==0) clo = 1; // ignore >=5 byte ones cause of RFC 3629
// how many healthy utf8 bytes are following
unsigned numValidUtf8Bytes = 1; // first char is always valid
numValidUtf8Bytes += int((utf8.c[1] & UTF8_CONT_MASK) == UTF8_CONT_OKAY);
numValidUtf8Bytes += int((utf8.c[2] & UTF8_CONT_MASK) == UTF8_CONT_OKAY);
numValidUtf8Bytes += int((utf8.c[3] & UTF8_CONT_MASK) == UTF8_CONT_OKAY);
// check if enough trailing utf8 bytes are healthy
// else ignore utf8 and parse it as 8bit Latin-1 char (extended ASCII)
// this adds backwardcompatibility with the old renderer
// which supported extended ASCII with umlauts etc.
const auto usedUtf8Bytes = (clo <= numValidUtf8Bytes) ? clo : 1u;
char32_t u = 0;
switch (usedUtf8Bytes) {
case 0:
case 1: {
u = utf8.c[0];
} break;
case 2: {
u = (char32_t(utf8.c[0] & 0x1F)) << 6;
u |= (char32_t(utf8.c[1] & 0x3F));
} break;
case 3: {
u = (char32_t(utf8.c[0] & 0x0F)) << 12;
u |= (char32_t(utf8.c[1] & 0x3F)) << 6;
u |= (char32_t(utf8.c[2] & 0x3F));
} break;
case 4: {
u = (char32_t(utf8.c[0] & 0x07)) << 18;
u |= (char32_t(utf8.c[1] & 0x3F)) << 12;
u |= (char32_t(utf8.c[2] & 0x3F)) << 6;
u |= (char32_t(utf8.c[3] & 0x3F));
//TODO limit range to UTF16!
} break;
}
pos += usedUtf8Bytes;
// replace tabs with spaces
if (u == 0x9)
u = 0x2007;
return u;
}
std::string utf8::FromUnicode(char32_t ch)
{
std::string str;
// in: 0000 0000 0000 0000 0000 0000 0aaa aaaa
// out: 0aaa aaaa
if (ch < (1 << 7)) {
str += (char)ch;
}
// in: 0000 0000 0000 0000 0000 0bbb bbaa aaaa
// out: 110b bbbb 10aa aaaa
else if (ch < (1 << 11)) {
str += 0xC0 | (char)(ch >> 6);
str += 0x80 | (char)(ch & 0x3F);
}
// in: 0000 0000 0000 0000 cccc bbbb bbaa aaaa
// out: 1110 cccc 10bb bbbb 10aa aaaa
else if (ch < (1 << 16)) {
str += 0xE0 | (char) (ch >> 12 );
str += 0x80 | (char)((ch >> 6) & 0x3F);
str += 0x80 | (char)( ch & 0x3F);
}
// in: 0000 0000 000d ddcc cccc bbbb bbaa aaaa
// out: 1111 0ddd 10cc cccc 10bb bbbb 10aa aaaa
else if (ch < (1 << 21)) {
str += 0xF0 | (char) (ch >> 18 );
str += 0x80 | (char)((ch >> 12) & 0x3F);
str += 0x80 | (char)((ch >> 6) & 0x3F);
str += 0x80 | (char)( ch & 0x3F);
}
return str;
}
#if !defined(UNITSYNC) && !defined(UNIT_TEST) && !defined(BUILDING_AI)
std::vector<std::uint8_t> zlib::deflate(const std::vector<std::uint8_t>& inflData) { return (zlib::deflate(inflData.data(), inflData.size())); }
std::vector<std::uint8_t> zlib::deflate(const std::uint8_t* inflData, unsigned long inflSize) {
std::vector<std::uint8_t> deflData(compressBound(inflSize));
unsigned long bufSize = deflData.size();
if (compress(deflData.data(), &bufSize, inflData, inflSize) == Z_OK) {
deflData.resize(bufSize);
deflData.shrink_to_fit();
} else {
deflData.clear();
}
return deflData;
}
std::vector<std::uint8_t> zlib::inflate(const std::vector<std::uint8_t>& deflData) { return (zlib::inflate(deflData.data(), deflData.size())); }
std::vector<std::uint8_t> zlib::inflate(const std::uint8_t* deflData, unsigned long deflSize) {
// "the LSB does not describe any mechanism by which a
// compressor can communicate the size required to the
// uncompressor" ==> we must reserve some fixed-length
// buffer (starting at 256K bytes to handle very large
// blobs) for each new decompression attempt
std::vector<std::uint8_t> inflData(256 * 1024);
unsigned long bufSize = inflData.size();
unsigned long rawSize = bufSize;
int ret;
while ((ret = uncompress(inflData.data(), &rawSize, deflData, deflSize)) == Z_BUF_ERROR) {
inflData.resize(rawSize = (bufSize *= 2));
}
if (ret != Z_OK)
inflData.clear();
inflData.resize(rawSize);
inflData.shrink_to_fit();
return inflData;
}
#endif //UNITSYNC