-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmanipulation.py
649 lines (472 loc) · 21.7 KB
/
manipulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
# -*- coding: utf-8 -*-
# public api to export
__all__ = [
'camel_case_to_snake',
'snake_case_to_camel',
'reverse',
'shuffle',
'strip_html',
'prettify',
'asciify',
'slugify',
'booleanize',
'strip_margin',
'compress',
'decompress',
'roman_encode',
'roman_decode',
]
import base64
import random
import unicodedata
import zlib
from typing import Union
from uuid import uuid4
from ._regex import *
from .errors import InvalidInputError
from .validation import is_snake_case, is_full_string, is_camel_case, is_integer, is_string
# PRIVATE API
class __RomanNumbers:
# internal rule mappings for encode()
__mappings = [
# units
{1: 'I', 5: 'V'},
# tens
{1: 'X', 5: 'L'},
# hundreds
{1: 'C', 5: 'D'},
# thousands
{1: 'M'},
]
# swap key/value definitions for decode()
__reversed_mappings = [{v: k for k, v in m.items()} for m in __mappings]
@classmethod
def __encode_digit(cls, index: int, value: int) -> str:
# if digit is zero, there is no sign to display
if value == 0:
return ''
# from 1 to 3 we have just to repeat the sign N times (eg: III, XXX...)
if value <= 3:
return cls.__mappings[index][1] * value
# if 4 we have to add unit prefix
if value == 4:
return cls.__mappings[index][1] + cls.__mappings[index][5]
# if is 5, is a straight map
if value == 5:
return cls.__mappings[index][5]
# if 6, 7 or 8 we have to append unit suffixes
if value <= 8:
suffix = cls.__mappings[index][1] * (value - 5)
return cls.__mappings[index][5] + suffix
# if 9 we have to prepend current unit to next
return cls.__mappings[index][1] + cls.__mappings[index + 1][1]
@classmethod
def encode(cls, input_number: Union[str, int]) -> str:
# force input conversion to a string (we need it in order to iterate on each digit)
input_string = str(input_number)
if not is_integer(input_string):
raise ValueError('Invalid input, only strings or integers are allowed')
value = int(input_string)
if value < 1 or value > 3999:
raise ValueError('Input must be >= 1 and <= 3999')
input_len = len(input_string)
output = ''
# decode digits from right to left (start from units to thousands)
for index in range(input_len):
# get actual digit value as int
digit = int(input_string[input_len - index - 1])
# encode digit to roman string
encoded_digit = cls.__encode_digit(index, digit)
# prepend encoded value to the current output in order to have the final string sorted
# from thousands to units
output = encoded_digit + output
return output
@classmethod
def __index_for_sign(cls, sign: str) -> int:
for index, mapping in enumerate(cls.__reversed_mappings):
if sign in mapping:
return index
raise ValueError('Invalid token found: "{}"'.format(sign))
@classmethod
def decode(cls, input_string: str) -> int:
if not is_full_string(input_string):
raise ValueError('Input must be a non empty string')
# reverse the provided string so that we can start parsing from units to thousands
reversed_string = reverse(input_string.upper())
# track last used value
last_value = None
# computed number to return
output = 0
# for each sign in the string we get its numeric value and add or subtract it to the computed output
for sign in reversed_string:
# are we dealing with units, tens, hundreds or thousands?
index = cls.__index_for_sign(sign)
# it's basically 1 or 5 (based on mapping rules definitions)
key_value = cls.__reversed_mappings[index][sign]
# Based on the level (tens, hundreds...) we have to add as many zeroes as the level into which we are
# in order to have the actual sign value.
# For instance, if we are at level 2 we are dealing with hundreds, therefore instead of 1 or 5, we will
# obtain 100 or 500 by adding 2 zeroes
sign_value = int(str(key_value) + '0' * index)
# increase total value if we are moving on with level
if last_value is None or sign_value >= last_value:
output += sign_value
# Decrease value if we are back to a previous level
# For instance, if we are parsing "IX", we first encounter "X" which is ten then "I" which is unit,
# So we have to do the following operation in order to get 9 (the final result): 10 - 1
else:
output -= sign_value
last_value = sign_value
return output
class __StringCompressor:
@staticmethod
def __require_valid_input_and_encoding(input_string: str, encoding: str):
if not is_string(input_string):
raise InvalidInputError(input_string)
if len(input_string) == 0:
raise ValueError('Input string cannot be empty')
if not is_string(encoding):
raise ValueError('Invalid encoding')
@classmethod
def compress(cls, input_string: str, encoding: str = 'utf-8', compression_level: int = 9) -> str:
cls.__require_valid_input_and_encoding(input_string, encoding)
if not isinstance(compression_level, int) or compression_level < 0 or compression_level > 9:
raise ValueError('Invalid compression_level: it must be an "int" between 0 and 9')
# turns input string into a sequence of bytes using provided encoding
original_bytes = input_string.encode(encoding)
# compress bytes using zlib library
compressed_bytes = zlib.compress(original_bytes, compression_level)
# encode compressed bytes using base64
# (this ensure that all characters will be available and that the output string can be used safely in any
# context such URLs)
encoded_bytes = base64.urlsafe_b64encode(compressed_bytes)
# finally turns base64 bytes into a string
output = encoded_bytes.decode(encoding)
return output
@classmethod
def decompress(cls, input_string: str, encoding: str = 'utf-8') -> str:
cls.__require_valid_input_and_encoding(input_string, encoding)
# turns input string into a sequence of bytes
# (the string is assumed to be a previously compressed string, therefore we have to decode it using base64)
input_bytes = base64.urlsafe_b64decode(input_string)
# decompress bytes using zlib
decompressed_bytes = zlib.decompress(input_bytes)
# decode the decompressed bytes to get the original string back
original_string = decompressed_bytes.decode(encoding)
return original_string
class __StringFormatter:
def __init__(self, input_string):
if not is_string(input_string):
raise InvalidInputError(input_string)
self.input_string = input_string
def __uppercase_first_char(self, regex_match):
return regex_match.group(0).upper()
def __remove_duplicates(self, regex_match):
return regex_match.group(1)[0]
def __uppercase_first_letter_after_sign(self, regex_match):
match = regex_match.group(1)
return match[:-1] + match[2].upper()
def __ensure_right_space_only(self, regex_match):
return regex_match.group(1).strip() + ' '
def __ensure_left_space_only(self, regex_match):
return ' ' + regex_match.group(1).strip()
def __ensure_spaces_around(self, regex_match):
return ' ' + regex_match.group(1).strip() + ' '
def __remove_internal_spaces(self, regex_match):
return regex_match.group(1).strip()
def __fix_saxon_genitive(self, regex_match):
return regex_match.group(1).replace(' ', '') + ' '
# generates a placeholder to inject temporary into the string, it will be replaced with the original
# value at the end of the process
@staticmethod
def __placeholder_key():
return '$' + uuid4().hex + '$'
def format(self) -> str:
# map of temporary placeholders
placeholders = {}
out = self.input_string
# looks for url or email and updates placeholders map with found values
placeholders.update({self.__placeholder_key(): m[0] for m in URLS_RE.findall(out)})
placeholders.update({self.__placeholder_key(): m for m in EMAILS_RE.findall(out)})
# replace original value with the placeholder key
for p in placeholders:
out = out.replace(placeholders[p], p, 1)
out = PRETTIFY_RE['UPPERCASE_FIRST_LETTER'].sub(self.__uppercase_first_char, out)
out = PRETTIFY_RE['DUPLICATES'].sub(self.__remove_duplicates, out)
out = PRETTIFY_RE['RIGHT_SPACE'].sub(self.__ensure_right_space_only, out)
out = PRETTIFY_RE['LEFT_SPACE'].sub(self.__ensure_left_space_only, out)
out = PRETTIFY_RE['SPACES_AROUND'].sub(self.__ensure_spaces_around, out)
out = PRETTIFY_RE['SPACES_INSIDE'].sub(self.__remove_internal_spaces, out)
out = PRETTIFY_RE['UPPERCASE_AFTER_SIGN'].sub(self.__uppercase_first_letter_after_sign, out)
out = PRETTIFY_RE['SAXON_GENITIVE'].sub(self.__fix_saxon_genitive, out)
out = out.strip()
# restore placeholder keys with their associated original value
for p in placeholders:
out = out.replace(p, placeholders[p], 1)
return out
# PUBLIC API
def reverse(input_string: str) -> str:
"""
Returns the string with its chars reversed.
*Example:*
>>> reverse('hello') # returns 'olleh'
:param input_string: String to revert.
:type input_string: str
:return: Reversed string.
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
return input_string[::-1]
def camel_case_to_snake(input_string, separator='_'):
"""
Convert a camel case string into a snake case one.
(The original string is returned if is not a valid camel case string)
*Example:*
>>> camel_case_to_snake('ThisIsACamelStringTest') # returns 'this_is_a_camel_case_string_test'
:param input_string: String to convert.
:type input_string: str
:param separator: Sign to use as separator.
:type separator: str
:return: Converted string.
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
if not is_camel_case(input_string):
return input_string
return CAMEL_CASE_REPLACE_RE.sub(lambda m: m.group(1) + separator, input_string).lower()
def snake_case_to_camel(input_string: str, upper_case_first: bool = True, separator: str = '_') -> str:
"""
Convert a snake case string into a camel case one.
(The original string is returned if is not a valid snake case string)
*Example:*
>>> snake_case_to_camel('the_snake_is_green') # returns 'TheSnakeIsGreen'
:param input_string: String to convert.
:type input_string: str
:param upper_case_first: True to turn the first letter into uppercase (default).
:type upper_case_first: bool
:param separator: Sign to use as separator (default to "_").
:type separator: str
:return: Converted string
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
if not is_snake_case(input_string, separator):
return input_string
tokens = [s.title() for s in input_string.split(separator) if is_full_string(s)]
if not upper_case_first:
tokens[0] = tokens[0].lower()
out = ''.join(tokens)
return out
def shuffle(input_string: str) -> str:
"""
Return a new string containing same chars of the given one but in a randomized order.
*Example:*
>>> shuffle('hello world') # possible output: 'l wodheorll'
:param input_string: String to shuffle
:type input_string: str
:return: Shuffled string
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
# turn the string into a list of chars
chars = list(input_string)
# shuffle the list
random.shuffle(chars)
# convert the shuffled list back to string
return ''.join(chars)
def strip_html(input_string: str, keep_tag_content: bool = False) -> str:
"""
Remove html code contained into the given string.
*Examples:*
>>> strip_html('test: <a href="foo/bar">click here</a>') # returns 'test: '
>>> strip_html('test: <a href="foo/bar">click here</a>', keep_tag_content=True) # returns 'test: click here'
:param input_string: String to manipulate.
:type input_string: str
:param keep_tag_content: True to preserve tag content, False to remove tag and its content too (default).
:type keep_tag_content: bool
:return: String with html removed.
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
r = HTML_TAG_ONLY_RE if keep_tag_content else HTML_RE
return r.sub('', input_string)
def prettify(input_string: str) -> str:
"""
Reformat a string by applying the following basic grammar and formatting rules:
- String cannot start or end with spaces
- The first letter in the string and the ones after a dot, an exclamation or a question mark must be uppercase
- String cannot have multiple sequential spaces, empty lines or punctuation (except for "?", "!" and ".")
- Arithmetic operators (+, -, /, \\*, =) must have one, and only one space before and after themselves
- One, and only one space should follow a dot, a comma, an exclamation or a question mark
- Text inside double quotes cannot start or end with spaces, but one, and only one space must come first and \
after quotes (foo" bar"baz -> foo "bar" baz)
- Text inside round brackets cannot start or end with spaces, but one, and only one space must come first and \
after brackets ("foo(bar )baz" -> "foo (bar) baz")
- Percentage sign ("%") cannot be preceded by a space if there is a number before ("100 %" -> "100%")
- Saxon genitive is correct ("Dave' s dog" -> "Dave's dog")
*Examples:*
>>> prettify(' unprettified string ,, like this one,will be"prettified" .it\\' s awesome! ')
>>> # -> 'Unprettified string, like this one, will be "prettified". It\'s awesome!'
:param input_string: String to manipulate
:return: Prettified string.
"""
formatted = __StringFormatter(input_string).format()
return formatted
def asciify(input_string: str) -> str:
"""
Force string content to be ascii-only by translating all non-ascii chars into the closest possible representation
(eg: ó -> o, Ë -> E, ç -> c...).
**Bear in mind**: Some chars may be lost if impossible to translate.
*Example:*
>>> asciify('èéùúòóäåëýñÅÀÁÇÌÍÑÓË') # returns 'eeuuooaaeynAAACIINOE'
:param input_string: String to convert
:return: Ascii utf-8 string
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
# "NFKD" is the algorithm which is able to successfully translate the most of non-ascii chars
normalized = unicodedata.normalize('NFKD', input_string)
# encode string forcing ascii and ignore any errors (unrepresentable chars will be stripped out)
ascii_bytes = normalized.encode('ascii', 'ignore')
# turns encoded bytes into an utf-8 string
ascii_string = ascii_bytes.decode('utf-8')
return ascii_string
def slugify(input_string: str, separator: str = '-') -> str:
"""
Converts a string into a "slug" using provided separator.
The returned string has the following properties:
- it has no spaces
- all letters are in lower case
- all punctuation signs and non alphanumeric chars are removed
- words are divided using provided separator
- all chars are encoded as ascii (by using `asciify()`)
- is safe for URL
*Examples:*
>>> slugify('Top 10 Reasons To Love Dogs!!!') # returns: 'top-10-reasons-to-love-dogs'
>>> slugify('Mönstér Mägnët') # returns 'monster-magnet'
:param input_string: String to convert.
:type input_string: str
:param separator: Sign used to join string tokens (default to "-").
:type separator: str
:return: Slug string
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
# replace any character that is NOT letter or number with spaces
out = NO_LETTERS_OR_NUMBERS_RE.sub(' ', input_string.lower()).strip()
# replace spaces with join sign
out = SPACES_RE.sub(separator, out)
# normalize joins (remove duplicates)
out = re.sub(re.escape(separator) + r'+', separator, out)
return asciify(out)
def booleanize(input_string: str) -> bool:
"""
Turns a string into a boolean based on its content (CASE INSENSITIVE).
A positive boolean (True) is returned if the string value is one of the following:
- "true"
- "1"
- "yes"
- "y"
Otherwise False is returned.
*Examples:*
>>> booleanize('true') # returns True
>>> booleanize('YES') # returns True
>>> booleanize('nope') # returns False
:param input_string: String to convert
:type input_string: str
:return: True if the string contains a boolean-like positive value, false otherwise
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
return input_string.lower() in ('true', '1', 'yes', 'y')
def strip_margin(input_string: str) -> str:
"""
Removes tab indentation from multi line strings (inspired by analogous Scala function).
*Example:*
>>> strip_margin('''
>>> line 1
>>> line 2
>>> line 3
>>> ''')
>>> # returns:
>>> '''
>>> line 1
>>> line 2
>>> line 3
>>> '''
:param input_string: String to format
:type input_string: str
:return: A string without left margins
"""
if not is_string(input_string):
raise InvalidInputError(input_string)
line_separator = '\n'
lines = [MARGIN_RE.sub('', line) for line in input_string.split(line_separator)]
out = line_separator.join(lines)
return out
def compress(input_string: str, encoding: str = 'utf-8', compression_level: int = 9) -> str:
"""
Compress the given string by returning a shorter one that can be safely used in any context (like URL) and
restored back to its original state using `decompress()`.
**Bear in mind:**
Besides the provided `compression_level`, the compression result (how much the string is actually compressed
by resulting into a shorter string) depends on 2 factors:
1. The amount of data (string size): short strings might not provide a significant compression result\
or even be longer than the given input string (this is due to the fact that some bytes have to be embedded\
into the compressed string in order to be able to restore it later on)\
2. The content type: random sequences of chars are very unlikely to be successfully compressed, while the best\
compression result is obtained when the string contains several recurring char sequences (like in the example).
Behind the scenes this method makes use of the standard Python's zlib and base64 libraries.
*Examples:*
>>> n = 0 # <- ignore this, it's a fix for Pycharm (not fixable using ignore comments)
>>> # "original" will be a string with 169 chars:
>>> original = ' '.join(['word n{}'.format(n) for n in range(20)])
>>> # "compressed" will be a string of 88 chars
>>> compressed = compress(original)
:param input_string: String to compress (must be not empty or a ValueError will be raised).
:type input_string: str
:param encoding: String encoding (default to "utf-8").
:type encoding: str
:param compression_level: A value between 0 (no compression) and 9 (best compression), default to 9.
:type compression_level: int
:return: Compressed string.
"""
return __StringCompressor.compress(input_string, encoding, compression_level)
def decompress(input_string: str, encoding: str = 'utf-8') -> str:
"""
Restore a previously compressed string (obtained using `compress()`) back to its original state.
:param input_string: String to restore.
:type input_string: str
:param encoding: Original string encoding.
:type encoding: str
:return: Decompressed string.
"""
return __StringCompressor.decompress(input_string, encoding)
def roman_encode(input_number: Union[str, int]) -> str:
"""
Convert the given number/string into a roman number.
The passed input must represents a positive integer in the range 1-3999 (inclusive).
Why this limit? You may be wondering:
1. zero is forbidden since there is no related representation in roman numbers
2. the upper bound 3999 is due to the limitation in the ascii charset\
(the higher quantity sign displayable in ascii is "M" which is equal to 1000, therefore based on\
roman numbers rules we can use 3 times M to reach 3000 but we can't go any further in thousands without\
special "boxed chars").
*Examples:*
>>> roman_encode(37) # returns 'XXXVIII'
>>> roman_encode('2020') # returns 'MMXX'
:param input_number: An integer or a string to be converted.
:type input_number: Union[str, int]
:return: Roman number string.
"""
return __RomanNumbers.encode(input_number)
def roman_decode(input_string: str) -> int:
"""
Decode a roman number string into an integer if the provided string is valid.
*Example:*
>>> roman_decode('VII') # returns 7
:param input_string: (Assumed) Roman number
:type input_string: str
:return: Integer value
"""
return __RomanNumbers.decode(input_string)