1
- """ Test script for the unicodedata module.
1
+ """ Tests for the unicodedata module.
2
2
3
3
Written by Marc-Andre Lemburg (mal@lemburg.com).
4
4
5
5
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
6
7
7
"""
8
8
9
+ import hashlib
10
+ from http .client import HTTPException
9
11
import sys
12
+ import unicodedata
10
13
import unittest
11
- import hashlib
12
- from test .support import script_helper
13
-
14
- encoding = 'utf-8'
15
- errors = 'surrogatepass'
14
+ from test .support import (open_urlresource , requires_resource , script_helper ,
15
+ cpython_only , check_disallow_instantiation ,
16
+ ResourceDenied )
16
17
17
18
18
- ### Run tests
19
-
20
19
class UnicodeMethodsTest (unittest .TestCase ):
21
20
22
21
# update this, if the database changes
23
- expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1 '
22
+ expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326 '
24
23
25
- # TODO: RUSTPYTHON
26
- @unittest .expectedFailure
24
+ @requires_resource ('cpu' )
27
25
def test_method_checksum (self ):
28
26
h = hashlib .sha1 ()
29
- for i in range (0x10000 ):
27
+ for i in range (sys . maxunicode + 1 ):
30
28
char = chr (i )
31
29
data = [
32
30
# Predicates (single char)
@@ -63,33 +61,26 @@ def test_method_checksum(self):
63
61
(char + 'ABC' ).title (),
64
62
65
63
]
66
- h .update ('' .join (data ).encode (encoding , errors ))
64
+ h .update ('' .join (data ).encode ('utf-8' , 'surrogatepass' ))
67
65
result = h .hexdigest ()
68
66
self .assertEqual (result , self .expectedchecksum )
69
67
70
68
class UnicodeDatabaseTest(unittest.TestCase):
    # Shared base class for the database tests: subclasses reach the
    # unicodedata module through ``self.db``.  It is bound once as a class
    # attribute, so no per-test setUp()/tearDown() is needed.
    db = unicodedata
80
70
81
71
class UnicodeFunctionsTest (UnicodeDatabaseTest ):
82
72
83
73
# Update this if the database changes. Make sure to do a full rebuild
84
74
# (e.g. 'make distclean && make') to get the correct checksum.
85
- expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652 '
75
+ expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370 '
86
76
# TODO: RUSTPYTHON
87
77
@unittest .expectedFailure
78
+ @requires_resource ('cpu' )
88
79
def test_function_checksum (self ):
89
80
data = []
90
81
h = hashlib .sha1 ()
91
82
92
- for i in range (0x10000 ):
83
+ for i in range (sys . maxunicode + 1 ):
93
84
char = chr (i )
94
85
data = [
95
86
# Properties
@@ -106,6 +97,13 @@ def test_function_checksum(self):
106
97
result = h .hexdigest ()
107
98
self .assertEqual (result , self .expectedchecksum )
108
99
100
@requires_resource('cpu')
def test_name_inverse_lookup(self):
    # Round-trip check: every code point that has a name must be returned
    # again when that name is passed to lookup().
    name_of = self.db.name
    char_for = self.db.lookup
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        looked_name = name_of(char, None)
        if not looked_name:
            # name() fell back to the None default: nothing to look up.
            continue
        self.assertEqual(char_for(looked_name), char)
106
+
109
107
# TODO: RUSTPYTHON
110
108
@unittest .expectedFailure
111
109
def test_digit (self ):
@@ -201,15 +199,8 @@ def test_combining(self):
201
199
self .assertRaises (TypeError , self .db .combining )
202
200
self .assertRaises (TypeError , self .db .combining , 'xx' )
203
201
204
- def test_normalize (self ):
205
- self .assertRaises (TypeError , self .db .normalize )
206
- self .assertRaises (ValueError , self .db .normalize , 'unknown' , 'xx' )
207
- self .assertEqual (self .db .normalize ('NFKC' , '' ), '' )
208
- # The rest can be found in test_normalization.py
209
- # which requires an external file.
210
-
211
202
def test_pr29 (self ):
212
- # http ://www.unicode.org/review/pr-29.html
203
+ # https ://www.unicode.org/review/pr-29.html
213
204
# See issues #1054943 and #10254.
214
205
composed = ("\u0b47 \u0300 \u0b3e " , "\u1100 \u0300 \u1161 " ,
215
206
'Li\u030d t-s\u1e73 \u0301 ' ,
@@ -240,9 +231,6 @@ def test_issue29456(self):
240
231
self .assertEqual (self .db .normalize ('NFC' , u11a7_str_a ), u11a7_str_b )
241
232
self .assertEqual (self .db .normalize ('NFC' , u11c3_str_a ), u11c3_str_b )
242
233
243
- # For tests of unicodedata.is_normalized / self.db.is_normalized ,
244
- # see test_normalization.py .
245
-
246
234
def test_east_asian_width (self ):
247
235
eaw = self .db .east_asian_width
248
236
self .assertRaises (TypeError , eaw , b'a' )
@@ -265,6 +253,11 @@ def test_east_asian_width_9_0_changes(self):
265
253
266
254
class UnicodeMiscTest (UnicodeDatabaseTest ):
267
255
256
@cpython_only
def test_disallow_instantiation(self):
    # bpo-43916: the UCD type exposed by the module must not be
    # instantiable by calling it directly.
    ucd_type = unicodedata.UCD
    check_disallow_instantiation(self, ucd_type)
260
+
268
261
# TODO: RUSTPYTHON
269
262
@unittest .expectedFailure
270
263
def test_failed_import_during_compiling (self ):
@@ -363,5 +356,103 @@ def test_linebreak_7643(self):
363
356
self .assertEqual (len (lines ), 1 ,
364
357
r"\u%.4x should not be a linebreak" % i )
365
358
359
class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize() and unicodedata.is_normalized().

    The data-driven part reads NormalizationTest.txt, downloaded from
    pythontest.net for the database version given by
    ``unicodedata.unidata_version``.
    """

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* mentions our UCD version.

        Consumes one line from *testfile*.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Turn space-separated hex code points (e.g. "0044 0307") into a str."""
        return "".join(chr(int(code, 16)) for code in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early: skipTest() raises, so ``testdata`` is
        # always bound by the time the ``with`` statement below runs.
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every record in *testdata* against all four normal forms.

        *testdata* is an iterable of text lines.  Anything after a ``#`` is
        ignored, ``@PartN`` lines select the current part, and every other
        non-blank line carries five ``;``-terminated columns (c1..c5) of
        space-separated hex code points.

        After the records, every code point that did not appear as c1 in
        part 1 is asserted to be unchanged by all four normalization forms.
        """
        normalize = unicodedata.normalize
        is_normalized = unicodedata.is_normalized
        part = None
        part1_sources = set()

        def NFC(text):
            return normalize("NFC", text)

        def NFKC(text):
            return normalize("NFKC", text)

        def NFD(text):
            return normalize("NFD", text)

        def NFKD(text):
            return normalize("NFKD", text)

        for line in testdata:
            # Drop the trailing comment, then surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # Each record ends with ';', so the last split item is dropped.
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # The data line is passed as the message so that a failure
            # identifies the offending record.
            self.assertTrue(is_normalized("NFC", c2), line)
            self.assertTrue(is_normalized("NFC", c4), line)

            self.assertTrue(is_normalized("NFD", c3), line)
            self.assertTrue(is_normalized("NFD", c5), line)

            self.assertTrue(is_normalized("NFKC", c4), line)
            self.assertTrue(is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_sources.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_sources:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
455
+
456
+
366
457
if __name__ == "__main__" :
367
458
unittest .main ()
0 commit comments