In [ ]:
#Text String: Unicode

In [1]:
def unicode_test(value):
    import unicodedata
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    print('value="%s", name="%s", value2="%s"' % (value, name, value2))

In [2]:
unicode_test('A')

value="A", name="LATIN CAPITAL LETTER A", value2="A"


In [3]:
unicode_test('$')

value="$", name="DOLLAR SIGN", value2="$"


In [4]:
unicode_test('\u00a2')

value="¢", name="CENT SIGN", value2="¢"


In [5]:
unicode_test('\u20ac')

value="€", name="EURO SIGN", value2="€"


In [6]:
unicode_test('\u2603')

value="☃", name="SNOWMAN", value2="☃"


In [7]:
unicode_test('\u00e9')

value="é", name="LATIN SMALL LETTER E WITH ACUTE", value2="é"


In [8]:
unicode_test('\u1f32')

value="ἲ", name="GREEK SMALL LETTER IOTA WITH PSILI AND VARIA", value2="ἲ"


value="ἲ", name="GREEK SMALL LETTER IOTA WITH PSILI AND VARIA", value2="ἲ"


In [10]:
place = 'café'
place

'café'

In [11]:
import unicodedata
unicodedata.lookup('E WITH ACUTE, LATIN CAPITAL LETTER')

KeyError: "undefined character name 'E WITH ACUTE, LATIN CAPITAL LETTER'"

In [12]:
import unicodedata
unicodedata.lookup('LATIN CAPITAL LETTER E WITH ACUTE')

'É'

In [13]:
place = 'caf\u00e9'
place

'café'

In [14]:
place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'
place

'café'

In [15]:
u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'
u_umlaut

'ü'

In [16]:
drink = 'Gew' + u_umlaut + 'rztraminer'
print('Now I can finally have my', drink, 'in a', place)

Now I can finally have my Gewürztraminer in a café


In [17]:
len('$')

1

In [18]:
len('\U0001f47b')

1

In [19]:
chr(233)

'é'

In [20]:
chr(0xe9)

'é'

In [21]:
chr(0x1fc6)

'ῆ'

In [22]:
snowman = '\u2603'

In [23]:
snowman

'☃'

In [24]:
len(snowman)

1

In [25]:
len('\u2603')

1

In [26]:
ds = snowman.encode('utf-8')

In [27]:
ds

b'\xe2\x98\x83'

In [28]:
len(ds)

3

In [29]:
ds = snowman.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character '\u2603' in position 0: ordinal not in range(128)

In [30]:
snowman.encode('ascii', 'ignore')

b''

In [31]:
snowman.encode('ascii','replace')

b'?'

In [32]:
snowman.encode('ascii', 'backslashreplace')

b'\\u2603'

In [33]:
snowman.encode('ascii','xmlcharrefreplace')

b'&#9731;'

In [34]:
place = 'caf\u00e9'
place

'café'

In [35]:
type(place)

str

In [36]:
place_bytes = place.encode('utf-8')
place_bytes

b'caf\xc3\xa9'

In [37]:
type(place_bytes)

bytes

In [38]:
place2 = place_bytes.decode('utf-8')
place2

'café'

In [39]:
place3 = place_bytes.decode('ascii')

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

In [40]:
place4 = place_bytes.decode('latin-1')
place4

'cafÃ©'

In [43]:
place5 = place_bytes.decode('windows-1252')
place5

'cafÃ©'

In [44]:
import html
html.unescape('&egrave;')

'è'

In [45]:
import html
html.unescape('&#233;')

'é'

In [46]:
import html
html.unescape('&#xe9;')

'é'

In [47]:
from html.entities import html5
html5["egrave"]

'è'

In [48]:
html5["egrave;"]

'è'

In [49]:
import html
char = '\u00e9'
dec_value = ord(char)
html.entities.codepoint2name[dec_value]

'eacute'

In [50]:
place = 'caf\u00e9'
byte_value = place.encode('ascii', 'xmlcharrefreplace')
byte_value

b'caf&#233;'

In [51]:
byte_value.decode()

'caf&#233;'

In [52]:
eacute1 = 'é'
eacute2 = '\u00e9'
eacute3 = \
    '\N{LATIN SMALL LETTER E WITH ACUTE}'
eacute4 = chr(233)
eacute5 = chr(0xe9)
eacute1, eacute2, eacute3, eacute4, eacute5

('é', 'é', 'é', 'é', 'é')

In [53]:
eacute1 == eacute2 == eacute3 == eacute4 == eacute5

True

In [54]:
import unicodedata
unicodedata.name(eacute1)

'LATIN SMALL LETTER E WITH ACUTE'

In [55]:
ord(eacute1)

233

In [56]:
0xe9

233

In [57]:
eacute_combine1 = "e\u0301"
eacute_combine2 = "e\N{COMBINING ACUTE ACCENT}"
eacute_combine3 = "e" + "\u0301"
eacute_combine1, eacute_combine2, eacute_combine3

('é', 'é', 'é')

In [58]:
eacute_combine1 == eacute_combine2 == eacute_combine3

True

In [59]:
len(eacute_combine1)

2

In [60]:
eacute1 == eacute_combine1

False

In [61]:
len(eacute1)

1

In [62]:
eacute1

'é'

In [63]:
eacute_combine1

'é'

In [64]:
import unicodedata
eacute_normalized = unicodedata.normalize('NFC', eacute_combine1) # NFC means normal form composed
len(eacute_normalized)

1

In [65]:
eacute_normalized == eacute1

True

In [66]:
unicodedata.name(eacute_normalized)

'LATIN SMALL LETTER E WITH ACUTE'

In [ ]:
# Text String: Regular Expressions

In [67]:
import re
result = re.match('You', 'Young Frankenstein')

In [68]:
result

<re.Match object; span=(0, 3), match='You'>

In [69]:
import re
youpattern = re.compile('You')
result = youpattern.match('Young Frankenstein')

In [70]:
result

<re.Match object; span=(0, 3), match='You'>

In [72]:
import re
source = 'Young Frankenstein'
m = re.match('You', source)
if m:
    print(m.group())

You


In [73]:
m = re.match('^You', source)
if m:
    print(m.group())

You


In [75]:
import re
source = 'Young Frankenstein'
m = re.match('Frank', source)
if m:
    print(m.group())

In [78]:
import re
source = 'Young Frankenstein'
if m := re.match('Frank', source):
    print(m.group())

In [79]:
import re
source = 'Young Frankenstein'
if m:= re.search('Frank',source):
    print(m.group())

Frank


In [80]:
import re
source = 'Young Frankenstein'
if m:= re.match('.*Frank',source): # * means 0 or more characters
    print(m.group())

Young Frank


In [81]:
import re
source = 'Young Frankenstein'
if m := re.search('Frank', source):
    print(m.group())

Frank


In [82]:
import re
source = 'Young Frankenstein'
m = re.findall('n', source)
m

['n', 'n', 'n', 'n']

In [83]:
print('Found', len(m), 'matches')

Found 4 matches


In [84]:
import re
source = 'Young Frankenstein'
m = re.findall('n.', source) # . means a single-letter string
m

['ng', 'nk', 'ns']

In [85]:
import re
source = 'Young Frankenstein'
m = re.findall('n.?', source) # ? means optional
m

['ng', 'nk', 'ns', 'n']

In [86]:
import re
source = 'Young Frankenstein'
m = re.split('n', source)
m

['You', 'g Fra', 'ke', 'stei', '']

In [87]:
import re
source = 'Young Frankenstein'
m = re.sub('n', '?', source)
m

'You?g Fra?ke?stei?'

In [88]:
import string
printable = string.printable
len(printable)

100

In [89]:
printable[:50]

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN'

In [90]:
printable[50:]

'OPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [91]:
re.findall('\d', printable)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [92]:
re.findall('\w', printable)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '_']

In [93]:
re.findall('\s', printable)

[' ', '\t', '\n', '\r', '\x0b', '\x0c']

In [94]:
x = 'abc' + '-/*' + '\u00ea' + '\u0115'
re.findall('\w', x)

['a', 'b', 'c', 'ê', 'ĕ']

In [95]:
source = '''I wish I may, I wish I might
    Have a dish of fish tonight.'''
re.findall('wish', source)

['wish', 'wish']

In [96]:
re.findall('wish|fish', source)

['wish', 'wish', 'fish']

In [97]:
re.findall('^wish', source)

[]

In [98]:
re.findall('^I wish', source)

['I wish']

In [99]:
re.findall('fish$', source)

[]

In [100]:
re.findall('fish tonight.$', source)

['fish tonight.']

In [101]:
re.findall('fish tonight\.$', source)

['fish tonight.']

In [102]:
re.findall('[wf]ish', source)

['wish', 'wish', 'fish']

In [103]:
re.findall('[wsh]+',source)

['w', 'sh', 'w', 'sh', 'h', 'sh', 'sh', 'h']

In [104]:
re.findall('ght\W',source)

['ght\n', 'ght.']

In [105]:
source

'I wish I may, I wish I might\n    Have a dish of fish tonight.'

In [107]:
re.findall('I (?=wish)', source) # find I followed by wish

['I ', 'I ']

In [108]:
re.findall('(?<=I) wish', source) # find wish preceded by I

[' wish', ' wish']

In [109]:
re.findall('\bfish',source)

[]

In [110]:
re.findall(r'\bfish',source)

['fish']

In [111]:
m = re.search(r'(. dish\b).*(\bfish)', source)

In [112]:
m.group()

'a dish of fish'

In [113]:
source

'I wish I may, I wish I might\n    Have a dish of fish tonight.'

In [114]:
m.groups()

('a dish', 'fish')

In [115]:
m = re.search(r'(?P<DISH>. dish\b).*(?P<FISH>\bfish)', source)
m.group()

'a dish of fish'

In [116]:
m.groups()

('a dish', 'fish')

In [117]:
m.group('DISH')

'a dish'

In [118]:
m.group('FISH')

'fish'

In [119]:
# Binary Data
blist = [1,2,3,255]
the_bytes = bytes(blist)
the_bytes

b'\x01\x02\x03\xff'

In [120]:
the_byte_array= bytearray(blist)
the_byte_array

bytearray(b'\x01\x02\x03\xff')

In [121]:
the_bytes == the_byte_array

True

In [122]:
b'\x61'

b'a'

In [123]:
b'\x01abc\xff'

b'\x01abc\xff'

In [124]:
blist = [1, 2, 3, 255]
the_bytes = bytes(blist)
the_bytes[1] = 124

TypeError: 'bytes' object does not support item assignment

In [125]:
the_bytes[0]

1

In [126]:
the_bytes[-1]

255

In [127]:
the_bytes[0:2]

b'\x01\x02'

In [128]:
the_bytes[0:3]

b'\x01\x02\x03'

In [129]:
blist= [1, 2, 3, 255]
the_byte_array = bytearray(blist)
the_byte_array

bytearray(b'\x01\x02\x03\xff')

In [130]:
the_byte_array[1]=127

In [131]:
the_byte_array

bytearray(b'\x01\x7f\x03\xff')

In [132]:
the_byte_array[0] = 300

ValueError: byte must be in range(0, 256)

In [133]:
the_bytes = bytes(range(0,256))

In [134]:
the_byte_array = bytearray(range(0,256))

In [136]:
the_bytes

b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'

In [139]:
import struct
valid_png_header = b'\x89PNG\r\n\x1a\n'
data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR' + \
       b'\x00\x00\x00\x9a\x00\x00\x00\x8d\x08\x02\x00\x00\x00\xc0'
if data[:8] == valid_png_header:
    width,height = struct.unpack('>LL', data[16:24])
    print('Valid PNG, width', width, 'height', height)
else:
    print('Not a valid PNG.')

Valid PNG, width 154 height 141


In [140]:
data[16:20]

b'\x00\x00\x00\x9a'

In [143]:
# data[20:24]0x9a #SyntaxError: invalid syntax

SyntaxError: invalid syntax (600132786.py, line 1)

In [144]:
0x9a

154

In [145]:
0x8d

141

In [146]:
import struct
struct.pack('>L', 154)

b'\x00\x00\x00\x9a'

In [147]:
struct.pack('>L', 141)

b'\x00\x00\x00\x8d'

In [148]:
struct.unpack('>2L', data[16:24])

(154, 141)

In [149]:
struct.unpack('>16x2L6x',data)

(154, 141)

In [150]:
# ImportError: cannot import name 'Magic' from 'construct'
from construct import Struct, Magic, UBInt32,Const, String
fmt = Struct('png',
    Magic(b'\x89PNG\r\n\x1a\n'),
    UBInt32('length'),
    Const(String('type',4),b'IHDR'),
    UBInt32('width'),
    UBInt32('height')
)
data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR' + \
       b'\x00\x00\x00\x9a\x00\x00\x00\x8d\x08\x02\x00\x00\x00\xc0'
result = fmt.parse(data)
print(result)

ImportError: cannot import name 'Magic' from 'construct' (/Users/bob/PycharmProjects/introduce-python/.venv/lib/python3.9/site-packages/construct/__init__.py)

In [151]:
print('Hello Jupter!')

Hello Jupter!


In [152]:
import binascii
valid_png_header = b'\x89PNG\r\n\x1a\n'
print(binascii.hexlify(valid_png_header))

b'89504e470d0a1a0a'


In [153]:
print(binascii.unhexlify(b'89504e470d0a1a0a'))

b'\x89PNG\r\n\x1a\n'


In [154]:
x = 5
y = 1
x & y

1

In [155]:
x | y

5

In [156]:
x ^ y

4

In [157]:
~x

-6

In [158]:
x << 1

10

In [159]:
x >> 1

2