# Chapter 4. Text versus Bytes

In [1]:
s = 'cafe'  # four plain ASCII characters (note: no accented 'é' in this string)
len(s)  # number of characters in the str

4

In [2]:
b = s.encode('utf8')  # str -> bytes using the UTF-8 codec
b  # ASCII-only text encodes to one byte per character

b'cafe'

In [3]:
len(b)  # number of bytes; equals len(s) here because every character is ASCII

4

In [4]:
b.decode('utf8')  # bytes -> str, reversing the encode step

'cafe'

In [10]:
cafe = bytes('café', encoding='utf_8')  # bytes built from a str plus an explicit encoding
cafe  # 'é' occupies two bytes in UTF-8: \xc3\xa9

b'caf\xc3\xa9'

In [11]:
cafe[0]  # indexing bytes yields an int (the value of that byte)

99

In [12]:
cafe[:1]  # slicing bytes yields bytes, even for a slice of length 1

b'c'

In [13]:
cafe_arr = bytearray(cafe)  # mutable sequence holding a copy of the same byte values
cafe_arr  # repr wraps a bytes literal: bytearray(b'...')

bytearray(b'caf\xc3\xa9')

In [14]:
cafe_arr[-1:]  # slicing a bytearray yields a bytearray

bytearray(b'\xa9')

In [15]:
bytes.fromhex('31 4B CE A9')  # parse pairs of hex digits; the spaces between pairs are skipped

b'1K\xce\xa9'

In [16]:
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])  # typecode 'h': signed short integers
octets = bytes(numbers)  # copy of the raw bytes that make up the array
octets  # 5 items x 2 bytes = 10 bytes; byte order is the machine's own (little-endian in the recorded output)

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [None]:
import struct
fmt = '<3s3sHH'  # little-endian: two 3-byte strings, then two unsigned 16-bit integers (10 bytes total)
with open('filter.gif', 'rb') as fp:  # requires filter.gif in the current working directory
    img = memoryview(fp.read())  # memoryview over the file contents held in memory
header = img[:10]  # slicing a memoryview gives another memoryview; no bytes are copied
bytes(header)  # materialize the 10 header bytes for display

In [None]:
struct.unpack(fmt, header)  # tuple of 4 fields per fmt; presumably GIF signature, version, width, height — confirm by running (no output recorded)

In [None]:
del header  # drop the reference to the memoryview slice
del img  # drop the reference to the memoryview over the file contents

# Encode and Decode

In [17]:
# Encode the same text with three codecs; each line shows the codec name,
# a tab, and the resulting bytes.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = 'El Niño'.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


## Error Handling

In [18]:
city = 'São Paulo'
city.encode('utf_8')  # 'ã' becomes two bytes in UTF-8: \xc3\xa3

b'S\xc3\xa3o Paulo'

In [19]:
city.encode('utf_16')  # result starts with the \xff\xfe byte-order mark, then two bytes per character

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [20]:
city.encode('iso8859_1')  # 'ã' fits in a single byte in this codec: \xe3

b'S\xe3o Paulo'

In [21]:
# cp437 cannot represent 'ã', so a strict encode raises UnicodeEncodeError.
# The error is the point of this cell: catch it and print "Type: message" so
# the demonstration is still displayed without halting a Restart & Run All.
try:
    city.encode('cp437')
except UnicodeEncodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [22]:
city.encode('cp437', errors='ignore')  # silently drops the unencodable 'ã'

b'So Paulo'

In [23]:
city.encode('cp437', errors='replace')  # substitutes '?' for the unencodable character

b'S?o Paulo'

In [24]:
city.encode('cp437', errors='xmlcharrefreplace')  # substitutes an XML numeric character reference (&#227;)

b'S&#227;o Paulo'

In [25]:
octets = b'Montr\xe9al'  # the single byte 0xe9 is interpreted differently by each codec
octets.decode('cp1252')  # 0xe9 -> 'é'

'Montréal'

In [26]:
octets.decode('iso8859_7')  # 0xe9 -> Greek 'ι': different text comes back, yet no error is raised

'Montrιal'

In [27]:
octets.decode('koi8_r')  # 0xe9 -> Cyrillic 'И': different text comes back, yet no error is raised

'MontrИal'

In [28]:
# 0xe9 followed by 'a' is not a valid UTF-8 sequence, so a strict decode raises
# UnicodeDecodeError. The error is the point of this cell: catch it and print
# "Type: message" so it is displayed without halting a Restart & Run All.
try:
    octets.decode('utf_8')
except UnicodeDecodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [29]:
octets.decode('utf-8', errors='replace')  # the undecodable byte becomes U+FFFD, the replacement character

'Montr�al'