# Character representation schemes
- a representation is a mapping between integers and characters

# Limitations of ASCII
- ASCII uses only 7 of the 8 bits in a byte; the unused bit left some room for extensions, but one byte is not enough to represent all the characters in the world



# Unicode
- "universal character set"
- has room for over a million different characters (code points)
- every language on earth
- each character represented by a unique integer
- [code charts](http://www.unicode.org/charts/)

# Python 'str' type 
- stores Unicode characters, not ASCII

# encodings
- 'encoding' is converting a unicode string into a byte sequence or stream (in some encoding)
- 'decoding' is converting a byte stream (in some encoding) into a unicode string
- there are several different encoding/decoding schemes
- Java uses UTF-16
- W3C recommends web pages use UTF-8
- the UTF-8 encoding has the special property that if the unicode string is just ascii characters, the UTF-8 encoding
is the same as the ascii encoding
- when you WRITE a unicode string from python (saving a file, writing to the network), you must ENCODE it into a sequence of bytes
- when you read a unicode string INTO python, you must DECODE it from a sequence of bytes


In [1]:
# 'Python' spelled in characters from different
# unicode character sets. len is 6,
# which is the number of characters,
# not the number of bytes it takes to represent them
# \uabcd is 16 bits written in hex (4 hex digits)
# \Uabcdefgh is 32 bits written in hex (8 hex digits)

uni = '\U00002119\u01b4\u2602\u210c\xf8\u1f24'
[type(uni), uni, len(uni)]

[str, 'ℙƴ☂ℌøἤ', 6]

In [2]:
# python knows how to render lots of characters!

''.join([chr(j) for j in range(17100, 18999)])

'䋌䋍䋎䋏䋐䋑䋒䋓䋔䋕䋖䋗䋘䋙䋚䋛䋜䋝䋞䋟䋠䋡䋢䋣䋤䋥䋦䋧䋨䋩䋪䋫䋬䋭䋮䋯䋰䋱䋲䋳䋴䋵䋶䋷䋸䋹䋺䋻䋼䋽䋾䋿䌀䌁䌂䌃䌄䌅䌆䌇䌈䌉䌊䌋䌌䌍䌎䌏䌐䌑䌒䌓䌔䌕䌖䌗䌘䌙䌚䌛䌜䌝䌞䌟䌠䌡䌢䌣䌤䌥䌦䌧䌨䌩䌪䌫䌬䌭䌮䌯䌰䌱䌲䌳䌴䌵䌶䌷䌸䌹䌺䌻䌼䌽䌾䌿䍀䍁䍂䍃䍄䍅䍆䍇䍈䍉䍊䍋䍌䍍䍎䍏䍐䍑䍒䍓䍔䍕䍖䍗䍘䍙䍚䍛䍜䍝䍞䍟䍠䍡䍢䍣䍤䍥䍦䍧䍨䍩䍪䍫䍬䍭䍮䍯䍰䍱䍲䍳䍴䍵䍶䍷䍸䍹䍺䍻䍼䍽䍾䍿䎀䎁䎂䎃䎄䎅䎆䎇䎈䎉䎊䎋䎌䎍䎎䎏䎐䎑䎒䎓䎔䎕䎖䎗䎘䎙䎚䎛䎜䎝䎞䎟䎠䎡䎢䎣䎤䎥䎦䎧䎨䎩䎪䎫䎬䎭䎮䎯䎰䎱䎲䎳䎴䎵䎶䎷䎸䎹䎺䎻䎼䎽䎾䎿䏀䏁䏂䏃䏄䏅䏆䏇䏈䏉䏊䏋䏌䏍䏎䏏䏐䏑䏒䏓䏔䏕䏖䏗䏘䏙䏚䏛䏜䏝䏞䏟䏠䏡䏢䏣䏤䏥䏦䏧䏨䏩䏪䏫䏬䏭䏮䏯䏰䏱䏲䏳䏴䏵䏶䏷䏸䏹䏺䏻䏼䏽䏾䏿䐀䐁䐂䐃䐄䐅䐆䐇䐈䐉䐊䐋䐌䐍䐎䐏䐐䐑䐒䐓䐔䐕䐖䐗䐘䐙䐚䐛䐜䐝䐞䐟䐠䐡䐢䐣䐤䐥䐦䐧䐨䐩䐪䐫䐬䐭䐮䐯䐰䐱䐲䐳䐴䐵䐶䐷䐸䐹䐺䐻䐼䐽䐾䐿䑀䑁䑂䑃䑄䑅䑆䑇䑈䑉䑊䑋䑌䑍䑎䑏䑐䑑䑒䑓䑔䑕䑖䑗䑘䑙䑚䑛䑜䑝䑞䑟䑠䑡䑢䑣䑤䑥䑦䑧䑨䑩䑪䑫䑬䑭䑮䑯䑰䑱䑲䑳䑴䑵䑶䑷䑸䑹䑺䑻䑼䑽䑾䑿䒀䒁䒂䒃䒄䒅䒆䒇䒈䒉䒊䒋䒌䒍䒎䒏䒐䒑䒒䒓䒔䒕䒖䒗䒘䒙䒚䒛䒜䒝䒞䒟䒠䒡䒢䒣䒤䒥䒦䒧䒨䒩䒪䒫䒬䒭䒮䒯䒰䒱䒲䒳䒴䒵䒶䒷䒸䒹䒺䒻䒼䒽䒾䒿䓀䓁䓂䓃䓄䓅䓆䓇䓈䓉䓊䓋䓌䓍䓎䓏䓐䓑䓒䓓䓔䓕䓖䓗䓘䓙䓚䓛䓜䓝䓞䓟䓠䓡䓢䓣䓤䓥䓦䓧䓨䓩䓪䓫䓬䓭䓮䓯䓰䓱䓲䓳䓴䓵䓶䓷䓸䓹䓺䓻䓼䓽䓾䓿䔀䔁䔂䔃䔄䔅䔆䔇䔈䔉䔊䔋䔌䔍䔎䔏䔐䔑䔒䔓䔔䔕䔖䔗䔘䔙䔚䔛䔜䔝䔞䔟䔠䔡䔢䔣䔤䔥䔦䔧䔨䔩䔪䔫䔬䔭䔮䔯䔰䔱䔲䔳䔴䔵䔶䔷䔸䔹䔺䔻䔼䔽䔾䔿䕀䕁䕂䕃䕄䕅䕆䕇䕈䕉䕊䕋䕌䕍䕎䕏䕐䕑䕒䕓䕔䕕䕖䕗䕘䕙䕚䕛䕜䕝䕞䕟䕠䕡䕢䕣䕤䕥䕦䕧䕨䕩䕪䕫䕬䕭䕮䕯䕰䕱䕲䕳䕴䕵䕶䕷䕸䕹䕺䕻䕼䕽䕾䕿䖀䖁䖂䖃䖄䖅䖆䖇䖈䖉䖊䖋䖌䖍䖎䖏䖐䖑䖒䖓䖔䖕䖖䖗䖘䖙䖚䖛䖜䖝䖞䖟䖠䖡䖢䖣䖤䖥䖦䖧䖨䖩䖪䖫䖬䖭䖮䖯䖰䖱䖲䖳䖴䖵䖶䖷䖸䖹䖺䖻䖼䖽䖾䖿䗀䗁䗂䗃䗄䗅䗆䗇䗈䗉䗊䗋䗌䗍䗎䗏䗐䗑䗒䗓䗔䗕䗖䗗䗘䗙䗚䗛䗜䗝䗞䗟䗠䗡䗢䗣䗤䗥䗦䗧䗨䗩䗪䗫䗬䗭䗮䗯䗰䗱䗲䗳䗴䗵䗶䗷䗸䗹䗺䗻䗼䗽䗾䗿䘀䘁䘂䘃䘄䘅䘆䘇䘈䘉䘊䘋䘌䘍䘎䘏䘐䘑䘒䘓䘔䘕䘖䘗䘘䘙䘚䘛䘜䘝䘞䘟䘠䘡䘢䘣䘤䘥䘦䘧䘨䘩䘪䘫䘬䘭䘮䘯䘰䘱䘲䘳䘴䘵䘶䘷䘸䘹䘺䘻䘼䘽䘾䘿䙀䙁䙂䙃䙄䙅䙆䙇䙈䙉䙊䙋䙌䙍䙎䙏䙐䙑䙒䙓䙔䙕䙖䙗䙘䙙䙚䙛䙜䙝䙞䙟䙠䙡䙢䙣䙤䙥䙦䙧䙨䙩䙪䙫䙬䙭䙮䙯䙰䙱䙲䙳䙴䙵䙶䙷䙸䙹䙺䙻䙼䙽䙾䙿䚀䚁䚂䚃䚄䚅䚆䚇䚈䚉䚊䚋䚌䚍䚎䚏䚐䚑䚒䚓䚔䚕䚖䚗䚘䚙䚚䚛䚜䚝䚞䚟䚠䚡䚢䚣䚤䚥䚦䚧䚨䚩䚪䚫䚬䚭䚮䚯䚰䚱䚲

# 'ord' maps a char into its unicode integer
# 'chr' maps a unicode integer into a char

In [3]:
# 3rd char is from 'dingbats'

[ ord('A'), chr(65), chr(0x2702)]

[65, 'A', '✂']

In [4]:
uni

'ℙƴ☂ℌøἤ'

In [8]:
# three different encodings of unicode 

utf8, utf16, utf32 = [uni.encode(et) \
                      for et in \
                      ['utf-8', 'utf-16', 'utf-32']]

In [9]:
# length of unicode encoding varies 
# with different encodings

[[len(u), type(u)] for u in [utf8, utf16, utf32]]

[[16, bytes], [14, bytes], [28, bytes]]

In [10]:
# utf8, utf16, utf32 are type 'bytes', not str. 
# note b' prefix

[type(uni), type(utf8), utf8, utf16, utf32]

[str,
 bytes,
 b'\xe2\x84\x99\xc6\xb4\xe2\x98\x82\xe2\x84\x8c\xc3\xb8\xe1\xbc\xa4',
 b'\xff\xfe\x19!\xb4\x01\x02&\x0c!\xf8\x00$\x1f',
 b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00']

In [11]:
# decode converts bytes into unicode string

utf32.decode('utf-32')

'ℙƴ☂ℌøἤ'

In [12]:
utf8.decode('utf-8')

'ℙƴ☂ℌøἤ'

In [13]:
# to decode, must know the encoding type(key)
# selecting the wrong decoder doesn't 
# always generate an error
# sometimes you will just get a bogus string

utf32.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

# ascii vs unicode
- ascii is easy, because storage media and networks handle bytes, and ascii is just bytes
- no byte order issues(big/little endian)
- unicode is harder, because
    - writing to the network or storage from Python, the unicode string must be ENCODED into a byte stream, in some format like utf-8, utf-16, etc
    - reading from the network or storage into Python, the byte stream must be DECODED into a unicode string. the encoding that was used must somehow be provided
- given that Python's 'str' type is unicode, you are always
    - encoding as strings leave your program
    - decoding as strings enter your program
- if all you are using are ascii characters, then everything just works, without any special effort
- [standard text encoders](https://docs.python.org/3/library/codecs.html#standard-encodings)