- Normalizing Unicode Text to a Standard Representation

In [1]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

In [2]:
s1, s2 # Both look same but have diff unicode characters

('Spicy Jalapeño', 'Spicy Jalapeño')

In [5]:
print(s1 == s2) # Is s1 same as s2

False


In [7]:
len(s1), len(s2) # it can be seen that both strings have different encoding. hence, the need for normalization

(14, 15)

In [8]:
#Normlization is critical for the processing of test. To normalize this strings, the unicodedata module is needed
import unicodedata

In [13]:
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)
print(t1)
print(t2)
print(ascii(t1))# 'NFC' stands for fully composed unicode characters

True
Spicy Jalapeño
Spicy Jalapeño
'Spicy Jalape\xf1o'


In [14]:
t1 = unicodedata.normalize('NFD', s1)
t2 = unicodedata.normalize('NFD', s2)
print(t1 == t2)
print(t1)
print(t2)
print(ascii(t1)) # 'NFD' stands for fully decomposed unicode character

True
Spicy Jalapeño
Spicy Jalapeño
'Spicy Jalapen\u0303o'


In [15]:
t1 = unicodedata.normalize('NFD', s1)
print(t1)

Spicy Jalapeño


In [20]:
s = 'nut' ; s_unicode = s.encode('utf-8')

In [24]:
print(s)
print(ascii(s_unicode))

nut
b'nut'


In [26]:
s = '\ufb01' 
print(s)

ﬁ


In [29]:
unicodedata.normalize('NFKD', s) # More beautifully represented

'fi'

In [30]:
t1

'Spicy Jalapeño'

In [31]:
''.join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeno'

In [32]:
# Stripping unwanted charaters in a string
t = '-----hello====='

In [36]:
t.lstrip('-'), t.rstrip('=')

('hello=====', '-----hello')

In [37]:
# To remove both use the - and = in the string, we use the strip method and pass both characters
t.strip('-=')

'hello'

- It is important to not that strip() does not wor on spaces within the string

In [38]:
s = ' hello   world \n'
print(s.strip())

hello   world


- If a task is to be performed on the inner space, the replace() method can be used

In [6]:
text = ["ÉPÍU", "Naïve Café", "EL NIÑO"]

In [4]:
from unidecode import unidecode

In [8]:
for i in text:
    print(unidecode(i))

EPIU
Naive Cafe
EL NINO


In [9]:
s = 'pýtĥöñ\fis\tawesome\r\n'
print(s)

pýtĥöñis	awesome



- To sanitize the above string, first, we replace the whitespaces with a single space and empty string where necessary
- Secondly, we remove the diacritical marks on the some characters in the string

In [12]:
s

'pýtĥöñ\x0cis\tawesome\r\n'

In [19]:
# To sanitize the above string, we replace the whitespaces with a single space and empty string where necessary
map_object = {ord('\f'): ' ', ord('\t'): ' ', ord('\r'): '', ord('\n'): ''}

In [29]:
no_spaces_s = s.translate(map_object) # To replace the whitespaces
print(no_spaces_s)

pýtĥöñ is awesome


In [36]:
from unidecode import unidecode # unidecode used for transliterating unicode object into ascii string

In [37]:
cleaned_s = (unidecode(c) for c in no_spaces_s)
print(' '.join(cleaned_s))

p y t h o n   i s   a w e s o m e


In [38]:
x = '\u0661\u0662\u0663'

In [39]:
unidecode(x) # Easily translates unicode object

'123'

- Alignment of strings

In [42]:
text = 'Hello World'

In [43]:
text.ljust(20), text.rjust(20), text.center(20)

('Hello World         ', '         Hello World', '    Hello World     ')

- Using format() method for the step above

In [46]:
format(text, '<20'), format(text, '>20'), format(text, '^20')

('Hello World         ', '         Hello World', '    Hello World     ')

In [44]:
text.ljust(20, '+'), text.rjust(20, '+'), text.center(20, '+')

('Hello World+++++++++', '+++++++++Hello World', '++++Hello World+++++')

- Using format() method for the step above

In [47]:
format(text, '+<20'), format(text, '+>20'), format(text, '+^20')

('Hello World+++++++++', '+++++++++Hello World', '++++Hello World+++++')

- format() has a more general purpose because it not limited to formatting strings. It can also be used for format integers

In [48]:
def sample():
    yield 'Is'
    yield 'Chicago'
    yield 'Not'
    yield 'Chicago?'

In [51]:
" ".join(sample())

'Is Chicago Not Chicago?'

In [52]:
name = 'Guido'
n = 37

In [55]:
class safesub(dict):
    def __missing__(self, key):
        return '{' + key + '}' # For handling missing value

In [56]:
import sys

In [57]:
def sub(text):
    return text.format_map(safesub(sys._getframe(1).f_locals))

In [62]:
sys._getframe(1).f_locals

{'self': <ipykernel.zmqshell.ZMQInteractiveShell at 0x1ccb8838588>,
 'code_obj': <code object <module> at 0x000001CCBD25DE40, file "<ipython-input-62-7552f423cd10>", line 1>,
 'result': <ExecutionResult object at 1ccbc7e3e08, execution_count=62 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 1ccbc7e3d88, raw_cell="sys._getframe(1).f_locals" store_history=True silent=False shell_futures=True> result=None>,
 'async_': False,
 'old_excepthook': <bound method IPKernelApp.excepthook of <ipykernel.kernelapp.IPKernelApp object at 0x000001CCB6275408>>,
 'outflag': True}

In [63]:
print(sub('Hello {name}'))

Hello Guido


In [64]:
print(sub('You have {n} messages.'))

You have 37 messages.


In [65]:
print(sub('Your favorite color is {color}'))

Your favorite color is {color}


In [66]:
t = 'The prompt is &gt;&gt;&gt;'

In [71]:
import html

In [73]:
html.unescape(t)

'The prompt is >>>'

In [74]:
 s = 'Elements are written as "<tag>text</tag>".'

In [76]:
html.unescape(s), html.escape(s)

('Elements are written as "<tag>text</tag>".',
 'Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.')

- Tokenizing Text

In [79]:
text = 'foo = 23 + 42 * 10'

In [80]:
import nltk

In [81]:
from nltk.tokenize import WordPunctTokenizer

In [84]:
tk = WordPunctTokenizer()

In [85]:
tk.tokenize(text)

['foo', '=', '23', '+', '42', '*', '10']

In [86]:
text.split(' ')

['foo', '=', '23', '+', '42', '*', '10']

## Numbers and dates

In [87]:
a = 1627731

In [88]:
round(a, -1), round(a, -2), round(a, -3), round(a, -4)

(1627730, 1627700, 1628000, 1630000)

In [90]:
a = 2.1; b = 4.2

In [91]:
a + b

6.300000000000001

-To avoid the errors that come with floating point numbers due to CPU/ the IEEE 754 system, the decimal module can be employed

In [93]:
from decimal import Decimal

In [95]:
a = Decimal('2.1'); b = Decimal('4.2')

In [99]:
a + b

Decimal('6.3')

In [97]:
a+b == Decimal('6.3')

True

In [100]:
print(a+b)

6.3


In [101]:
nums = [1.23e+18, 1, -1.23e+18] # In adding from left to right using sum(), 1 will be lost
print(sum(nums))

0.0


In [102]:
# To avoid this, fsum is used from the maths module to ensure precision
import math

In [103]:
math.fsum(nums)

1.0

In [104]:
x = 1234.56789

In [106]:
format(x, '0.1f')

'1234.6'

In [110]:
format(x, '>10.1f')

'    1234.6'

In [112]:
x = 1234

In [113]:
bin(x), oct(x), hex(x)

('0b10011010010', '0o2322', '0x4d2')

- format()  method can also be sued for the above

In [114]:
format(x, 'b'), format(x, 'o'), format(x, 'x')

('10011010010', '2322', '4d2')

In [116]:
int('4d2', 16), int('2322', 8), int('10011010010', 2) # Converting them back to base 10

(1234, 1234, 1234)

In [117]:
int.from_bytes?

In [118]:
data = b'\x00\x124V\x00x\x90\xab\x00\xcd\xef\x01\x00#\x004'

In [120]:
int.from_bytes(data, 'little') # Most significant byte at the end of the byte string

69120565665751139577663547927094891008

In [121]:
int.from_bytes(data, 'big') # Most significant byte at the beggining of the byte string

94522842520747284487117727783387188

In [122]:
x = 94522842520747284487117727783387188

In [126]:
x.to_bytes(16, 'big'), x.to_bytes(16, 'little')

(b'\x00\x124V\x00x\x90\xab\x00\xcd\xef\x01\x00#\x004',
 b'4\x00#\x00\x01\xef\xcd\x00\xab\x90x\x00V4\x12\x00')

In [127]:
# To express positive and negative infinity, and Not a number
a = float('inf'); b= float('-inf'); c = float('nan')

In [129]:
math.isinf(a), math.isnan(c)

(True, True)

### Datetime

In [140]:
from datetime import datetime
from datetime import timedelta

In [141]:
a = datetime(2012, 9, 23)

In [142]:
a + timedelta(months=3) # The maximum keyword in timedelta is days

TypeError: 'months' is an invalid keyword argument for __new__()

In [143]:
# To avial oneself the use of month, dateutil.relativedelta can be used

In [144]:
from dateutil.relativedelta import relativedelta

In [145]:
a + relativedelta(months=3) 

datetime.datetime(2012, 12, 23, 0, 0)

In [146]:
a + relativedelta(months=+4) # One can go forward in month and backward

datetime.datetime(2013, 1, 23, 0, 0)

In [147]:
a + relativedelta(months=-4)

datetime.datetime(2012, 5, 23, 0, 0)

In [148]:
relativedelta?

##### Iterators and Generators

In [149]:
class Node:
    def __init__(self, value):
        self.value = value
        self.children = []
    def add_child(self, node):
        self.children.append(node)
    def __repr__(self):
        return 'Node({})'.format(self.value)
    def __iter__(self):
        return iter(self.children)

In [150]:
if __name__ == '__main__':
    root = Node(0)
    child1 = Node(1)
    child2 = Node(2)
    root.add_child(child1)
    root.add_child(child2)

In [154]:
for i in root:
    print(i)

Node(1)
Node(2)


In [156]:
repr(root), repr(child1), repr(child2)

('Node(0)', 'Node(1)', 'Node(2)')