# Working with HTML and XML in text

In [1]:
s = 'This is my doc "<tag>text</tag>".'

In [2]:
print(s)

This is my doc "<tag>text</tag>".


In [3]:
import html

In [4]:
print(html.escape(s))

This is my doc &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.


In [5]:
print(html.escape(s, quote = False))

This is my doc "&lt;tag&gt;text&lt;/tag&gt;".


In [6]:
'\u00f1'

'ñ'

In [7]:
s = 'El niño'

In [8]:
s.encode('ascii', 'ignore')

b'El nio'

In [9]:
ascii(s)

"'El ni\\xf1o'"

In [10]:
s.encode('ascii', errors = 'xmlcharrefreplace')

b'El ni&#241;o'

In [11]:
# Goal: recover the plain string 'El "niño".' from its HTML-escaped form below.

In [12]:
# HTML-escaped input: &quot; is a named entity, &#241; a numeric character reference.
# NOTE: the second '&quot' has no trailing ';'. html.unescape() still decodes it
# (see the result of html.unescape(s) further down).
s = 'El &quot;ni&#241;o&quot.'

In [13]:
s

'El &quot;ni&#241;o&quot.'

In [14]:
html.unescape(s)

'El "niño".'

In [15]:
t = 'the python prompt &gt;&gt;&gt;'

In [16]:
from xml.sax.saxutils import unescape

In [17]:
unescape(t)

'the python prompt >>>'

# Tokenizing Text

In [18]:
s = 'a = 140 * 25 + 36'

In [19]:
# Hand-written (type, value) pairs for the expression 'a = 140 * 25 + 36',
# with the whitespace between the symbols left out.
tokens = [
    ('NAME', 'a'),
    ('EQ', '='),
    ('NUM', '140'),
    ('TIMES', '*'),
    ('NUM', '25'),
    ('PLUS', '+'),
    ('NUM', '36'),
]

In [20]:
# Named-group syntax used by the token patterns below: (?P<TOKENNAME>pattern)

In [21]:
import re

In [37]:
# Identifier token: a letter or underscore, followed by any number of letters,
# digits or underscores. The named group 'NAME' labels the match.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'

In [38]:
# One named group per token kind; the group name is what match.lastgroup
# reports once the patterns are joined into a single alternation.
# NUM: one or more digits.
NUM = r'(?P<NUM>\d+)'
# TIMES: a literal '*' (escaped, since '*' is a regex quantifier).
TIMES = r'(?P<TIMES>\*)'
# EQ: a literal '='.
EQ = r'(?P<EQ>=)'
# PLUS: a literal '+' (escaped, since '+' is a regex quantifier).
PLUS = r'(?P<PLUS>\+)'
# WS: a run of one or more whitespace characters.
WS = r'(?P<WS>\s+)'


In [39]:
pat = re.compile('|'.join([NAME, NUM, TIMES, EQ, PLUS, WS]))

In [40]:
pat

re.compile(r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\d+)|(?P<TIMES>\*)|(?P<EQ>=)|(?P<PLUS>\+)|(?P<WS>\s+)',
           re.UNICODE)

In [41]:
scanner = pat.scanner('a = 140')

In [42]:
scanner.match()

<re.Match object; span=(0, 1), match='a'>

In [43]:
_.lastgroup, _.group()

('NAME', 'a')

In [44]:
scanner.match()

<re.Match object; span=(1, 2), match=' '>

In [45]:
_.lastgroup, _.group()

('WS', ' ')

In [46]:
scanner.match()

<re.Match object; span=(2, 3), match='='>

In [47]:
_.lastgroup, _.group()

('EQ', '=')

In [48]:
scanner.match()

<re.Match object; span=(3, 4), match=' '>

In [49]:
_.lastgroup, _.group()

('WS', ' ')

In [50]:
scanner.match()

<re.Match object; span=(4, 7), match='140'>

In [51]:
_.lastgroup, _.group()

('NUM', '140')

In [52]:
scanner.match()

In [53]:
_.lastgroup, _.group()

AttributeError: 'tuple' object has no attribute 'lastgroup'

In [54]:
from collections import namedtuple

In [55]:
token = namedtuple('token', ['type', 'val'])

In [56]:
def gen_tokens(pattern, text, strict=False):
    """Lazily split ``text`` into ``token(type, val)`` tuples.

    Parameters
    ----------
    pattern : compiled regular expression
        An alternation of named groups; the name of the group that matched
        becomes the token's ``type``.
    text : str
        The string to tokenize.
    strict : bool, optional
        When False (the default) scanning simply stops at the first position
        where no alternative matches, and the rest of ``text`` is dropped
        silently. When True a ``ValueError`` is raised instead, after all the
        tokens preceding the offending position have been yielded.

    Yields
    ------
    token
        ``token(type=<group name>, val=<matched text>)`` for each consecutive
        match, in order of appearance.

    Raises
    ------
    ValueError
        Only when ``strict`` is true and part of ``text`` could not be matched.
    """
    scanner = pattern.scanner(text)
    consumed = 0  # end offset of the last successful match
    # scanner.match() returns consecutive anchored matches and None once
    # nothing matches at the current position; iter() stops on that None.
    for found in iter(scanner.match, None):
        consumed = found.end()
        yield token(found.lastgroup, found.group())
    if strict and consumed != len(text):
        raise ValueError(
            f'unexpected character {text[consumed]!r} at position {consumed}'
        )

In [57]:
for t in gen_tokens(pat, 'a = 140'):
    print(t)

token(type='NAME', val='a')
token(type='WS', val=' ')
token(type='EQ', val='=')
token(type='WS', val=' ')
token(type='NUM', val='140')


In [58]:
for t in gen_tokens(pat, 'a = 140 * 1000 + 9'):
    print(t)

token(type='NAME', val='a')
token(type='WS', val=' ')
token(type='EQ', val='=')
token(type='WS', val=' ')
token(type='NUM', val='140')
token(type='WS', val=' ')
token(type='TIMES', val='*')
token(type='WS', val=' ')
token(type='NUM', val='1000')
token(type='WS', val=' ')
token(type='PLUS', val='+')
token(type='WS', val=' ')
token(type='NUM', val='9')


In [59]:
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'


In [62]:
# Alternatives are tried left to right and the first one that matches wins,
# so the longer literal '<=' (LE) has to be listed before its prefix '<' (LT).
pat2 = re.compile('|'.join([LE, LT, EQ])) #correct

In [61]:
# Counter-example (wrong order): '<' (LT) is tried before '<=' (LE), so the text
# '<=' is split into an LT token followed by an EQ token and LE never matches.
# It is bound to its own name so that running the cells top to bottom does not
# replace the correctly ordered pat2 with this broken pattern.
pat2_wrong = re.compile('|'.join([LT, LE, EQ]))

In [63]:
# For more complex grammars, consider a dedicated parsing library such as PLY or pyparsing.

# Performing Text Operations on Byte Strings

In [64]:
s = 'Ghazal Lalooha'

In [65]:
type(s)

str

In [66]:
b = b'Ghazal Lalooha'

In [67]:
type(b)

bytes

In [68]:
b[0:3]

b'Gha'

In [69]:
b.startswith(b'Gh')

True

In [70]:
b.split()

[b'Ghazal', b'Lalooha']

In [71]:
b.replace(b'Ghazal', b'John')

b'John Lalooha'

In [73]:
ba = bytearray(b'Ghazal Lalooha')

In [74]:
type(ba)

bytearray

In [75]:
b[-3:-1]

b'oh'

In [76]:
ba.endswith(b'ha')

True

In [77]:
data = b'120: 450, 890'

In [78]:
import re

In [79]:
re.split('[:,]', data)

TypeError: cannot use a string pattern on a bytes-like object

In [80]:
re.split(b'[:,]', data)

[b'120', b' 450', b' 890']

In [81]:
s = 'Hello Udemy'

In [82]:
s[0]

'H'

In [83]:
s[2]

'l'

In [84]:
b = b'Hello Udemy'

In [85]:
b[0]

72

In [86]:
ord('H')

72

In [87]:
print(b)

b'Hello Udemy'


In [88]:
print(b.decode('ascii'))

Hello Udemy


In [89]:
type(b.decode('ascii'))

str

In [90]:
s = 'Udemy'

In [91]:
sb = s.encode('utf-8')

In [92]:
sb

b'Udemy'

In [93]:
type(sb)

bytes

In [95]:
s = sb.decode('utf-8')

In [96]:
s

'Udemy'

In [97]:
'%15s %10d %10.3f' %('Udemy', 1012, 45.2147)

'          Udemy       1012     45.215'

In [98]:
b'%15s %10d %10.3f' %('Udemy', 1012, 45.2147)

TypeError: %b requires a bytes-like object, or an object that implements __bytes__, not 'str'

In [99]:
b'%15s %10d %10.3f' %(b'Udemy', 1012, 45.2147)

b'          Udemy       1012     45.215'

In [100]:
b'{} {} {}'.format(b'Udemy', 1012, 45.2147)

AttributeError: 'bytes' object has no attribute 'format'

In [101]:
'{} {} {}'.format(b'Udemy', 1012, 45.2147).encode('ascii')

b"b'Udemy' 1012 45.2147"

In [102]:
with open('elni\u00f1o.txt' , 'w') as f:
    f.write('Udemy')

In [103]:
import os

In [104]:
os.listdir('.')

['.anaconda',
 '.arduinoIDE',
 '.cache',
 '.conda',
 '.condarc',
 '.continuum',
 '.idlerc',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.keras',
 '.matplotlib',
 '.RData',
 '.Rhistory',
 '.spyder-py3',
 '.wyliodrinstudio',
 '01a_DEMO_Reading_Data.ipynb',
 '01b_LAB_Reading_Data.ipynb',
 '01e_DEMO_Hypothesis_Testing.ipynb',
 '02a_LAB_Transforming_Target.ipynb',
 '02b_LAB_Regression_Train_Test_Split.ipynb',
 '02c_DEMO_Cross_Validation.ipynb',
 '02d_DEMO_Regularization.ipynb',
 '02e_LAB_Regularization.ipynb',
 '03b_LAB_KNN.ipynb',
 '03c_DEMO_SVM.ipynb',
 '03d_LAB_Decision_Trees.ipynb',
 '03e_DEMO_Bagging.ipynb',
 '03f_LAB_Boosting_and_Stacking.ipynb',
 '04a_LAB_KMeansClustering.ipynb',
 '04b_DEMO_Distance_Dimensionality.ipynb',
 '04c_LAB_Clustering_Methods.ipynb',
 '04d_DEMO_Dimensionality_Reduction.ipynb',
 '04e_DEMO_nmf.ipynb',
 '4.3_Plotly_Basics.ipynb',
 '5) Classification_Tree_SVM.ipynb',
 '5Y1nHm5xSouNZx5ucaqLZg_c93a585749804b1e87603f707f04bd3f_01b_LAB_Reading_Data.zip',
 '5_P

In [105]:
os.listdir(b'.')

[b'.anaconda',
 b'.arduinoIDE',
 b'.cache',
 b'.conda',
 b'.condarc',
 b'.continuum',
 b'.idlerc',
 b'.ipynb_checkpoints',
 b'.ipython',
 b'.jupyter',
 b'.keras',
 b'.matplotlib',
 b'.RData',
 b'.Rhistory',
 b'.spyder-py3',
 b'.wyliodrinstudio',
 b'01a_DEMO_Reading_Data.ipynb',
 b'01b_LAB_Reading_Data.ipynb',
 b'01e_DEMO_Hypothesis_Testing.ipynb',
 b'02a_LAB_Transforming_Target.ipynb',
 b'02b_LAB_Regression_Train_Test_Split.ipynb',
 b'02c_DEMO_Cross_Validation.ipynb',
 b'02d_DEMO_Regularization.ipynb',
 b'02e_LAB_Regularization.ipynb',
 b'03b_LAB_KNN.ipynb',
 b'03c_DEMO_SVM.ipynb',
 b'03d_LAB_Decision_Trees.ipynb',
 b'03e_DEMO_Bagging.ipynb',
 b'03f_LAB_Boosting_and_Stacking.ipynb',
 b'04a_LAB_KMeansClustering.ipynb',
 b'04b_DEMO_Distance_Dimensionality.ipynb',
 b'04c_LAB_Clustering_Methods.ipynb',
 b'04d_DEMO_Dimensionality_Reduction.ipynb',
 b'04e_DEMO_nmf.ipynb',
 b'4.3_Plotly_Basics.ipynb',
 b'5) Classification_Tree_SVM.ipynb',
 b'5Y1nHm5xSouNZx5ucaqLZg_c93a585749804b1e87603f707f04