# Specifying a regular expression for the shortest match

In [1]:
import re

In [2]:
pat = re.compile(r'\"(.*)\"')

In [3]:
text = 'I am "Ghazal", a Python programmer in "Udemy"...'

In [4]:
pat.findall(text)

['Ghazal", a Python programmer in "Udemy']

In [5]:
pat2 = re.compile(r'\"(.*?)\"')

In [6]:
pat2.findall(text)

['Ghazal', 'Udemy']

# Normalizing Unicode text to a standard representation

In [7]:
# غزل  ("Ghazal" written in Arabic script: U+063A, U+0632, U+0644)

In [7]:
Ghazal = '\u063a\u0632\u0644'

In [8]:
Ghazal

'غزل'

In [9]:
s1 = 'Elni\u00f1o'

In [10]:
s1

'Elniño'

In [11]:
s2 = 'Elnin\u0303o'

In [12]:
s2

'Elniño'

In [13]:
s1 == s2

False

In [14]:
len(s1), len(s2)

(6, 7)

In [15]:
import unicodedata

In [16]:
s1n = unicodedata.normalize('NFC', s1)

In [17]:
s2n = unicodedata.normalize('NFC', s2)

In [18]:
s1n

'Elniño'

In [19]:
s2n

'Elniño'

In [20]:
s1n == s2n

True

In [21]:
print(ascii(s1n))

'Elni\xf1o'


In [22]:
print(ascii(s2n))

'Elni\xf1o'


In [23]:
ord('m')

109

In [24]:
ord('M')

77

In [25]:
chr(109)

'm'

In [26]:
chr(77)

'M'

In [27]:
ord('E')

69

In [28]:
chr(69)

'E'

In [29]:
s1n2 = unicodedata.normalize('NFD', s1)

In [30]:
s2n2 = unicodedata.normalize('NFD', s2)

In [31]:
s1n2

'Elniño'

In [32]:
s2n2

'Elniño'

In [33]:
s1n2 == s2n2

True

In [34]:
print(ascii(s1n2))

'Elnin\u0303o'


In [35]:
print(ascii(s2n2))

'Elnin\u0303o'


In [36]:
s = '\ufb01'; s

'ﬁ'

In [37]:
unicodedata.normalize('NFD', s)

'ﬁ'

In [38]:
unicodedata.normalize('NFC', s)

'ﬁ'

In [39]:
unicodedata.normalize('NFKD', s)

'fi'

In [40]:
unicodedata.normalize('NFKC', s)

'fi'

In [41]:
print(ascii('fi'))

'fi'


In [42]:
print(ascii('ﬁ'))

'\ufb01'


In [43]:
print(ascii('Elniño'))

'Elnin\u0303o'
