### PATTERN OBJECTS

In [1]:
import re

In [2]:
# 'a', then any number of digits, then 'b'; IGNORECASE lets 'A'/'B' match too.
# Raw string: \d must reach the regex engine untouched ('\d' is an invalid Python escape).
p = re.compile(r'a\d*b', re.IGNORECASE)

In [3]:
p.findall('a123asda21312Basdas213ABs')

['a21312B', 'AB']

In [4]:
# given a document, count the number of whitespace characters at the beginning of the document

In [5]:
doc = '   \t    \t  \t    \thow are you today.asdfa'

In [6]:
# One or more whitespace characters (spaces, tabs, newlines, ...).
# Raw string so that \s is passed to the regex engine instead of being parsed by Python.
p = re.compile(r'\s+')

In [7]:
type(p)

re.Pattern

In [8]:
m=p.match(doc)

In [9]:
print('index span of the match is: ',m.span())

index span of the match is:  (0, 17)


In [10]:
print('start of the match is: ',m.start())

start of the match is:  0


In [11]:
print('end of the match is:', m.end())

end of the match is: 17


In [12]:
print('length of the match is: ',len( m.group()))

length of the match is:  17


In [13]:
# string after the matched string

In [14]:
print(doc[m.end():])

how are you today.asdfa


In [15]:
# string/pattern that is matched

In [16]:
m.group()

'   \t    \t  \t    \t'

In [17]:
# The document now starts with 'a', so there is no whitespace at position 0.
doc = 'a \t    \t  \t    \thow are you today.asdfa'
p = re.compile(r'\s+')  # raw string: \s is a regex escape, not a Python one
# match() only tries the pattern at the very beginning of the string, so it returns None here.
m = p.match(doc)

In [18]:
# print('index span of the match is: ',m.span())
# print('start of the match is: ',m.start())
# print('end of the match is:', m.end())
# print('length of the match is: ',len( m.group()))

In [19]:
type(m) # since there is no match

NoneType

In [20]:
doc = 'a \t    \t  \t    \thow are you today.asdfa'
p = re.compile(r'\s+')  # raw string: \s is a regex escape, not a Python one
# Unlike match(), search() scans forward and returns the first place where the pattern matches.
m = p.search(doc)

In [21]:
m

<re.Match object; span=(1, 16), match=' \t    \t  \t    \t'>

In [22]:
m.group()

' \t    \t  \t    \t'

In [23]:
m.start()

1

In [24]:
m.end()

16

In [25]:
doc = 'a \t    \t  \t    \thow are you today.asdfa'
p = re.compile(r'\s*')  # raw string: \s is a regex escape, not a Python one
# * accepts zero repetitions, so match() succeeds at position 0 with an empty match
# even though the document starts with 'a'.
m = p.match(doc)

In [26]:
m

<re.Match object; span=(0, 0), match=''>

In [27]:
# \s without a quantifier matches exactly one whitespace character; search() scans the
# string and returns the first such character, wherever it occurs
doc = 'a \t    \t  \t    \thow are you today.asdfa'
p = re.compile(r'\s')  # raw string: \s is a regex escape, not a Python one
m = p.search(doc)

In [28]:
m

<re.Match object; span=(1, 2), match=' '>

In [29]:
# findall returns all the matched strings, but it gives no information about where each match occurred

### finditer method

In [30]:
# find the valid variable names that occur in the document
# for each one, we also want its location (start and end) and its length

In [31]:
doc = 'asfas_asf:?"23easd21231!@#!3213'
# An identifier: a letter or underscore followed by any number of word characters.
# Raw string so that \w reaches the regex engine untouched.
p = re.compile(r'[a-zA-Z_]\w*')
# finditer() yields one Match object per non-overlapping match, lazily.
iterator = p.finditer(doc)

In [32]:
for found in iterator:
    # First the full Match repr, then just its (start, end) span and the matched text.
    print(found)
    print(found.span(), found.group())

<re.Match object; span=(0, 9), match='asfas_asf'>
(0, 9) asfas_asf
<re.Match object; span=(14, 23), match='easd21231'>
(14, 23) easd21231


In [33]:
doc = 'asfas_asf:?"23easd21231!@#!3213'
# Same identifier pattern, but IGNORECASE replaces the explicit A-Z range.
# Raw string so that \w reaches the regex engine untouched.
p = re.compile(r'[a-z_]\w*', re.IGNORECASE)
iterator = p.finditer(doc)

In [34]:
for found in iterator:
    # First the full Match repr, then just its (start, end) span and the matched text.
    print(found)
    print(found.span(), found.group())

<re.Match object; span=(0, 9), match='asfas_asf'>
(0, 9) asfas_asf
<re.Match object; span=(14, 23), match='easd21231'>
(14, 23) easd21231


## Using | (logical OR) to join regular expressions

In [35]:
# count the number of backslashes and square brackets in the string

doc = r'\n for new line [line ] \section and \\document and \\\\section \n'
# Raw strings keep the regex escapes readable and avoid invalid Python escape sequences.
regExp1 = r'[\]\[]'  # character class containing a literal ']' and a literal '['
regExp2 = r'[\\]'    # character class containing a literal backslash

# '|' joins the alternatives: each match is either a bracket or a backslash.
p = re.compile(regExp1+'|'+regExp2)
print(len(p.findall(doc)))

11


In [36]:
p.findall(doc)

['\\', '[', ']', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\']

### ^ (Matching for the Beginning)

In [37]:
# when ^ is used outside a character class [], it anchors the match to the beginning of the string

In [38]:
re.search('^Hello','Hello from the other side!!')

<re.Match object; span=(0, 5), match='Hello'>

In [39]:
re.search('^Hello','He said, Hello from the other side!!')

In [40]:
re.match('Hello','Hello from the other side')

<re.Match object; span=(0, 5), match='Hello'>

In [41]:
re.search('Hello','Hello from the other side')

<re.Match object; span=(0, 5), match='Hello'>

### $ (Matching for end of the line)

In [42]:
# $ matches at the end of the string or just before a newline at the very end of the string;
# with re.MULTILINE it also matches just before every newline character

In [43]:
re.search('}$','hello}')

<re.Match object; span=(5, 6), match='}'>

In [44]:
# returns None because } is not at the end of the string (a space follows it)
re.search('}$','hello} ')

In [45]:
re.search('}$','hello}\n')

<re.Match object; span=(5, 6), match='}'>

### () (Matching group of characters)

In [46]:
# makes a group of characters that is treated just like a single character

In [47]:
doc = 'find in thethethehellothe '

In [48]:
# the parentheses make + apply to the whole group "the" rather than only to the final "e"
p = re.compile('(the)+') # search for one or more consecutive occurrences of "the" in the string
m = p.search(doc)

In [49]:
m.span()

(8, 17)

In [50]:
m.group()

'thethethe'

In [51]:
doc = 'i am learning NLP today'

In [52]:
p = re.compile('(the)*') # pattern object that finds zero or more occurrences of the word "the"
# * also accepts zero repetitions, so search() succeeds with an empty match even when "the" is absent
m = p.search(doc)

In [53]:
m.span()

(0, 0)

In [54]:
m.group()

''

In [55]:
doc = 'find in thethethehellothe '
p = re.compile('(the)+') # search for one or more consecutive occurrences of "the" in the string
# finditer() yields a Match object for every non-overlapping run, not just the first one
iterator = p.finditer(doc)

In [56]:
for found in iterator:
    # (start, end) span of each run of "the", followed by the matched text
    print(found.span(), found.group())

(8, 17) thethethe
(22, 25) the


### Word Tokenizer- using split() method

In [57]:
## extracting words from the document using the split() method, splitting on one or more
# occurrences of non-word (non-alphanumeric, non-underscore) characters

In [58]:
# \W+ treats every run of non-word characters as a single separator.
# Raw string so that \W reaches the regex engine untouched.
p = re.compile(r'\W+')
p.split('this is ,, crazy1010__ ;; i \' canno\'t believe myadf234 asdf..,a.as/s')

['this',
 'is',
 'crazy1010__',
 'i',
 'canno',
 't',
 'believe',
 'myadf234',
 'asdf',
 'a',
 'as',
 's']

In [59]:
# Without +, every single non-word character is its own separator, so adjacent
# separators produce empty strings in the result.
# Raw string so that \W reaches the regex engine untouched.
p = re.compile(r'\W')
p.split('this is ,, crazy1010__ ;; i \' canno\'t believe myadf234 asdf..,a.as/s')

['this',
 'is',
 '',
 '',
 '',
 'crazy1010__',
 '',
 '',
 '',
 'i',
 '',
 '',
 'canno',
 't',
 'believe',
 'myadf234',
 'asdf',
 '',
 '',
 'a',
 'as',
 's']

### sub() method for string substitution

In [60]:
# One or more consecutive colour words, each followed by a whitespace character.
# Raw string so that \s reaches the regex engine untouched.
p = re.compile(r'(red\s|white\s|blue\s)+')
# sub() replaces each whole run of colour words with a single 'color '.
p.sub('color ','red shoes and red red white white socks')

'color shoes and color socks'

In [61]:
# subn() returns a tuple: (the new string, the number of substitutions made)

# Raw string so that \s reaches the regex engine untouched; IGNORECASE matches any capitalisation.
p = re.compile(r'(red\s|white\s|blue\s)+', re.IGNORECASE)
p.subn('color ','rED shoes and rEd Red White wHite socks')

('color shoes and color socks', 2)

In [62]:
# EXERCISE: In a document, replace multiple consecutive whitespace characters with a single space and remove whitespace at the beginning
# and at the end of the document

In [63]:
doc = '  chasing   cars  on our heads     if i lay here   if    i just   lay here        '

In [64]:
# A run of one or more whitespace characters. No capturing group is needed because the
# pattern is only used for substitution; raw string keeps \s for the regex engine.
p1 = re.compile(r'\s+')

In [65]:
doc1=p1.sub(' ',doc)

In [66]:
# Leading whitespace (anchored by ^) or trailing whitespace (anchored by $).
# + instead of * avoids substituting empty matches; raw string keeps \s for the regex engine.
p2 = re.compile(r'^\s+|\s+$')
# Replacing both ends with '' trims the document.
p2.sub('',doc1)

'chasing cars on our heads if i lay here if i just lay here'

In [67]:
# achieving the same with less code

In [68]:
# first compile p1 and p2 and use sub() method directly

In [69]:
# p2 can also be written more simply: once p1.sub(' ', ...) has replaced every whitespace run
# with one space, at most a single space can remain at the start and at the end of the string
# (this relies on p1 being the whitespace-run pattern compiled earlier)

p2 = re.compile('^ | $')

# collapse the runs first (inner call), then trim the single leading/trailing space (outer call)
p2.sub('',p1.sub(' ', doc))

'chasing cars on our heads if i lay here if i just lay here'

## spaCy for Tokenization

In [73]:
import spacy

In [77]:
nlp = spacy.load('en_core_web_sm')

In [82]:
doc = 'I always uh do the main um processing, I mean, the uh um data-processing.'

In [83]:
stats = nlp(doc)

In [84]:
for token in stats:
    print(token.text)

I
always
uh
do
the
main
um
processing
,
I
mean
,
the
uh
um
data
-
processing
.


In [85]:
doc2 = 'U.K is my favorite country'

In [86]:
stats = nlp(doc2)

In [88]:
for token in stats:
    print(token.text)

U.K
is
my
favorite
country


In [89]:
# Regex tokenisation for comparison: split on runs of non-word characters.
# Raw string so that \W reaches the regex engine untouched.
# Note: if the text ends with a separator, split() yields a trailing empty string.
print(re.split(r'\W+',doc))

['I', 'always', 'uh', 'do', 'the', 'main', 'um', 'processing', 'I', 'mean', 'the', 'uh', 'um', 'data', 'processing', '']


In [93]:
# Print one token per line; punctuation is dropped because it is part of the \W+ separator.
# Raw string so that \W reaches the regex engine untouched.
for token in re.split(r'\W+',doc):
    print(token)

I
always
uh
do
the
main
um
processing
I
mean
the
uh
um
data
processing



In [92]:
# Splitting on whitespace only keeps punctuation attached to the neighbouring word.
# Raw string so that \s reaches the regex engine untouched.
for token in re.split(r'\s+',doc):
    print(token)

I
always
uh
do
the
main
um
processing,
I
mean,
the
uh
um
data-processing.


In [95]:
# Whitespace splitting leaves tokens that contain dots in one piece.
# Raw string so that \s reaches the regex engine untouched.
for token in re.split(r'\s+',doc2):
    print(token)

U.K
is
my
favorite
country
