# NLP - Regular Expression Exercise - Analytics Vidhya

In [1]:
import re

## Important functions in 're' package

### re.match(pattern, string)

In [2]:
# re.match only tests the beginning of the string; the sentence starts with 'AV'.
result = re.match(pattern=r'AV', string='AV Analytics Vidhya AV')
print('\n', result)


 <re.Match object; span=(0, 2), match='AV'>


In [3]:
# A successful match returns a Match object; its group() method gives back the
# matched text. The leading "r" marks the pattern as a Python raw string.

# Print the matched text (group 0 is the whole match).
result = re.match(pattern=r'AV', string='AV Analytics Vidhya AV')
print('\nMatching string :', result.group(0))


Matching string : AV


In [4]:
# Now look for 'Analytics'. The string starts with 'AV', not 'Analytics', and
# re.match only checks the beginning of the string, so no match is expected.

# Try to match 'Analytics' at the start of the sentence.
result = re.match(pattern=r'Analytics', string='AV Analytics Vidhya AV')
print('\nResult :', result)


Result : None


In [5]:
# start() and end() report where the matched text begins and ends
# inside the searched string.
result = re.match(pattern=r'AV', string='AV Analytics Vidhya AV')
print('\nStarting position of the match :', result.start())
print('Ending position of the match :', result.end())


Starting position of the match : 0
Ending position of the match : 2


### re.search(pattern, string)

In [6]:
# Unlike re.match, re.search scans the whole string for the first occurrence.
result = re.search(pattern=r'Analytics', string='AV Analytics Vidhya AV')

In [7]:
# Show the text matched by the re.search call in the previous cell.
print(result.group(0))

Analytics


### re.findall (pattern, string)

In [8]:
# re.findall collects every non-overlapping occurrence of the pattern into a list.
result = re.findall(pattern=r'AV', string='AV Analytics Vidhya AV')

In [9]:
# Show the list of matches collected by re.findall in the previous cell.
print(result)

['AV', 'AV']


### re.split(pattern, string, [maxsplit=0])

In [10]:
# Cut the word at every 'y'; the delimiter itself is dropped from the pieces.
result = re.split(pattern=r'y', string='Analytics')

In [11]:
# Show the pieces produced by re.split in the previous cell.
print(result)

['Anal', 'tics']


In [12]:
# Cut at every 'i'; the text contains two of them, so three pieces come back.
result = re.split(pattern=r'i', string='Analytics Vidhya')

In [13]:
# Show the pieces produced by splitting on every 'i' in the previous cell.
print(result)

['Analyt', 'cs V', 'dhya']


In [14]:
# maxsplit=1 stops after the first cut; the rest of the string is kept whole.
result = re.split(pattern=r'i', string='Analytics Vidhya', maxsplit=1)

In [15]:
# Show the two pieces produced by the maxsplit=1 split in the previous cell.
print(result)

['Analyt', 'cs Vidhya']


### re.sub(pattern, repl, string)

In [16]:
# Replace every occurrence of 'India' in the sentence with 'the World'.
result = re.sub(
    pattern=r'India',
    repl='the World',
    string='AV is largest Analytics community of India',
)

In [17]:
# Show the sentence after the re.sub replacement in the previous cell.
print(result)

AV is largest Analytics community of the World


### re.compile(pattern, flags=0)

In [18]:
# Compile the regex once into a pattern object so that it can be reused
# on several strings without repeating the pattern text.
pattern = re.compile(pattern='AV')
result = pattern.findall(string='AV Analytics Vidhya AV')

In [19]:
# Show the matches found by the compiled pattern in the previous cell.
print(result)

['AV', 'AV']


In [20]:
# Reuse the already-compiled pattern object on a second sentence.
result2 = pattern.findall(string='AV is largest analytics community of India')

In [21]:
# Show the matches the reused compiled pattern found in the second sentence.
print(result2)

['AV']


## Problem 1: Return the first word of a given string

### Extract each character (using “.” and “\w”)

In [22]:
# '.' matches any single character, so the spaces are returned as well.
result = re.findall(pattern=r'.', string='AV is largest Analytics community of India')
print(result)

['A', 'V', ' ', 'i', 's', ' ', 'l', 'a', 'r', 'g', 'e', 's', 't', ' ', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', ' ', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', ' ', 'o', 'f', ' ', 'I', 'n', 'd', 'i', 'a']


In [23]:
# '\w' matches only word characters, so the spaces drop out of the result.
result = re.findall(pattern=r'\w', string='AV is largest Analytics community of India')
print(result)

['A', 'V', 'i', 's', 'l', 'a', 'r', 'g', 'e', 's', 't', 'A', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's', 'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', 'o', 'f', 'I', 'n', 'd', 'i', 'a']


### Extract each word (using “*” or “+“)

In [24]:
# '*' means "zero or more", so the pattern also matches the empty string at
# positions where no word character is found; those show up as '' entries.
result = re.findall(pattern=r'\w*', string='AV is largest Analytics community of India')
print(result)

['AV', '', 'is', '', 'largest', '', 'Analytics', '', 'community', '', 'of', '', 'India', '']


In [25]:
# '+' means "one or more", which removes the empty matches that '*' produced.
result = re.findall(pattern=r'\w+', string='AV is largest Analytics community of India')
print(result)

['AV', 'is', 'largest', 'Analytics', 'community', 'of', 'India']


### Extract the first or the last word (using “^” and “$”)

In [26]:
# '^' anchors the pattern to the start of the string, so only the first word matches.
result = re.findall(pattern=r'^\w+', string='AV is largest Analytics community of India')
print(result)

['AV']


In [27]:
# '$' anchors the pattern to the end of the string, so only the last word matches.
result = re.findall(pattern=r'\w+$', string='AV is largest Analytics community of India')
print(result)

['India']


## Problem 2: Return the first two characters of each word

### Extract consecutive two characters of each word, excluding spaces (using “\w“)

In [28]:
# Two word characters in a row: every non-overlapping pair inside each word is returned.
result = re.findall(pattern=r'\w\w', string='AV is largest Analytics community of India')
print(result)

['AV', 'is', 'la', 'rg', 'es', 'An', 'al', 'yt', 'ic', 'co', 'mm', 'un', 'it', 'of', 'In', 'di']


### Extract the two consecutive characters at the start of each word (using “\b”)

In [29]:
# '\b' pins the match to the start of a word. Both positions use '\w' so the
# second character must also belong to the word; a bare '.' would accept any
# character, including the space that follows a one-letter word.
result = re.findall(r'\b\w\w', 'AV is largest Analytics community of India')
print(result)

['AV', 'is', 'la', 'An', 'co', 'of', 'In']


## Problem 3: Return the domain type of given email-ids

### Extract all characters after “@”

In [30]:
# '@' followed by word characters: the match stops at the first '.', because
# '.' is not a word character.
result = re.findall(
    pattern=r'@\w+',
    string='abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, first.test@rest.biz',
)
print(result)

['@gmail', '@test', '@analyticsvidhya', '@rest']


In [31]:
# Match '@', the domain name, a literal dot and the domain type. The dot is
# escaped because an unescaped '.' matches any character, not just '.'.
result = re.findall(r'@\w+\.\w+', 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, first.test@rest.biz')
print(result)

['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']


### Extract only the domain type (the part after the dot) using “( )”

In [32]:
# The parentheses form a capture group, so findall returns only the part after
# the dot. The dot is escaped because an unescaped '.' matches any character.
result = re.findall(r'@\w+\.(\w+)', 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, first.test@rest.biz')
print(result)

['com', 'in', 'com', 'biz']


## Problem 4: Return date from given string

### Here we will use “\d” to extract digit

In [33]:
# Two digits, two digits, four digits, separated by hyphens. The other numbers
# in the text (such as '34-3456') do not fit this shape and are skipped.
result = re.findall(
    pattern=r'\d{2}-\d{2}-\d{4}',
    string='Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-01-2009',
)
print(result)

['12-05-2007', '11-11-2011', '12-01-2009']


### If you want to extract only the year, parentheses “( )” will again help you.

In [34]:
# Same date shape as before, but only the four-digit part is inside a capture
# group, so findall returns just that part.
result = re.findall(
    pattern=r'\d{2}-\d{2}-(\d{4})',
    string='Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-01-2009',
)
print(result)

['2007', '2011', '2009']


## Problem 5: Return all words of a string that start with a vowel

### Return each word

In [35]:
# Starting point: every run of word characters, i.e. each word of the sentence.
result = re.findall(pattern=r'\w+', string='AV is largest Analytics community of India')
print(result)

['AV', 'is', 'largest', 'Analytics', 'community', 'of', 'India']


### Return words that start with vowels (using [])

In [36]:
# A vowel followed by word characters. Nothing ties the vowel to the start of a
# word, so matches can begin in the middle of one.
result = re.findall(pattern=r'[aeiouAEIOU]\w+', string='AV is largest Analytics community of India')
print(result)

['AV', 'is', 'argest', 'Analytics', 'ommunity', 'of', 'India']


### Above you can see that it has returned “argest” and “ommunity” from the mid of words. To drop these two, we need to use “\b” for word boundary.

In [37]:
# '\b' requires the vowel to sit at a word boundary, which drops the
# mid-word matches.
result = re.findall(pattern=r'\b[aeiouAEIOU]\w+', string='AV is largest Analytics community of India')
print(result)

['AV', 'is', 'Analytics', 'of', 'India']


### In a similar way, we can extract words that start with a consonant using “^” within square brackets.

In [38]:
# A leading '^' inside [] negates the class: the first character may be anything
# that is not a vowel. A space qualifies too, so matches can start with the
# space that follows a word.
result = re.findall(pattern=r'\b[^aeiouAEIOU]\w+', string='AV is largest Analytics community of India')
print(result)

[' is', ' largest', ' Analytics', ' community', ' of', ' India']


### Above you can see that it has returned words starting with space. To drop it from output, include space in square bracket[].

In [39]:
# Adding the space to the negated class stops a match from starting on a space.
# NOTE(review): the class still admits digits, '_' and punctuation as a first
# character; that does not matter for this letters-only sentence.
result = re.findall(pattern=r'\b[^aeiouAEIOU ]\w+', string='AV is largest Analytics community of India')
print(result)

['largest', 'community']


## Problem 6: Validate a phone number (a phone number must have 10 digits and start with 8 or 9)

### We have a list of phone numbers in list “li”, and here we will validate the phone numbers using regular expressions

In [40]:
import re

# A valid number is exactly ten digits and begins with 8 or 9.
# re.fullmatch succeeds only when the whole string fits the pattern, so no
# separate length check is needed.
li = ['9999999999', '999999-999', '99999x9999']
for val in li:
    if re.fullmatch(r'[89][0-9]{9}', val):
        print('yes')
    else:
        print('no')

yes
no
no


## Problem 7: Split a string with multiple delimiters

### Split a string

In [41]:
# Split on any run of semicolons, commas or whitespace. The '+' treats adjacent
# delimiters (for example ', ') as one separator, so they do not produce empty
# strings in the result.
line = 'asdf fjdk;afed,fjek,asdf,foo'
result = re.split(r'[;,\s]+', line)
print(result)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


### We can also use method re.sub() to replace these multiple delimiters with one as space ” “.

In [42]:
# Replace each run of semicolons, commas or whitespace with a single space.
# The '+' makes adjacent delimiters (for example ', ') collapse into one space
# instead of one space per delimiter character.
line = 'asdf fjdk;afed,fjek,asdf,foo'
result = re.sub(r'[;,\s]+', ' ', line)
print(result)

asdf fjdk afed fjek asdf foo


## Problem 8: Retrieve Information from HTML file

In [43]:
# Sample HTML table row: one numeric cell followed by two name cells.
x = (
    '<tr align="center">'
    '<td>1</td> <td>Noah</td> <td>Emma</td>'
    '</tr>'
)

In [44]:
# Step over the first <td> cell, then capture the contents of the next two.
# With two capture groups, findall returns a list of (group1, group2) tuples.
result = re.findall(
    pattern=r'<td>\w+</td>\s<td>(\w+)</td>\s<td>(\w+)</td>',
    string=x,
)
print(result)

[('Noah', 'Emma')]


---