In [13]:
import re

from https://colauttilab.github.io/PythonCrashCourse/2_regex.html

### re.match()
Returns ‘None’ if no match

In [31]:
# re.match() only tries the pattern at the very start of the string: it gives
# back a Match object on success and None otherwise.
m1, m2, m3 = (re.match(pattern, "salicaria") for pattern in ("salicaria", "a", "z"))

print(m1, m2, m3, sep="\n")

<re.Match object; span=(0, 9), match='salicaria'>
None
None


### re.search()
Contrast output with re.match()

In [279]:
# Unlike re.match(), re.search() looks for the pattern anywhere in the string.
m1, m2, m3 = (re.search(pattern, "salicaria") for pattern in ("salicaria", "a", "z"))

print(m1, m2, m3, sep="\n")

<re.Match object; span=(0, 9), match='salicaria'>
<re.Match object; span=(1, 2), match='a'>
None


In [19]:
# re.search() walks through the string and reports the first place the
# pattern matches (as a Match object), or None if it matches nowhere.

txt = "The rain in Spain"
m1, m2 = re.search('rain', txt), re.search('rainn', txt)

print(m1, m2, sep="\n")

<re.Match object; span=(4, 8), match='rain'>
None


### re.findall()
Note [] instead of None when no match is found

In [29]:
# re.findall() collects every non-overlapping match into a list; the list is
# simply empty when nothing matches.
m1, m2, m3 = (re.findall(pattern, "salicaria") for pattern in ("salicaria", "a", "z"))

print(m1, m2, m3, sep="\n")

['salicaria']
['a', 'a', 'a']
[]


### re.split()
Compare this output with previous functions

In [36]:
# re.split() cuts the string at every occurrence of the pattern and returns
# the pieces in between as a list (the matched text itself is dropped).
m1, m2 = (re.split(pattern, "salicaria") for pattern in ("salicaria", "c"))

print(m1, m2, sep="\n")

['', '']
['sali', 'aria']


### re.sub()
For this function, we have to add a replace string parameter

In [39]:
# re.sub(pattern, repl, string) returns a copy of string in which each
# non-overlapping match of pattern has been swapped for repl.
m1 = re.sub("c", "X", "salicaria")

print(m1)

saliXaria


### 3. Wildcards
The escape character \ tells the regex engine that the next character is not to be interpreted literally (e.g. \w means "any word character", not the letter w).

In [102]:
# The character class [ac] matches a single "a" or "c". Run the same pattern
# through each function to compare what they report.
txt = "...which 1-100  words get replaced?"

print(re.sub(r"[ac]", "_", txt))    # every match replaced

print(re.search(r"[ac]", txt))      # first match anywhere in the string

print(re.findall(r"[ac]", txt))     # list of all matches

print(re.match(r"[ac]", txt))       # None: the string does not start with a or c

...whi_h 1-100  words get repl__ed?
<re.Match object; span=(6, 7), match='c'>
['c', 'a', 'c']
None


In [143]:
# \w matches any word character; \W matches any non-word character.
print(re.sub(r"\w", "X", "salicaria"))
print(re.sub(r"\W", "X", "salicaria"))

XXXXXXXXX
salicaria


In [110]:
# \s matches any whitespace character; \S matches any non-whitespace character.
print(re.sub(r"\s", "*", txt))

...which*1-100**words*get*replaced?


In [107]:
# \d matches any digit; \D matches any character that is not a digit.
print(re.sub(r"\d", "*", txt))
print(re.sub(r"\D", "*", txt))

...which *-***  words get replaced?
*********1*100*********************


In [113]:
# An unescaped . matches any single character except a newline.
re.sub(r'.', '*', txt)

'***********************************'

### 4. Special characters:


In [120]:
# | means "or": the pattern matches either a "w" or an "h".
re.sub(pattern="w|h", repl="*", string=txt)

'...**ic* 1-100  *ords get replaced?'

In [135]:
print(txt)

# Patterns are written as raw strings (r"...") so the backslash reaches the
# regex engine untouched; a plain "\w" is an invalid Python string escape and
# triggers a warning on recent Python versions.
print(re.sub(r"\w", "*", txt))   # each word character is replaced individually

print(re.sub(r"\w+", "*", txt))  # + = 1 or more occurrences: each whole run becomes one *

# Other quantifiers:
#   * = 0 or more occurrences
#   ? = 0 or 1 occurrences
# All quantifiers are greedy by default (they match as much as possible).
# Appending ? to a quantifier (+?, *?, ??) makes it lazy (match as little as possible).

...which 1-100  words get replaced?
...***** *-***  ***** *** ********?
...* *-*  * * *?


In [142]:
# {3} = exactly three occurrences, so every run of three word characters
# collapses to a single *. The raw string avoids the invalid "\w" string escape.
m1 = re.sub(r"\w{3}", "*", txt)
print(m1)

...*ch 1-*  *ds * **ed?


### 5. Multiple search: [ ]
Use square brackets to find any matching characters.
Use a dash for a range of characters, e.g. [a-z] or [0-9]

In [146]:
# [which] is a set of characters, not the word "which": it matches any single w, h, i or c.
print(re.sub(pattern="[which]", repl="*", string=txt))
# [a-z] is a range, so only the lowercase letters are replaced.
print(re.sub(pattern="[a-z]", repl="*", string="AaBbCcDd"))

...***** 1-100  *ords get repla*ed?
A*B*C*D*


### 6. ^Start and end of line$

In [156]:
# Outside brackets, ^ anchors the match to the start of the string.
print(re.sub(pattern="^[sS]", repl="*", string="start of slines"))
# IMPORTANT: as the first character inside [], ^ negates the set instead, so
# [^sS] matches every character that is NOT s or S.
print(re.sub(pattern="[^sS]", repl="*", string="start of slines"))

*tart of slines
s********s****s


In [160]:
# $ anchors the match to the end of the string, so only the final "s" is replaced.
print(re.sub(pattern="s$", repl="*", string="start of slines"))

start of sline*


### 7. Capture text: ()
Capture text using () and reprint using \\1, \\2, etc

In [188]:
print(txt)
# The leading .* is greedy, so the group is left with only the last two word
# characters and the whole string collapses to them.
# (To keep just the first letter of each word, use r"(\w)\w*" instead.)
print(re.sub(r".*(\w\w+).*", r"\1", txt))
# Pull out only the numbers and reverse their order
print(re.sub(r".*([0-9]+)-([0-9]+).*", r"\2-\1", txt))
# Swap the first two characters of each 'word' containing 3+ characters
print(re.sub(r"(\w)(\w)(\w+)", r"\2\1\3", txt))

...which 1-100  words get replaced?
ed
100-1
...hwich 1-010  owrds egt erplaced?


### group()
As you can see above, re.match() and re.search() return an object if there is a match. What if we want to see what was matched? Use .group()

In [192]:
# .group() with no argument returns the entire matched text.
re.match(pattern="salicaria", string="salicaria").group()

'salicaria'

In [197]:
# group() raises an AttributeError when there is no match, because re.match() returned None:
# re.match("z","salicaria").group()

In [209]:
# Only the part of the string that the pattern covered is returned.
re.match(pattern="sal", string="salicaria").group()

'sal'

In [277]:
# .span() gives the (start, end) indices of the match within the string.
re.match(pattern="sal", string="salicaria").span()

(0, 3)

In [210]:
# .groups() returns a tuple with the text captured by each () group.
re.match(pattern="(s)(a)(l)", string="salicaria").groups()

('s', 'a', 'l')

### PRACTICE EXERCISES
1. Consider a vector of email addresses scraped from the internet:
- robert ‘dot’ colautti ‘at’ queensu ‘dot’ ca
- chris.eckert[at]queensu.ca
- lonnie.aarssen at queensu.ca

Use regular expressions to convert all email addresses to the standard format: name@queensu.ca

In [250]:
# Raw addresses exactly as scraped; each one obfuscates "@" and/or "." differently.
emails = [
    'robert ‘dot’ colautti ‘at’ queensu ‘dot’ ca',
    'chris.eckert[at]queensu.ca',
    'lonnie.aarssen at queensu.ca',
]

In [238]:
# Third address: collapse the whitespace-delimited word "at" into "@".
re.sub(r'(\s)(at)(\s)', '@', emails[2])

'lonnie.aarssen@queensu.ca'

In [249]:
# Second address: the opening bracket must be escaped so it is read as a
# literal "[" rather than the start of a character class.
re.sub(r'\[at]', '@', emails[1])

'chris.eckert@queensu.ca'

In [262]:
# First address, two passes: every " ‘dot’ " becomes ".", then " ‘at’ " becomes "@".
s1 = re.sub(r'(\s)(‘dot’)(\s)', '.', emails[0])
re.sub(r'(\s)(‘at’)(\s)', '@', s1)

'robert.colautti@queensu.ca'

In [252]:
# Re-display the first entry: re.sub() returns new strings, so the list itself is unmodified.
emails[0]

'robert ‘dot’ colautti ‘at’ queensu ‘dot’ ca'

### 2. Create a random sequence of DNA:
MySeq="ATGTGTGATAGATATAGTTTATAG"
* Replace T with U
* Find all start codons (AUG) and stop codons (UAA, UAG, UGA)
* Find all open reading frames (hint: consider each sequence beginning with AUG and ending with a stop codon; how do you know if both sequences are in the same reading frame?)
* Count the length of bp for all open reading frames

In [265]:
MySeq = "ATGTGTGATAGATATAGTTTATAG"

# Step 1: replace every T with U.
stp1 = re.sub('T', 'U', MySeq)
stp1

'AUGUGUGAUAGAUAUAGUUUAUAG'

In [276]:
# Step 2: list every start codon and every stop codon found when scanning the
# sequence left to right (findall ignores reading frame).
start_codons = re.findall('AUG', stp1)
stop_codons = re.findall('UAA|UAG|UGA', stp1)

start_codons, stop_codons

(['AUG'], ['UGA', 'UAG', 'UAG', 'UAG'])