In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import re  

# Basic usage of `re`

## `re.match`

<span style="font-family:New York Times; font-size:1em; color:green;">
    
* [Using regex for text manipulation in python](https://stackabuse.com/using-regex-for-text-manipulation-in-python/)
* [An introduction to regex in python](https://scotch.io/tutorials/an-introduction-to-regex-in-python)

The purpose of the compile method is to compile the `regex` pattern which will be used for matching later.

<table>
<tbody><tr>
<th>Element</th>
<th>Description</th>
</tr>
<tr>
<td><strong>.</strong></td>
<td>This element matches any character except \n</td>
</tr>
<tr>
<td><strong>\d</strong></td>
<td>This matches any digit [0-9]</td>
</tr>
<tr>
<td><strong>\D</strong></td>
<td>This matches non-digit characters [^0-9]</td>
</tr>
<tr>
<td><strong>\s</strong></td>
<td>This matches whitespace character [ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\S</strong></td>
<td>This matches non-whitespace character [^ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\w</strong></td>
<td>This matches alphanumeric character [a-zA-Z0-9_]</td>
</tr>
<tr>
<td><strong>\W</strong></td>
<td>This matches any non-alphanumeric character [^a-zA-Z0-9]</td>
</tr>
</tbody></table>
<table>
<tbody><tr>
<th>Quantifier</th>
<th>Description</th>
<th>Example</th>
<th>Sample match</th>
</tr>
<tr>
<td>+</td>
<td>one or more</td>
<td>\w+</td>
<td>ABCDEF097</td>
</tr>
<tr>
<td>{2}</td>
<td>exactly 2 times</td>
<td>\d{2}</td>
<td>01</td>
</tr>
<tr>
<td>{1,}</td>
<td>one or more times</td>
<td>\w{1,}</td>
<td>smiling</td>
</tr>
<tr>
<td>{2,4}</td>
<td>2, 3 or 4 times</td>
<td>\w{2,4}</td>
<td>1234</td>
</tr>
<tr>
<td>*</td>
<td>0 or more times</td>
<td>A*B</td>
<td>AAAAB</td>
</tr>
<tr>
<td>?</td>
<td>once or none(lazy)</td>
<td>\d+?</td>
<td>1 in 12345</td>
</tr>
</tbody></table>

In [25]:
print("\n \t  1 \r 2 \r 3 \f 4 \v 5") 


 	  1  2  3  4  5


In [12]:
re.match('hello ', 'hello world ok')

<re.Match object; span=(0, 6), match='hello '>

### When to escape

<span style="font-family:New York Times; font-size:1em; color:green;">
    
There are 12 characters with special meanings: 
* the backslash `\`
* the caret `^`
* the dollar sign `$`
* the period or dot `.`
* the vertical bar or pipe symbol `|`
* the question mark `?`
* the asterisk or star `*`
* the plus sign `+`
* the opening parenthesis `(`
* the closing parenthesis `)`
* the opening square bracket `[`
* the opening curly brace `{`

he `r` means the the following is a "raw string", ie. backslash characters are treated literally instead of signifying special treatment of the following character 

so `\n` is a single newline
and `r'\n'` is two characters - a backslash and the letter `'n'`
another way to write it would be `'\\n'` because the first backslash escapes the second

In [59]:
r'\n'
'\n'
print('\n a')
print(r'\n a')

'\\n'

'\n'


 a
\n a


In [64]:
#
re.escape('^a.*$+{}()[\e]+&b%')

'\\^a\\.\\*\\$\\+\\{\\}\\(\\)\\[\\\\e\\]\\+\\&b%'

In [63]:
# re.compile separate definition of the regex from its use.
pattern = re.compile('\\\\')
result = pattern.match("\\author")
result.group()   

'\\'

In [None]:
pattern = re.compile(r"\+") # match alphanumeric characters followed by a + character
result = pattern.search("file+")#
result.group()

In [None]:
string = '[em]e400824[/em]'
pattern = re.compile(r'^\[\w{2}\]\w+\d+\[\/\w{2}\]')
x = pattern.match(string)
x.group()

In [35]:
pattern = []
pattern.append(re.compile(r"\w"))
pattern.append(re.compile(r"\w+"))
pattern.append(re.compile(r"\w+\s\w"))
pattern.append(re.compile(r"\w+\s\w+\s\w+"))

# Let's feed in some strings to match
string = "regex is awesome!"
string2 = "regex is awesome\
Holy grail"
# Then call a matching method to match our pattern
for i in pattern:
    #match ⇒ find something at the beginning of the string and return a match object.
    result = i.match(string)
    #print(type(result))
    print(result.group()) 
    
for i in pattern:
    #search ⇒ find something anywhere in the string and return a match object.
    result = i.search(string2)
    print(result.group())

r
regex
regex i
regex is awesome
r
regex
regex i
regex is awesomeHoly


In [13]:
def regex(string):
    """This function returns at least one matching digit."""
    pattern = re.compile(r"\d{1,}") # For brevity, this is the same as r"\d+"
    result = pattern.match(string)
    if result:
        return  result.group()
    return None

# Call our function, passing in our string
regex("007 James Bond")

'007'

In [14]:
line = "dance more"
result = re.match(r"[^\d+]", line)
print(result)     # Prints out 'dance'
result.group()

<re.Match object; span=(0, 1), match='d'>


'd'

In [15]:
ass2 = 6
lisx = []
ass2 == True 
if lisx: 
    print("WTF")

False

## `re.search`

 ## `re.split`

In [16]:
text = "John Doe\
    Jane Doe\
    Jin Du\
    Chin Doe"
results = re.split(r"\n+", text)
print(results)

['John Doe    Jane Doe    Jin Du    Chin Doe']


## `re.findall`

In [96]:
def finder(string):
    """This function finds all the words in a given string."""
    result_list = re.findall(r"\w+", string)
    return result_list

finder("finding dory")

['finding', 'dory']

In [18]:
salaries = "120000   140000   10000   1000   200"

result_list = re.findall(r"\d{2,6}", salaries)
print(result_list)     

['120000', '140000', '10000', '1000', '200']


## `re.sub`

In [95]:
# Substitutions
pattern = re.compile(r"[0-9]+")
result = pattern.sub("__", "there is only 12 thing 23 do")
print(result)

there is only __ thing __ do


## `?=`

In [93]:
# Instead of matching from the start of the string, match an entity that's followed by the pattern
pattern = re.compile(r'\w+(?=\sfox)')
result = pattern.search("The quick brown fox")
print(result.group()) 

brown


In [None]:
number = eval(input("Enter your number\n"))


def monetizer(number):
    """This function adds a thousands separator using comma characters."""
    number = str(number)
    try:
        if type(int(number)) == int:
            # Format into groups of three from the right to the left
            pattern = re.compile(r'\d{1,3}(?=(\d{3})+(?!\d))')
            # substitute with a comma then return
            return pattern.sub(r'\g<0>,', number)
    except:
        return "Not a Number"

# Function call, passing in number as an argument
print(monetizer(number))

# Apply in real situation

In [12]:
infile = "FW.txt"
outfile = "FW_noemoji.txt"

pattern = re.compile(r'\[\w{2}\]\w+\d+\[\/\w{2}\]')
inContent = open(infile).read()
emoji = re.findall(pattern, inContent)
output = pattern.sub("", inContent)
output;

In [13]:
?re.escape(string)

 > re.escape(string)

 > Return string with all non-alphanumerics backslashed; this is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in

In [None]:
re.escape(r'\n')
re.escape(r'\\')

In [None]:
pattern2 = re.compile(r'\n')
newLineSymbol = re.findall(pattern2, output)
noNewLineSymbol = pattern2.sub('',output) 
noNewLineSymbol

In [None]:
pattern3 = re.compile(r'\@\{\w{3}\:\d+\,\w+\:\w+\,\w+\:\d\}')
atuin = re.findall(pattern2, output)
noatuin = pattern3.sub('', noNewLineSymbol)
noatuin

# Regular expression in action

For instance, if I have a vector `["2", "3", "0", "1"]`, I would like to obtain something like `["aa", "aaa", "", "a"]`

In [None]:


a = tf.Variable(["2", "3", "0", "1"], dtype=tf.dtypes.string)
res = tf.strings.regex_replace(a, "([0-9]+)", r"a" * int("\\1"))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(res)) # It should show ["aa", "aaa", "", "a"]

In [52]:
x = ["2", "3", "0", "1"]
for i in range(len(x)):
    x[i]=int(x[i])*"a" 
x 

['aa', 'aaa', '', 'a']

In [46]:
x = ["2", "3", "0", "1"]
[int(i)*'a' for i in x]

['aa', 'aaa', '', 'a']

## find words occuring before `xx`

In [104]:
text="the women marathon unknown introduced at the summer olympics los angeles usa and unknown won"  
items=re.finditer('unknown',text)  #as there are 2 unknown
#items.groups()
pattern = re.compile(r'\w+(?=\sunknown)')
result = pattern.search(text)

result.groups()

()

In [55]:
regex = r"([\s\S]*?)(unknown)"
test_str = "the women marathon unknown introduced at the summer olympics los angeles usa and unknown won"
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

Match 1 was found at 0-26: the women marathon unknown
Group 1 found at 0-19: the women marathon 
Group 2 found at 19-26: unknown
Match 2 was found at 26-88:  introduced at the summer olympics los angeles usa and unknown
Group 1 found at 26-81:  introduced at the summer olympics los angeles usa and 
Group 2 found at 81-88: unknown
