In [None]:
# Show the value of every top-level expression in a cell, not only the last one.
# Several cells below rely on this to display more than one result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import re  

# Basic usage of `re`

## `re.match`

* [Using regex for text manipulation in python](https://stackabuse.com/using-regex-for-text-manipulation-in-python/)
* [An introduction to regex in python](https://scotch.io/tutorials/an-introduction-to-regex-in-python)

`re.compile` compiles a regular-expression pattern into a pattern object, which can then be reused for matching later.

<table>
<tbody><tr>
<th>Element</th>
<th>Description</th>
</tr>
<tr>
<td><strong>.</strong></td>
<td>This element matches any character except \n</td>
</tr>
<tr>
<td><strong>\d</strong></td>
<td>This matches any digit [0-9]</td>
</tr>
<tr>
<td><strong>\D</strong></td>
<td>This matches non-digit characters [^0-9]</td>
</tr>
<tr>
<td><strong>\s</strong></td>
<td>This matches whitespace character [ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\S</strong></td>
<td>This matches non-whitespace character [^ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\w</strong></td>
<td>This matches alphanumeric character [a-zA-Z0-9_]</td>
</tr>
<tr>
<td><strong>\W</strong></td>
<td>This matches any non-alphanumeric character [^a-zA-Z0-9_]</td>
</tr>
</tbody></table>

<table>
<tbody><tr>
<th>Quantifier</th>
<th>Description</th>
<th>Example</th>
<th>Sample match</th>
</tr>
<tr>
<td>+</td>
<td>one or more</td>
<td>\w+</td>
<td>ABCDEF097</td>
</tr>
<tr>
<td>{2}</td>
<td>exactly 2 times</td>
<td>\d{2}</td>
<td>01</td>
</tr>
<tr>
<td>{1,}</td>
<td>one or more times</td>
<td>\w{1,}</td>
<td>smiling</td>
</tr>
<tr>
<td>{2,4}</td>
<td>2, 3 or 4 times</td>
<td>\w{2,4}</td>
<td>1234</td>
</tr>
<tr>
<td>*</td>
<td>0 or more times</td>
<td>A*B</td>
<td>AAAAB</td>
</tr>
<tr>
<td>?</td>
<td>once or none; placed after another quantifier it makes that quantifier lazy</td>
<td>\d+?</td>
<td>1 in 12345</td>
</tr>
</tbody></table>

### When to escape

There are 12 characters with special meanings: the backslash `\`, the caret `^`, the dollar sign `$`, the period or dot `.`, the vertical bar or pipe symbol `|`, the question mark `?`, the asterisk or star `*`, the plus sign `+`, the opening parenthesis `(`, the closing parenthesis `)`, the opening square bracket `[`, and the opening curly brace `{`. These special characters are often called "metacharacters". Most of them are errors when used alone, so escape them with a backslash when you want to match them literally.

In [None]:
# Raw string literal: `\e` is not a valid escape sequence, so the plain-quoted form
# triggers an invalid-escape warning on newer Python versions. The string value
# (backslash followed by "e") is exactly the same.
re.escape(r'^a.*$+{}()[\e]')

In [None]:
# Both literals are non-raw, so every backslash is written twice:
# the regex source is `\\` (one literal backslash) and the subject text is `\author`.
pattern = re.compile('\\\\')
result = re.match(pattern, "\\author")
print(result.group(0))

In [None]:
# `\+` matches one literal plus sign; left unescaped, `+` would act as a quantifier.
pattern = re.compile(r"\+")
# search() scans the whole string, so the "+" at the end of the text is found.
result = re.search(pattern, "file+")
print(result.group(0))

In [None]:
# A two-letter opening tag, word characters ending in digits, then a two-letter closing tag.
string = '[em]e400824[/em]'
pattern = re.compile(r'^\[\w{2}\]\w+\d+\[\/\w{2}\]')
x = re.match(pattern, string)
x.group(0)

In [None]:
# Four patterns of growing length; match() anchors each one at the start of the text.
pattern = [
    re.compile(source)
    for source in (r"\w", r"\w+", r"\w+\s\w", r"\w+\s\w+\s\w+")
]

# Subject text shared by every pattern
string = "regex is awesome!"

# For each pattern, show the type of the match object and the text it matched
for compiled in pattern:
    result = compiled.match(string)
    print(type(result))
    print(result.group())

In [None]:
def regex(string):
    """Return the digits that ``string`` starts with, or None if it starts with a non-digit."""
    # `\d{1,}` is the long spelling of `\d+`; match() only looks at the start of the string.
    leading_digits = re.match(r"\d{1,}", string)
    return leading_digits.group() if leading_digits else None

# Try it on a string that begins with digits
regex("007 James Bond")

In [None]:
line = "dance more"
# The quantifier belongs outside the brackets: inside a character class `+` is just a
# literal plus sign, so `[^\d+]` matched only the single character "d".
# `[^\d\s]+` consumes the leading run of characters that are neither digits nor whitespace.
result = re.match(r"[^\d\s]+", line)
print(result)     # Match object whose matched text is 'dance'
result.group()

In [None]:
# Scratch check: comparing a value to True is not the same as testing its truthiness.
ass2 = 6
# NOTE(review): lisx is not referenced anywhere else in the visible notebook — confirm it is needed.
lisx = []
# Bare expression, displayed as a cell result: 6 == True evaluates to False ...
ass2 == True 
# ... yet a non-zero int is truthy, so the branch below still runs.
if ass2: 
    print("WTF")

## `re.search`

## `re.split`

In [None]:
# One name per line. A triple-quoted literal keeps the real newline characters;
# backslash line-continuations inside a quoted string join the lines instead,
# which leaves nothing for r"\n+" to split on.
text = """John Doe
Jane Doe
Jin Du
Chin Doe"""
results = re.split(r"\n+", text)
print(results)

## `re.findall`

In [None]:
def finder(string):
    """Return every run of word characters found in ``string``, in order, as a list."""
    word_pattern = re.compile(r"\w+")
    return word_pattern.findall(string)

finder("finding dory")

In [None]:
salaries = "120000   140000   10000   1000   200"

# Collect every run of two to six consecutive digits
result_list = re.compile(r"\d{2,6}").findall(salaries)
print(result_list)

## `re.sub`

In [None]:
pattern = re.compile(r"[0-9]+")
# Replace each run of digits with a double underscore
result = re.sub(pattern, "__", "there is only 12 thing 23 do")
print(result)

## `?=`

A positive lookahead `(?=...)` matches an entity only when it is immediately followed by the given pattern; the lookahead text itself is not included in the match.

In [None]:
# The word counts only when whitespace plus "fox" comes right after it;
# the text inside the lookahead is checked but not consumed.
pattern = re.compile(r'\w+(?=\sfox)')
result = re.search(pattern, "The quick brown fox")
print(result.group(0))

In [None]:
# Keep the typed text as-is instead of passing it to eval(): eval() would execute any
# expression the user enters. monetizer() converts its argument with str() and validates
# it with int(), so a plain string is all it needs.
# NOTE: this cell blocks until something is typed, so it needs an interactive session.
number = input("Enter your number\n").strip()


def monetizer(number):
    """Insert comma thousands separators into an integer.

    Args:
        number: An int, or anything whose ``str()`` form is accepted by ``int()``.

    Returns:
        The text of ``number`` with a comma after every group of three digits
        counted from the right, or the string "Not a Number" when the text is
        not a valid integer (for example a float or arbitrary words).
    """
    number = str(number)
    # Validate only; the formatted result is built from the original text.
    # Catching ValueError specifically keeps unrelated errors (and Ctrl-C) visible.
    try:
        int(number)
    except ValueError:
        return "Not a Number"
    # One to three digits that are followed by whole groups of three digits up to
    # the end of the digit run, i.e. every position where a separator belongs.
    pattern = re.compile(r'\d{1,3}(?=(\d{3})+(?!\d))')
    # Re-emit each matched group with a trailing comma
    return pattern.sub(r'\g<0>,', number)

# Format the value read into `number` and print the result
print(monetizer(number))

## Apply in real situation

In [None]:
infile = "FW.txt"
# NOTE(review): outfile is never written in the visible cells — confirm whether the
# cleaned text is meant to be saved there.
outfile = "FW_noemoji.txt"

# Bracketed tag pairs of the shape [xx]word123[/xx]
pattern = re.compile(r'\[\w{2}\]\w+\d+\[\/\w{2}\]')
# Read through a context manager so the file handle is closed as soon as the text is loaded.
# The platform default encoding is used, as before.
with open(infile) as source_file:
    inContent = source_file.read()
emoji = pattern.findall(inContent)    # every tag that is about to be removed
output = pattern.sub("", inContent)   # the text with those tags stripped
output

 > re.escape(string)

 > Return string with all non-alphanumerics backslashed; this is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in it.

In [None]:
# r'\n' is two characters (a backslash and the letter n); escaping doubles the backslash.
re.escape(r'\n')
# r'\\' is two backslashes; each one is escaped, giving four.
re.escape(r'\\')

In [None]:
# Collect, then strip, every newline character in `output`
pattern2 = re.compile(r'\n')
newLineSymbol = pattern2.findall(output)
noNewLineSymbol = re.sub(pattern2, '', output)
noNewLineSymbol

In [None]:
# Markup of the shape @{abc:123,word:word,word:1}
pattern3 = re.compile(r'\@\{\w{3}\:\d+\,\w+\:\w+\,\w+\:\d\}')
# Collect the occurrences with pattern3 itself. Searching with pattern2 would only
# return newline characters. The same text is used for the search and the removal,
# so `atuin` lists exactly what the substitution below strips out.
atuin = pattern3.findall(noNewLineSymbol)
noatuin = pattern3.sub('', noNewLineSymbol)
noatuin