In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import re 

In [None]:
import pandas as pd 

https://regex101.com/

# Learn by doing 
* https://regexr.com/

<span style="font-family:New York Times; font-size:1em; color:green;">
    
* [Using regex for text manipulation in python](https://stackabuse.com/using-regex-for-text-manipulation-in-python/)
* [An introduction to regex in python](https://scotch.io/tutorials/an-introduction-to-regex-in-python)

## `re.match`

<table>
<tbody><tr>
<th>Element</th>
<th>Description</th>
</tr>
<tr>
<td><strong>.</strong></td>
<td>This element matches any character except \n</td>
</tr>
<tr>
<td><strong>\d</strong></td>
<td>This matches any digit [0-9]</td>
</tr>
<tr>
<td><strong>\D</strong></td>
<td>This matches non-digit characters [^0-9]</td>
</tr>
<tr>
<td><strong>\s</strong></td>
<td>This matches whitespace character [ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\S</strong></td>
<td>This matches non-whitespace character [^ \t\n\r\f\v]</td>
</tr>
<tr>
<td><strong>\w</strong></td>
<td>This matches alphanumeric character [a-zA-Z0-9_]</td>
</tr>
<tr>
<td><strong>\W</strong></td>
<td>This matches any non-alphanumeric character [^a-zA-Z0-9_]</td>
</tr>
</tbody></table>
<table>
<tbody><tr>
<th>Quantifier</th>
<th>Description</th>
<th>Example</th>
<th>Sample match</th>
</tr>
<tr>
<td>+</td>
<td>one or more</td>
<td>\w+</td>
<td>ABCDEF097</td>
</tr>
<tr>
<td>{2}</td>
<td>exactly 2 times</td>
<td>\d{2}</td>
<td>01</td>
</tr>
<tr>
<td>{1,}</td>
<td>one or more times</td>
<td>\w{1,}</td>
<td>smiling</td>
</tr>
<tr>
<td>{2,4}</td>
<td>2, 3 or 4 times</td>
<td>\w{2,4}</td>
<td>1234</td>
</tr>
<tr>
<td>*</td>
<td>0 or more times</td>
<td>A*B</td>
<td>AAAAB</td>
</tr>
<tr>
<td>?</td>
<td>once or none; placed after another quantifier it makes that quantifier lazy</td>
<td>\d+?</td>
<td>1 in 12345</td>
</tr>
</tbody></table>

### When to escape

<span style="font-family:New York Times; font-size:1em; color:green;">
    
There are 12 characters with special meanings: 
* the backslash `\`
* the caret `^`
* the dollar sign `$`
* the period or dot `.`
* the vertical bar or pipe symbol `|`
* the question mark `?`
* the asterisk or star `*`
* the plus sign `+`
* the opening parenthesis `(`
* the closing parenthesis `)`
* the opening square bracket `[`
* the opening curly brace `{`

In [None]:
# The backslash before `e` is written as `\\` because `\e` is not a valid string escape;
# the text passed to re.escape is unchanged (a literal backslash followed by `e`).
re.escape('^a.*$+{}()[\\e]+&b%\n')

In [None]:
# Compiling first separates the definition of a regex from its use.
pattern = re.compile(r"\\")  # regex for one literal backslash
result = pattern.match(r"\author")
result.group()

In [None]:
def regex(string):
    """Return the run of digits at the very start of ``string``, or None if there is none."""
    # r"\d{1,}" is the long spelling of r"\d+"; match() only looks at the start of the string.
    leading_digits = re.match(r"\d{1,}", string)
    return leading_digits.group() if leading_digits else None

regex("007 James Bond")

 ## `re.split`

In [None]:
# One name per line. A triple-quoted literal keeps the real newline characters;
# backslash line continuations would join everything into a single line and
# leave nothing for the split below to find.
text = """John Doe
Jane Doe
Jin Du
Chin Doe"""

# Split on every run of one or more newlines.
results = re.split(r"\n+", text)
print(results)

## `re.findall`

In [None]:
def finder(string):
    """Return a list of every run of word characters found in ``string``, in order."""
    word_pattern = re.compile(r"\w+")
    return word_pattern.findall(string)

finder("finding dory")

In [None]:
salaries = "120000   140000   10000   1000   200"
# Every run of 2 to 6 consecutive digits is one match.
result_list = [found.group() for found in re.finditer(r"\d{2,6}", salaries)]
print(result_list)

### Find the part before a certain string 

https://regex101.com/r/8fY3CF/1

In [None]:
s = '''aa1 !=5
bb1 >=1
cc1 ==1
dd2 <= 2
e3 <2'''

# Raw string, so `\s` reaches the regex engine instead of being treated as a
# (invalid) string escape. Groups: left operand, comparison operator, right operand.
# The two-character operators are listed before `>` and `<` so they are tried first.
rgx = r'([a-zA-Z0-9]+)\s*(!=|==|<=|>=|>|<)\s*([a-zA-Z0-9]+)'
re.findall(rgx, s)

# Index 2 of each result tuple is the right-hand operand.
# NOTE(review): the section heading says "before", which would be index 0 - confirm which is wanted.
# A lookahead such as '.+(?=>=|==|<=|!=|<|>)' returns the text before the operator instead.
list2 = [groups[2] for groups in re.findall(rgx, s)]

print(list2)

## `?=`

In [None]:
# A lookahead `(?=...)` requires the following text to match without consuming it:
# here, the word that is directly followed by " fox".
sentence = "The quick brown fox"
pattern = re.compile(r'\w+(?=\sfox)')
result = pattern.search(sentence)
print(result.group(0))

https://stackoverflow.com/questions/58020385/extracting-strings-using-regular-expression/58020544#58020544

In [None]:
s = '''LOW QUALITY PROTEIN: cysteine proteinase 5-like  [Solanum pennellii]
PREDICTED: LOW QUALITY PROTEIN: uncharacterized protein LOC107059219 [Solanum pennellii]
XP_019244624.1 PREDICTED: peroxidase 40-like [Nicotiana attenuata]
RVW92024.1 Retrovirus-related Pol polyprotein from transposon TNT 1-94 [Vitis vinifera]
hypothetical protein VITISV_035070 [Vitis vinifera]'''

# Raw string, so `\s`, `\w` and `\[` reach the regex engine instead of being
# treated as (invalid) string escapes.
# Groups: 0 = optional colon, 1 = protein description, 2 = bracketed species.
rgx = r'(:?)\s([\w\s-]+)\s(\[.+\])'

# Keep only the description (group index 1) of every match.
list1 = [groups[1] for groups in re.findall(rgx, s)]

list1

# Same idea on a single line, without the optional-colon group.
s = "hypothetical protein VITISV_035070 [Vitis vinifera]"
for m in re.findall(r'([\w\s-]+)\s(\[.+\])', s):
    print(m[0])

## `re.sub`

In [None]:
# Source question: https://stackoverflow.com/q/57764825

In [None]:
# Remove every `\m{` opener and every `}` closer. The subject is a raw string
# because `\m` is not a valid escape in a normal string literal.
print(re.sub(r"\\m{|}", '', r'The fat \m{cat sat} on \m{the} mat.'))

In [None]:
# Replace each run of digits with a double underscore.
pattern = re.compile(r"[0-9]+")
result = re.sub(pattern, "__", "there is only 12 thing 23 do")
print(result)

In [None]:
# Replace every character except `@` and `.` with `x` (inside a character class the dot is literal).
re.sub(r'[^@.]', 'x', 'hello@gmail.com')

https://stackoverflow.com/questions/57721563/how-to-write-a-regular-expression-to-match-some-below-string-using-python/57721705#57721705

In [None]:
s = """
[ 1.1 ] 1. A method of providing a master
[ 12.1 ] 12. An apparatus for providing
[ 39.3 ] b. one or more control point applications
[ 39.8 ] iv. a server application programming interface
[ 30.2 ] a. a client application programming
"""
# Match a closing bracket, one whitespace character, a 1-2 character marker and its dot
# (e.g. "] 12."), and replace the whole match with "] " so the marker is dropped.
print(re.sub(r'\]\s\w{1,2}\.', '] ', s))

https://stackoverflow.com/questions/57838472/replace-all-occurrences-of-character-between-two-specific-characters#57838489

In [None]:
def repl(m):
    """Return the text of match ``m`` with every comma replaced by a semicolon.

    Used below as the replacement callback passed to ``re.sub``.

    Args:
        m: match object whose full match (group 0) is rewritten.

    Returns:
        The matched text with "," replaced by ";".
    """
    # Named `matched` rather than `str` so the builtin type is not shadowed.
    matched = m.group(0)
    return matched.replace(",", ";")


inp = "Hello World blah, blah, §Rd, Vasai - East, Thane§ also Goodbye, world!"
print(inp)
# The lazy `.*?` stops at the nearest closing `§`, so only text between a pair is rewritten.
print(re.sub('§.*?§', repl, inp))

# Regular expression in action

## find words occurring before `xx`

In [None]:
# Group 1 is the shortest stretch of text that is followed by the word "unknown" (group 2).
# The pattern has its own name so that it does not overwrite the `regex` function
# defined earlier in this notebook. No flags are passed: the pattern contains no
# `^` or `$`, so re.MULTILINE had no effect.
unknown_regex = r"([\s\S]*?)(unknown)"
test_str = "the women marathon unknown introduced at the summer olympics los angeles usa and unknown won"
matches = re.finditer(unknown_regex, test_str)
for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=matchNum,
        start=match.start(),
        end=match.end(),
        match=match.group()))
    # Capture groups are numbered from 1; group 0 is the whole match printed above.
    for groupNum in range(1, len(match.groups()) + 1):
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=groupNum,
            start=match.start(groupNum),
            end=match.end(groupNum),
            group=match.group(groupNum)))

## Match sequence after a specified character
* https://stackoverflow.com/questions/10768924/match-sequence-using-regex-after-a-specified-character

## Sort according to name of the file

In [None]:
# Sample entries: each holds a number in scientific notation between "Time:" and "h".
my_list = ['Time:  1.00000E+01 h', 'Time:  1.00000E+02 h', 'Time:  1.50000E+01 h']

In [None]:
# Matches one digit, a dot, more digits, then "E+" and the exponent digits (e.g. 1.50000E+01).
pattern = re.compile(r"\d\.\d+E\+\d+")
# Sort by the numeric value of the first such number in each entry; returns a new list.
sorted(my_list, key = lambda m : float(pattern.findall(m)[0]))

In [None]:
# Regex-free alternative: the number is the second whitespace-separated field.
# Note that .sort() reorders my_list in place.
my_list.sort(key=lambda x: float(x.split()[1]))
print(my_list)

In [None]:
internal_list = [" 1001 Support ", "1021 Training", "1022", "1023", "1033 Procedures"]
# Regex route: the first run of four digits in each entry.
[re.search(r"\d{4}", entry).group() for entry in internal_list]
# Plain-string route: the first whitespace-separated token of each entry.
internal_list = [entry.split()[0] for entry in internal_list]
print(internal_list)

In [None]:
pattern = re.compile(r"\d+")
m = [
    'paketone0.dump.xlsx', 'paketone100000.dump.xlsx',
    'paketone1000000.dump.xlsx', 'paketone1004000.dump.xlsx',
    'paketone1008000.dump.xlsx', 'paketone1012000.dump.xlsx',
    'paketone1016000.dump.xlsx', 'paketone1020000.dump.xlsx',
    'paketone1024000.dump.xlsx', 'paketone1028000.dump.xlsx',
    'paketone1032000.dump.xlsx', 'paketone1036000.dump.xlsx',
    'paketone104000.dump.xlsx', 'paketone1040000.dump.xlsx'
]
# Compare the embedded numbers as integers. Comparing them as strings is
# lexicographic ('1000000' < '104000') and leaves this list in its original order.
# The key parameter is called `name` so it does not shadow the list `m`.
m_sorted = sorted(m, key=lambda name: int(pattern.findall(name)[0]))
m_sorted

## Slice a string between delimiters
* https://stackoverflow.com/questions/57087310/i-would-like-to-slice-a-string-bounded-between-and

In [None]:
# The parentheses are escaped so the pattern matches the literal text "(1)" rather than a group.
pattern = re.compile(r"\(1\)")
# List every occurrence of the marker...
pattern.findall("(1)Basking Ridge,(1) NJ")
# ...and remove it by substituting the empty string.
pattern.sub('', "(1)Basking Ridge, NJ")

In [None]:
x = "(1)Basking Ridge,(1) NJ"
# Splitting on the marker and joining the pieces back together removes every "(1)"
# without a regex. The result is bound to `y` so the final line can display it.
y = "".join(x.split("(1)"))
y

# Find a string
https://stackoverflow.com/questions/57060369/python-split-a-string-by-a-word-which-contains-a-substring#57060573

In [None]:
meetingStrings = [
    "appointment",
    "meet",
    "interview"
]
text = "Fix me a meeting in 2 days"
for keyword in meetingStrings:
    # Search the original text case-insensitively and slice from the end of the hit.
    # Testing `text.lower()` but splitting `text` fails when the text is capitalised
    # differently from the keyword. re.escape keeps the keyword literal.
    hit = re.search(re.escape(keyword), text, flags=re.IGNORECASE)
    if hit:
        # Everything after the first occurrence of the keyword.
        txt = text[hit.end():]
        print(txt)

In [None]:
text = "Fix me a meeting in 2 days"
# Split on any keyword plus the rest of its word; the capture group keeps the
# keyword itself in the result list.
parts = re.split("({})\\w*".format("|".join(meetingStrings)), text)
print(parts)
# The last piece is the text after the keyword word.
print(parts[-1].strip())

In [None]:
# Raw string, so `\w` reaches the regex engine instead of being an invalid string escape.
# Matches "meet" followed by at least one more word character.
pattern = re.compile(r"meet\w+")
text = "Fix me a meeting in 2 days"
pattern.findall(text)

In [None]:
# Keep only the words that follow a word containing one of the meeting keywords.
l = text.split()
for i in meetingStrings:
    for idx, j in enumerate(l):
        if i in j:
            l = l[idx + 1:]
            # Stop scanning for this keyword: the enumeration still walks the old
            # list, so a later hit would apply a stale index to the shortened list.
            break
' '.join(l)

In [None]:
s = """Today is Mar 4, 2014 and tomorrow will be 2014-03-05
       and yesterday was 03/03/2014 and now it is currently 2014-03-04 02:02:03"""
# Alternatives are tried left to right, so the date-with-time form is listed
# before the plain date form that it starts with.
ACCEPTABLE_REGEX_DATETIME_PATTERNS = [
    r'\d{4}\-\d{1,2}\-\d{1,2}\s\d{1,2}\:\d{1,2}\:\d{1,2}',
    r'\d{4}\-\d{1,2}\-\d{1,2}',
    r'\d{1,2}\/\d{1,2}\/\d{2,4}',
    r'[a-zA-Z]{1,3}\.?\s\d{1,2}\,?\s\d{2,4}'
]
dt_regex = re.compile('|'.join(ACCEPTABLE_REGEX_DATETIME_PATTERNS))
dt_regex.findall(s)

# https://regex101.com/r/PpcUKi/1

# Eliminate numbers outside quotes

```md
2 persons goes to watch "Adam 2"
persons goes to watch "Adam 2"
```

In [None]:
x = '2  3 persons "Adam  3" goes to watch "Adam  2"'
# A digit is removed together with the character after it, unless that character
# is a double quote or a dollar sign, so a digit right before a closing quote stays.
pattern = re.compile(r'\d[^$"]')
cleaned = pattern.sub("", x)
cleaned
print(cleaned)

In [None]:
text = 'Hello, "find.me-_/\\" please help with python regex'
# A double-quoted run made only of letters, digits, `_`, `.`, `/`, `\` and `-`.
pattern = r'"([A-Za-z0-9_\./\\-]*)"'
m = re.compile(pattern).search(text)

print(m.group(0))

# Process text 
https://regex101.com/r/PpcUKi/3

In [None]:
test_str = """

ASDF
wqer rtre 34 $^&% fsfa
DDwrgd 43 er 1. ewrtfg
324rfegf 4gfgre

QWE
pritoy Fbhfg 45345 )*9
tret 345 gret54
retre 56 gre ger
retgrh 546ttre

PIIPUU
gre tt HKH rre345 
sdrfetre
ewrewrqwr werfewrt34vds

ret
gre
wretretertettre

MMNNBMB
aserew Sfjlkjf
gdf
rerettyrdfv re HFGHFFHF er
ergre ret retre 
ret retretret 

reg regrtgh rertgre tret

"""

# Capture each section that opens with a run of three or more uppercase letters:
#   (?:^|\n\n)        a section begins at the start of the string or after a blank line
#   ([A-Z]{3,}.*?)    the uppercase run plus, lazily, everything after it
#                     (re.DOTALL lets `.` cross line breaks)
#   (?=\n\n[A-Z]{3,}\n|$)   stop just before a blank line followed by a line of three
#                     or more uppercase letters, or at the end of the string
matches = re.findall(r'(?:^|\n\n)([A-Z]{3,}.*?)(?=\n\n[A-Z]{3,}\n|$)',
                     test_str,
                     flags=re.DOTALL)
print(matches)

In [None]:
lines = [
    'ASDF\n', 'wqer rtre 34 $^&% fsfa\n', 'DDwrgd 43 er 1. ewrtfg\n',
    '324rfegf 4gfgre\n', '\n', 'QWE\n', 'pritoy Fbhfg 45345 )*9\n',
    'tret 345 gret54\n', 'retre 56 gre ger\n', 'retgrh 546ttre\n', '\n',
    'PIIPUU\n', 'gre tt HKH rre345 \n', 'sdrfetre\n',
    'ewrewrqwr werfewrt34vds\n', '\n', 'ret\n', 'gre\n', 'wretretertettre\n',
    '\n', 'MMNNBMB\n', 'aserew Sfjlkjf\n', 'gdf\n',
    'rerettyrdfv re HFGHFFHF er\n', 'ergre ret retre \n', 'ret retretret \n',
    '\n', 'reg regrtgh rertgre tret'
]
pattern = re.compile(r'[A-Z]+')
# A heading is a line that consists only of uppercase letters. fullmatch() on the
# line without its newline says so directly: lines with no uppercase run simply
# do not match (no exception to swallow), and a final line without a trailing
# newline is not truncated.
heading_indices = [
    i for i, v in enumerate(lines) if pattern.fullmatch(v.rstrip('\n'))
]
for i in heading_indices:
    print(i)

* https://stackoverflow.com/questions/57175334/need-to-append-inbetween-text-in-python/57176530#57176530

In [None]:
raw_location = {
    "Raynham, MA Topsham, ME", "Savannah, GA Cary, NC",
    "Irvine, CA Bradenton, FL"
}
pattern = re.compile(r'\w+\s\w+')
newset = set()
for i in raw_location:
    # Rewrite every "word<whitespace>word" run as "word/word" using that run's own
    # two words. A callback replacement means a second run is not overwritten with
    # the first run's words, and a string without any run is kept unchanged.
    newi = pattern.sub(lambda found: '/'.join(found.group().split()), i)
    newset.add(newi)
print(newset)

In [None]:
raw_location = {
    "Raynham, MA Topsham, ME", "Savannah, GA Cary, NC",
    "Irvine, CA Bradenton, FL"
}
pattern = re.compile(r'\w+\s\w+')
newset = set()
for i in raw_location:
    # Take the first "word<space>word" run and split it into its two words...
    y = pattern.findall(i)[0].split(' ')
    # ...then substitute the slash-joined form for the run.
    newset.add(pattern.sub('{}/{}'.format(y[0], y[1]), i))
print(newset)

In [None]:
a_raw_location = "Raynham, MA Topsham, ME"
# Show what `pattern` (compiled in an earlier cell) matches on one sample location.
pattern.findall(a_raw_location)

In [None]:
raw_location = {
    "Raynham, MA Topsham, ME", "Savannah, GA  Cary, NC",
    "Irvine, CA Bradenton, FL"
}
pattern = re.compile(r'\w+\s+\w+')
newset = set()
for i in raw_location:
    # Split once into the comma-separated fields.
    parts = i.split(',')
    # In the middle field, join the two words with "/". str.split() with no argument
    # collapses any run of whitespace, so "GA  Cary" (two spaces) still yields two
    # words; split(' ') would produce an empty second word and drop "Cary".
    middle = pattern.sub(lambda found: '/'.join(found.group().split()), parts[1])
    newi = parts[0] + ',' + middle + ',' + parts[2]
    newset.add(newi)
print(newset)

In [None]:
raw_location = {
    "Raynham ok  , MA Topsham, ME", "Savannah, GA  Cary, NC",
    "Irvine, CA Bradenton, FL"
}
# Regex-free variant: split on commas, then on whitespace.
newset = set()
for i in raw_location:
    tem = i.split(',')
    # The two words of the middle field; split() collapses repeated spaces.
    x = tem[1].strip().split()
    # The last piece is the third comma-separated field (tem[2]); using tem[1]
    # here would repeat the middle field and drop the final state code.
    newi = tem[0].strip() + ', ' + x[0] + '/' + x[1] + ', ' + tem[2].strip()
    newset.add(newi)
newset

In [None]:
content = """
id-HTRY098WE
id-KNGT371WE?witkl
id-ZXV555NQE?phnu
eh-VCBG075LK
"""
# Print the run of uppercase letters and digits that follows each "id-" prefix;
# the run ends at the first other character, such as "?".
for found in re.finditer(r'id-([A-Z0-9]+)', content):
    item = found.group(1)
    print(item)