In [None]:
import re

MULTILINE

In [None]:
text = """Line one
Line two
Line three"""

# Without re.MULTILINE, '^' anchors only at the very start of the string.
matches1 = re.compile(r'^Line').findall(text)
# With re.MULTILINE, '^' also anchors right after every newline.
matches2 = re.compile(r'^Line', flags=re.MULTILINE).findall(text)
# With re.DOTALL, '.' matches newlines too, so '.+' consumes the whole text.
matches3 = re.compile(r'.+', flags=re.DOTALL).findall(text)

for found in (matches1, matches2, matches3):
    print(found)

['Line']
['Line', 'Line', 'Line']
['Line one\nLine two\nLine three']


In [None]:
text = """
    Line one
    Line two
    Line three"""

# Under re.MULTILINE '^' matches at the start of every line. Note that '\s+'
# also matches newlines, so the empty first line is removed together with the
# indentation that follows it.
leading_whitespace = re.compile(r'^\s+', re.MULTILINE)
cleaned_text = leading_whitespace.sub('', text)

print(cleaned_text)


Line one
Line two
Line three


Greedy vs Non-greedy matches

In [None]:
import re

# Two adjacent paragraphs on one line: the input that exposes the difference.
html_text = "<p>This is the first paragraph.</p><p>This is the second paragraph.</p>"

# '.*' is greedy: it runs to the LAST '</p>' it can reach.
greedy_regex = re.compile(r"<p>.*</p>")
# '.*?' is lazy: it stops at the FIRST '</p>' it can reach.
nongreedy_regex = re.compile(r"<p>.*?</p>")

# Search once with each pattern, then report the full matched text (group 0).
greedy_match = greedy_regex.search(html_text)
nongreedy_match = nongreedy_regex.search(html_text)

print("Greedy match:", greedy_match[0])
print("Non-greedy match:", nongreedy_match[0])


Greedy match: <p>This is the first paragraph.</p><p>This is the second paragraph.</p>
Non-greedy match: <p>This is the first paragraph.</p>


In [None]:
# Collect every non-overlapping match of the greedy pattern: a single match
# that spans both paragraphs.
greedy_match_all = [found.group() for found in greedy_regex.finditer(html_text)]
print("Greedy match:", greedy_match_all)

# Same for the non-greedy pattern: one match per paragraph.
nongreedy_match_all = [found.group() for found in nongreedy_regex.finditer(html_text)]
print("Non-greedy match:", nongreedy_match_all)


Greedy match: ['<p>This is the first paragraph.</p><p>This is the second paragraph.</p>']
Non-greedy match: ['<p>This is the first paragraph.</p>', '<p>This is the second paragraph.</p>']


In [None]:


html_text = """
<html>
<head>
<title>My Website</title>
</head>
<body>
<h1>Welcome to My Website</h1>
<p>This is a paragraph.</p>
<!-- This is a comment -->
<p>This is another paragraph with a <a href="http://example.com">link</a>.</p>
</body>
</html>
"""

# Extract the content of the title tag.
# The lazy '.*?' stops at the first '</title>' instead of the last one.
title = re.search(r'<title>(.*?)</title>', html_text, re.DOTALL)
print(title.group(1))

# Extract all paragraph texts.
# With re.DOTALL a greedy '.*' would run from the first '<p>' to the LAST
# '</p>' and return both paragraphs (and everything between them) as a single
# match; the lazy '.*?' yields one match per <p>...</p> pair.
paragraphs = re.findall(r'<p>(.*?)</p>', html_text, re.DOTALL)
for paragraph in paragraphs:
    print(paragraph.strip())  # Outputs the text within each <p> tag

# Extract href attribute values from anchor tags.
# '[^"]*' cannot cross the closing quote, so each URL is captured on its own
# even if a line contains several attributes or several links.
links = re.findall(r'<a href="([^"]*)">', html_text)
for link in links:
    print(link)  # Outputs the URLs in href attributes


My Website
This is a paragraph.</p>
<!-- This is a comment -->
<p>This is another paragraph with a <a href="http://example.com">link</a>.
http://example.com


Substitution with Regular Expressions

In [None]:
text, pattern, replacement = "Hello World", r"World", "Regex"

# Replace every occurrence of the pattern with the literal replacement text.
new_text = re.compile(pattern).sub(replacement, text)

print(new_text)

Hello Regex


In [None]:
# '\1' must repeat exactly what group 1 captured; no digit run captured at the
# start is repeated after the final '-', so there is no match.
res = re.compile(r'(\d+)-(.*)-\1').search('123-abd-113')
print(res)
# '\2' must repeat exactly what group 2 captured; 'abd' is followed by 'abc',
# so again there is no match.
res = re.compile(r'(\d+)-(.*)-\2').search('123-abd-abc')
print(res)

None
None


Use regex groups in the substitution to reference parts of the matched patterns.

In [None]:
text = "John: 1234, Jane: 5678"
pattern = r"(\w+): (\d+)"
# \1 and \2 in the replacement refer to the captured name and number.
replacement = r"User: \1, ID: \2"

name_and_id = re.compile(pattern)
new_text = name_and_id.sub(replacement, text)
print(new_text)

User: John, ID: 1234, User: Jane, ID: 5678


If you need to perform a more complex substitution that cannot be defined by a simple replacement string, you can pass a function as the `repl` argument of `re.sub`. The function is called with the match object for each non-overlapping occurrence of the pattern, and its return value is used as the replacement string.

In [None]:

text = "The price is 100 dollars"
pattern = r"\d+"


def double(match):
    """Return the matched integer multiplied by two, as a string.

    Called by ``re.sub`` once per match; the matched text is also printed so
    the individual calls are visible in the cell output.
    """
    matched_digits = match.group(0)
    print(matched_digits)
    return str(2 * int(matched_digits))


# Passing a callable as the replacement lets the new text be computed per match.
new_text = re.compile(pattern).sub(double, text)

print(new_text)


100
The price is 200 dollars


Flags

In [None]:

text = """Here is an EXAMPLE
With multiple LINES of text
And some more TEXT
"""

# Default matching is case-sensitive: lowercase 'example' is not in the text
print(re.findall('example', text, flags=0))

# re.IGNORECASE makes the same pattern match 'EXAMPLE'
print(re.findall('example', text, flags=re.IGNORECASE))

# Without re.MULTILINE, '^' only matches at the very start of the string
print(re.findall('^And', text, flags=0))

# With re.MULTILINE, '^' also matches at the start of every line
print(re.findall('^And', text, flags=re.MULTILINE))

# Without re.DOTALL, '.' does not match newlines, so the match cannot cross lines
print(re.findall('LINES.*TEXT', text, flags=0))

# With re.DOTALL, '.' matches newlines as well
print(re.findall('LINES.*TEXT', text, flags=re.DOTALL))
# Lazy variant: same result here because only one uppercase 'TEXT' follows
print(re.findall('LINES.*?TEXT', text, flags=re.DOTALL))
# Adding re.IGNORECASE lets the lazy match stop at the nearer lowercase 'text'
print(re.findall('LINES.*?TEXT', text, flags=re.DOTALL | re.IGNORECASE))

# re.VERBOSE ignores whitespace in the pattern and allows inline comments
pattern = re.compile(r"""
    ^        # beginning of the line
    (\w+)    # word characters group
    \s+      # one or more whitespace
    (\w+)    # second word
    """, flags=re.VERBOSE | re.MULTILINE)

print(pattern.search(text))


[]
['EXAMPLE']
[]
['And']
[]
['LINES of text\nAnd some more TEXT']
['LINES of text\nAnd some more TEXT']
['LINES of text']
<re.Match object; span=(0, 7), match='Here is'>


Output?

In [None]:
# Only the word 'Reginald' is optional; both surrounding spaces are mandatory.
pattern = r'John (Reginald)? Smith'
name_regex = re.compile(pattern)

match1 = name_regex.fullmatch('John Reginald Smith')
match2 = name_regex.fullmatch('John  Smith')
match3 = name_regex.fullmatch('John H Smith')

for label, result in (("Match 1:", match1), ("Match 2:", match2), ("Match 3:", match3)):
    print(label, bool(result))


Match 1: True
Match 2: True
Match 3: False


In [None]:
# The optional group now includes the trailing space, so the middle name and
# its separator appear or disappear together.
pattern = r'John (Reginald )?Smith'
name_regex = re.compile(pattern)

match1 = name_regex.fullmatch('John Reginald Smith')
match2 = name_regex.fullmatch('John Smith')
match3 = name_regex.fullmatch('John H Smith')

for label, result in (("Match 1:", match1), ("Match 2:", match2), ("Match 3:", match3)):
    print(label, bool(result))


Match 1: True
Match 2: True
Match 3: False


**Given the following text: "Error: Invalid input at line 42. Warning: Deprecated API call at line 108." Write a regex to extract the line numbers**

In [None]:
text = "Error: Invalid input at line 42. Warning: Deprecated API call at line 108."
pattern = r'line (\d+)'

# Walk over every match and keep only the captured digits (group 1).
line_numbers = [found.group(1) for found in re.finditer(pattern, text)]

print(line_numbers)



['42', '108']


**Consider a scenario where you have log data with timestamps in the format [YYYY-MM-DD HH:MM:SS]. Write a regex to match timestamps that correspond to the time between 9 PM and 11:59 PM, regardless of the date.**

In [None]:
# Any date, followed by an hour of 21, 22 or 23 and valid minutes/seconds.
time_pattern = r'^\[\d{4}-\d{2}-\d{2} (2[1-3]):[0-5][0-9]:[0-5][0-9]\]$'

# Sample log data
log_data = [
    "[2023-11-02 21:00:00]",
    "[2023-11-02 22:15:30]",
    "[2023-11-02 23:59:59]",
    "[2023-11-02 20:59:59]",
    "[2023-11-02 00:00:00]"
]

# Report, for each entry, whether its time falls between 21:00:00 and 23:59:59
for log in log_data:
    is_evening = re.match(time_pattern, log) is not None
    print(f"Timestamp matches: {log}" if is_evening else f"Timestamp does not match: {log}")

Timestamp matches: [2023-11-02 21:00:00]
Timestamp matches: [2023-11-02 22:15:30]
Timestamp matches: [2023-11-02 23:59:59]
Timestamp does not match: [2023-11-02 20:59:59]
Timestamp does not match: [2023-11-02 00:00:00]


**Can you construct a regular expression that matches a string containing a pair of words, where the second word is the reverse of the first? For instance, it should match "defied deifed" or "level level".**

In [None]:
def reverse_word(match):
    """Return group 1 of ``match`` reversed.

    The captured word and its reversal are printed before returning so the
    comparison can be followed in the cell output.
    """
    word = match.group(1)
    flipped = word[::-1]
    print(word)
    print(flipped)
    return flipped


test_strings = ["defied deifed", "level level", "test tset", "hello olleho"]

# A single whole word
pattern = r'\b(\w+)\b'

for string in test_strings:
    # Split the candidate into its words
    words = re.findall(pattern, string)

    # A pair qualifies when there are exactly two words and the second one is
    # the first one reversed; the reversal is only computed for two-word input.
    is_reversed_pair = len(words) == 2 and words[1] == reverse_word(re.match(pattern, words[0]))
    print(f'Match: {string}' if is_reversed_pair else f'No match: {string}')

defied
deifed
Match: defied deifed
level
level
Match: level level
test
tset
Match: test tset
hello
olleh
No match: hello olleho


**Here's a regex intended to match a quoted string: "(.*)". However, it's currently also matching across multiple quoted sections, such as in He said, "This is one," but it was not "the only one." What can be modified to make sure it matches only within individual quotes without spanning across?**


 **Check for and correct doubled words in a text using regular expressions.**

In [None]:

def remove_doubled_words(text):
    """Collapse runs of a repeated word into a single occurrence.

    A run is a word followed one or more times by whitespace and the same word
    again, compared case-insensitively. The first occurrence is kept as
    written and the repeats, with the whitespace before them, are removed.

    Args:
        text: The string to clean up.

    Returns:
        A copy of ``text`` with each run of repeated words reduced to one word.
    """
    # '(?:\s+\1\b)+' consumes ALL consecutive repeats in one match. Matching a
    # single repeat would reduce "the the the" only to "the the", because
    # re.sub replaces non-overlapping matches and never rescans its own output.
    # The trailing '\b' keeps "is isn't" from being treated as a repeat.
    pattern = r'\b(\w+)(?:\s+\1\b)+'

    return re.sub(pattern, r'\1', text, flags=re.IGNORECASE)


text_with_doubles = "This is an example example of a text with with doubled words words."


corrected_text = remove_doubled_words(text_with_doubles)

print("Original text:", text_with_doubles)
print("Corrected text:", corrected_text)


Original text: This is an example example of a text with with doubled words words.
Corrected text: This is an example of a text with doubled words.


**Given the text 'content' below, write a file 'regrouped.txt' in which the data is regrouped in the following way:**

1289,T83456

1289,Z22334

1205,T10032

1205,B77301

1205,T10786

1205,C77502

1410,K34001

1410,T98987

In [None]:
content = """customer-id 1289
T83456
customer-id 1289
customer-id 1205
T10032
B77301
customer-id 1205
customer-id 1410
K34001
T98987
customer-id 1410
customer-id 1205
T10786
C77502
customer-id 1205
customer-id 1289
Z22334
customer-id 1289"""

# A block is everything between a "customer-id N" line and the next
# "customer-id N" line carrying the same N (enforced by the \1 backreference).
# - '(\d+)' accepts ids of any length and, unlike a class that also contains
#   '\n', can never swallow a line break as part of the id.
# - '^' / '$' with re.MULTILINE pin both delimiters to whole lines, so an id
#   cannot match as a mere prefix of a longer id.
# - re.DOTALL lets the lazy '.*?' span the article lines in between.
block_regex = re.compile(
    r"^customer-id (\d+)$(.*?)^customer-id \1$",
    re.MULTILINE | re.DOTALL,
)

# customer id -> article numbers, in order of first appearance of each customer
data = {}
for x in block_regex.finditer(content):
    key, values = x.groups()
    # split() discards the surrounding newlines and yields one item per article
    data.setdefault(key, []).extend(values.split())
print(data)

with open("regrouped.txt", "w", encoding="utf-8") as fh:
    for key, art_nos in data.items():
        for art_no in art_nos:
            fh.write(f"{key},{art_no}\n")

('1289', '\nT83456\n')
{}
('1205', '\nT10032\nB77301\n')
{'1289': '\nT83456\n'}
('1410', '\nK34001\nT98987\n')
{'1289': '\nT83456\n', '1205': '\nT10032\nB77301\n'}
('1205', '\nT10786\nC77502\n')
{'1289': '\nT83456\n', '1205': '\nT10032\nB77301\n', '1410': '\nK34001\nT98987\n'}
('1289', '\nZ22334\n')
{'1289': '\nT83456\n', '1205': '\nT10032\nB77301\n\nT10786\nC77502\n', '1410': '\nK34001\nT98987\n'}
{'1289': '\nT83456\n\nZ22334\n', '1205': '\nT10032\nB77301\n\nT10786\nC77502\n', '1410': '\nK34001\nT98987\n'}
