<a href="https://colab.research.google.com/github/Amulyanrao7777/NLP/blob/main/program1(with_defns_and_explanations)_Regex_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

# ==========================================
# PART 0: THE LEVELS OF NLP
# ==========================================
# We will use one sentence to show how NLP breaks down language.
# SENTENCE: "Unbelievably, the brave explorers found the hidden treasure."

print("=== PART 0: THE 6 LEVELS OF LINGUISTIC ANALYSIS ===")
print("Sentence: 'Unbelievably, the brave explorers found the hidden treasure.'\n")

# 1. MORPHOLOGY
print("1. MORPHOLOGY")
print("   - Meaning: The study of the internal structure of words (roots, prefixes, suffixes).")
print("   - Example: 'Unbelievably' -> Un- (prefix) + believ (root) + -able (suffix) + -ly (suffix)\n")

# 2. LEXICAL ANALYSIS
print("2. LEXICAL ANALYSIS")
print("   - Meaning: Identifying the category (Part of Speech) of each word.")
print("   - Example: 'Found' -> Verb; 'Explorers' -> Noun; 'Brave' -> Adjective\n")

# 3. SYNTAX
print("3. SYNTAX")
print("   - Meaning: The rules for arranging words into grammatically correct sentences.")
print("   - Example: We group [The brave explorers] as a Noun Phrase and [found] as the Verb.\n")

# 4. SEMANTICS
print("4. SEMANTICS")
print("   - Meaning: The literal meaning of words and sentences, independent of context.")
print("   - Example: 'Treasure' literally means valuable objects like gold or gems.\n")

# 5. PRAGMATICS
print("5. PRAGMATICS")
print("   - Meaning: Understanding the intended meaning based on real-world context.")
print("   - Example: If a child says this in a backyard, 'treasure' might just mean a cool rock.\n")

# 6. REGULAR EXPRESSIONS
print("6. REGULAR EXPRESSIONS (Regex)")
print("   - Meaning: A computational tool used to find patterns in text.")
print("   - Usage: We use Regex to perform levels 1 & 2 (Morphology & Lexical Analysis) automatically.")
print("="*60 + "\n")

=== PART 0: THE 6 LEVELS OF LINGUISTIC ANALYSIS ===
Sentence: 'Unbelievably, the brave explorers found the hidden treasure.'

1. MORPHOLOGY
   - Meaning: The study of the internal structure of words (roots, prefixes, suffixes).
   - Example: 'Unbelievably' -> Un- (prefix) + believ (root) + -able (suffix) + -ly (suffix)

2. LEXICAL ANALYSIS
   - Meaning: Identifying the category (Part of Speech) of each word.
   - Example: 'Found' -> Verb; 'Explorers' -> Noun; 'Brave' -> Adjective

3. SYNTAX
   - Meaning: The rules for arranging words into grammatically correct sentences.
   - Example: We group [The brave explorers] as a Noun Phrase and [found] as the Verb.

4. SEMANTICS
   - Meaning: The literal meaning of words and sentences, independent of context.
   - Example: 'Treasure' literally means valuable objects like gold or gems.

5. PRAGMATICS
   - Meaning: Understanding the intended meaning based on real-world context.
   - Example: If a child says this in a backyard, 'treasure' might ju

In [None]:
# ==========================================
# PART 1: NORMALIZATION (Cleaning the Mess)
# ==========================================
print("--- Part 1: Text Normalization ---")

# Scenario: Data coming from the web is often messy.
raw_text = "   UnBeLiEvAbLy... the EXPLORERS found it!!!    "

# 1. Strip (Remove spaces from start/end)
step1 = raw_text.strip()
print("stripped: ", step1)
# 2. Lowercase (Standardize case)
step2 = step1.lower()
print("lowercased: ", step2)
# 3. Remove Punctuation (Basic Cleaning)
# [^\w\s] -> ^ means NOT, \w is word char, \s is space.
step3 = re.sub(r"[^\w\s]", "", step2)

print(f"\nOriginal: '{raw_text}'")
print(f"Cleaned:  '{step3}'")
print("\n" + "-"*30 + "\n")

--- Part 1: Text Normalization ---
stripped:  UnBeLiEvAbLy... the EXPLORERS found it!!!
lowercased:  unbelievably... the explorers found it!!!

Original: '   UnBeLiEvAbLy... the EXPLORERS found it!!!    '
Cleaned:  'unbelievably the explorers found it'

------------------------------



In [None]:
# ==========================================
# PART 2: MORPHOLOGY WITH REGEX
# ==========================================
print("--- Part 2: Morphology (Finding Word Patterns) ---")

text_morph = "The active students started acting like actors. They were unhappy and disorganized."

# Task A: Find all words related to 'act' (active, acting, actors).
# \b    -> Boundary (Start/End of word)
# \w* -> Any letters that follow (suffixes)
root_pattern = r"\bact\w*\b"
matches_act = re.findall(root_pattern, text_morph)
print(f"1. Words sharing the root 'act': {matches_act}")

# Task B: Find Negation (Prefixes like 'un-' or 'dis-')
# [ud][ni][sh]? -> Matches 'un' or 'dis'
# \w+           -> The rest of the word
negation_pattern = r"\b(?:un|dis)\w+\b"
negations = re.findall(negation_pattern, text_morph)
print(f"2. Words with negative prefixes: {negations}")

# Task C: Find Agents (Suffixes like '-er' or '-or')
# \w+           -> The root
# [eo]r         -> 'er' or 'or'
# \b            -> End of word
agent_pattern = r"\b\w+[eo]r\b"
agents = re.findall(agent_pattern, text_morph)
print(f"3. Words representing 'Agents' (doers): {agents}")

print("\n" + "-"*30 + "\n")

--- Part 2: Morphology (Finding Word Patterns) ---
1. Words sharing the root 'act': ['active', 'acting', 'actors']
2. Words with negative prefixes: ['unhappy', 'disorganized']
3. Words representing 'Agents' (doers): []

------------------------------



In [None]:
# ==========================================
# PART 3: INFORMATION EXTRACTION
# ==========================================
print("--- Part 3: Extracting Real-World Data (Complex Regex) ---")

social_data = """
Post 1: Loving the new #AI course!
Post 2: Contact support@rvu.edu.in for help.
Post 3: Call 555-0199 or (555) 123-4567 for emergency.
"""

# 1. Hashtags (# followed by words)
hashtags = re.findall(r"#\w+", social_data)
print("Hashtags:", hashtags)

# 2. Emails (User @ Domain)
emails = re.findall(r"[\w\.-]+@[\w\.-]+", social_data) #improve: ^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$

print("Emails:  ", emails)

# 3. Complex Phone Numbers (The "Complex Regex" Milestone)
# Matches: 555-0199 OR (555) 123-4567
# \(?     -> Optional opening parenthesis
# \d{3}   -> 3 digits (Area code)
# \)?     -> Optional closing parenthesis
# [-\s]?  -> Optional separator (dash or space)
# \d{3}   -> 3 digits
# [-\s]?  -> Optional separator
# \d{4}   -> 4 digits
phone_pattern = r"\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}"
phones = re.findall(phone_pattern, social_data)
print("Phones:  ", phones)

print("\n" + "-"*30 + "\n")

--- Part 3: Extracting Real-World Data (Complex Regex) ---
Hashtags: ['#AI']
Emails:   ['support@rvu.edu.in']
Phones:   ['(555) 123-4567']

------------------------------



In [None]:
# ==========================================
# PART 4: THE LAB CHALLENGE üèÜ
# ==========================================
print("--- Part 4: The Lab Challenge ---")

# MISSION:
# 1. Normalize the text (lowercase).
# 2. Extract the dates (DD/MM/YYYY).
# 3. Find the 'action' words ending in 'ed' (Morphology).
# 4. Extract the prices.

data = """
REPORT: On 12/05/2024, the Client purchasED
a gold plan for $250.50. It was approveD on 13/05/2024.
Is LiaR. Liar Told you That.
Give me your eyes, I need sunshine.
#skip
I'm Coming to the CoTTAGE.
Lily&Jane
"""

print(f"Raw Data:\n{data}")

# --- STUDENT SECTION: TRY TO FILL THESE IN! ---

# 1. Normalize
norm_data = data.lower()
print(f"1. Normalized: {norm_data.replace(chr(10), ' ')}") # removing newlines for display

# 2. Extract Dates (Pattern: Digits/Digits/Digits)
dates = re.findall(r"\d{2}/\d{2}/\d{4}", norm_data)
print(f"2. Dates Found: {dates}")

# 3. Morphology (Past tense words ending in 'ed')
past_tense = re.findall(r"\b\w+ed\b", norm_data)
print(f"3. Past Tense Actions: {past_tense}")

# 4. Extract Prices ($ followed by digits)
prices = re.findall(r"\$\d+(?:\.\d+)?", data)
print(f"4. Prices: {prices}")

print("\n=== LAB 1 COMPLETE ===")

--- Part 4: The Lab Challenge ---
Raw Data:

REPORT: On 12/05/2024, the Client purchasED
a gold plan for $250.50. It was approveD on 13/05/2024. 
Is LiaR. Liar Told you That.
Give me your eyes, I need sunshine.
#skip
I'm Coming to the CoTTAGE.
Lily&Jane

1. Normalized:  report: on 12/05/2024, the client purchased a gold plan for $250.50. it was approved on 13/05/2024.  is liar. liar told you that. give me your eyes, i need sunshine. #skip i'm coming to the cottage. lily&jane 
2. Dates Found: ['12/05/2024', '13/05/2024']
3. Past Tense Actions: ['purchased', 'approved', 'need']
4. Prices: ['$250.50']

=== LAB 1 COMPLETE ===


#Example2:

In [None]:
new_text = "   A NeW rEpOrT on 01/01/2023 said: 'The UNHAppy Customer purchasED a premium package for $99.99. Reach us via info@example.com or call (123) 456-7890. #NLPishere!!!   "

**Reasoning**:
Now that the `new_text` is defined, the next step is to apply text normalization techniques, which involve stripping leading/trailing whitespace, converting the text to lowercase, and removing punctuation to prepare it for further analysis, following the example from `PART 1: NORMALIZATION`.



In [None]:
import re

# 1. Strip (Remove spaces from start/end)
normalized_text_step1 = new_text.strip()
# 2. Lowercase (Standardize case)
normalized_text_step2 = normalized_text_step1.lower()
# 3. Remove Punctuation (Basic Cleaning) using the same regex as in PART 1
normalized_text = re.sub(r"[^\w\s]", "", normalized_text_step2)

print(f"Original text: '{new_text}'")
print(f"Normalized text: '{normalized_text}'")

Original text: '   A NeW rEpOrT on 01/01/2023 said: 'The UNHAppy Customer purchasED a premium package for $99.99. Reach us via info@example.com or call (123) 456-7890. #NLPishere!!!   '
Normalized text: 'a new report on 01012023 said the unhappy customer purchased a premium package for 9999 reach us via infoexamplecom or call 123 4567890 nlpishere'


**Reasoning**:
Following the text normalization, the next step is to apply morphology analysis and information extraction using regular expressions. I will extract dates, past tense verbs, words with negative prefixes and agent suffixes, prices, hashtags, emails, and phone numbers from the `normalized_text_step2` (lowercased with punctuation) to ensure the patterns match correctly.



In [None]:
print("--- Morphology Analysis and Information Extraction ---")

# 1. Extract Dates (DD/MM/YYYY)
dates_extracted = re.findall(r"\d{2}/\d{2}/\d{4}", normalized_text_step2)
print(f"Dates Found: {dates_extracted}")

# 2. Morphology: Past tense words ending in 'ed'
past_tense_words = re.findall(r"\b\w+ed\b", normalized_text_step2)
print(f"Past Tense Actions: {past_tense_words}")

# 3. Morphology: Words with negative prefixes (e.g., 'un-', 'dis-')
negation_prefixes = re.findall(r"\b(?:un|dis)\w+\b", normalized_text_step2)
print(f"Words with negative prefixes: {negation_prefixes}")

# 4. Morphology: Words representing 'Agents' (doers) ending in '-er' or '-or'
agent_words = re.findall(r"\b\w+[eo]r\b", normalized_text_step2)
print(f"Words representing 'Agents': {agent_words}")

# 5. Extract Prices ($ followed by digits, optionally with decimals)
prices_extracted = re.findall(r"\$\d+(?:\.\d+)?", normalized_text_step2)
print(f"Prices: {prices_extracted}")

# 6. Extract Hashtags (# followed by word characters)
hashtags_extracted = re.findall(r"#\w+", normalized_text_step2)
print(f"Hashtags: {hashtags_extracted}")

# 7. Extract Emails (User @ Domain)
emails_extracted = re.findall(r"[\w\.-]+@[\w\.-]+", normalized_text_step2)
print(f"Emails: {emails_extracted}")

# 8. Extract Phone Numbers (flexible format)
phone_pattern = r"\(?\d{3}\)?[-\s]?\d{3}[-\s]?\d{4}"
phone_numbers_extracted = re.findall(phone_pattern, normalized_text_step2)
print(f"Phones: {phone_numbers_extracted}")

--- Morphology Analysis and Information Extraction ---
Dates Found: ['01/01/2023']
Past Tense Actions: ['purchased']
Words with negative prefixes: ['unhappy']
Words representing 'Agents': ['customer', 'for']
Prices: ['$99.99']
Hashtags: ['#nlpishere']
Emails: ['info@example.com']
Phones: ['(123) 456-7890']
