In [2]:
import re

try:
    import nltk
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    for resource in ["wordnet", "punkt", "stopwords", "punkt_tab", "averaged_perceptron_tagger"]:
        try:
            nltk.download(resource, quiet=True)
        except Exception:
            pass

    lemmatizer = WordNetLemmatizer()
    STOP_WORDS = set(stopwords.words("english"))
    NLTK_OK = True

except ImportError:
    NLTK_OK = False

QUESTIONS = [
    "What are the running techniques used by athletes?",
    "How do scientists study the behaviors of wolves?",
    "Which countries are known for their innovative technologies?",
    "How does deforestation affect the lives of animals?",
    "What methods are used for purifying drinking water?",
    "How do teachers encourage students who are struggling?",
    "What are the best practices for managing financial risks?",
    "How do bees communicate the locations of flowers?",
    "What medications are prescribed for treating infections?",
    "How does regular exercising improve human health?",
]

PASSAGES = {
    "P1 - Sports & Science": """
    Athletes run and train daily to improve their performance. Scientists have studied
    the behaviors and movements of wolves in the wild. Researchers are developing
    innovative technologies to help athletes track their physical activities. Countries
    around the world invest heavily in new technologies for sports medicine.
    """,
    "P2 - Environment & Health": """
    Deforestation is destroying the lives and habitats of many animals. Purification
    methods are used to clean contaminated water sources. Doctors prescribe medications
    to treat bacterial infections. Regular exercise improves the health of individuals
    and reduces risks of chronic diseases. Bees communicate by dancing to indicate the
    location of flowers and food sources.
    """,
    "P3 - Education & Finance": """
    Teachers encourage struggling students by using adaptive learning strategies.
    Financial advisors manage investment risks by diversifying portfolios. Students
    who are motivated tend to perform better in examinations. Companies are adopting
    new risk management practices to handle economic uncertainties. Education systems
    worldwide are innovating their teaching methods.
    """,
}

def simple_tokenize(text):
    return re.findall(r"[a-z]+", text.lower())
BASIC_STOP = {
    "a","an","the","is","are","was","were","be","been","being","have","has","had",
    "do","does","did","will","would","could","should","may","might","shall","can",
    "of","in","on","at","to","for","with","by","from","up","about","into","through",
    "and","but","or","nor","so","yet","both","either","neither","not",
    "i","you","he","she","it","we","they","me","him","her","us","them",
    "what","which","who","whom","whose","how","when","where","why",
    "this","that","these","those","there","their","they","his","its","our","your",
}

SUFFIX_RULES = [
    ("ies", "y"), ("ied", "y"), ("ying", "y"),
    ("ves", "f"), ("ves", "fe"),
    ("ated", "ate"), ("ating", "ate"),
    ("ness", ""), ("ment", ""), ("tion", ""), ("ation", ""),
    ("ers", "er"), ("ing", ""), ("ings", ""),
    ("es", ""), ("ed", ""), ("s", ""),
]

def rule_lemmatize(word):
    for suffix, replacement in SUFFIX_RULES:
        if word.endswith(suffix) and len(word) - len(suffix) > 2:
            return word[:len(word) - len(suffix)] + replacement
    return word

def get_keywords_exact(text):
    tokens = simple_tokenize(text)
    stop = STOP_WORDS if NLTK_OK else BASIC_STOP
    return {t for t in tokens if t not in stop and len(t) > 2}

def get_keywords_lemma(text):
    if NLTK_OK:
        tokens = word_tokenize(text.lower())
        stop = STOP_WORDS
        return {lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in stop and len(t) > 2}
    else:
        tokens = simple_tokenize(text)
        stop = BASIC_STOP
        return {rule_lemmatize(t) for t in tokens if t not in stop and len(t) > 2}

def match_score(q_keywords, p_keywords):
    common = q_keywords & p_keywords
    return len(common), common

def run_demo():
    mode = "NLTK WordNetLemmatizer" if NLTK_OK else "Rule-based suffix lemmatizer"
    print("=" * 72)
    print(f"  NLP LEMMATIZATION — QUESTION vs PASSAGE MATCHING DEMO")
    print(f"  Lemmatizer: {mode}")
    print("=" * 72)

    passage_exact = {name: get_keywords_exact(text) for name, text in PASSAGES.items()}
    passage_lemma = {name: get_keywords_lemma(text) for name, text in PASSAGES.items()}

    total_exact = total_lemma = 0

    for qi, question in enumerate(QUESTIONS, 1):
        q_exact = get_keywords_exact(question)
        q_lemma = get_keywords_lemma(question)

        print(f"\n{'─'*72}")
        print(f"Q{qi:02d}: {question}")
        print(f"  Keywords (exact) : {sorted(q_exact)}")
        print(f"  Keywords (lemma) : {sorted(q_lemma)}")
        print()

        for pname in PASSAGES:
            sc_e, cm_e = match_score(q_exact, passage_exact[pname])
            sc_l, cm_l = match_score(q_lemma, passage_lemma[pname])
            gain = sc_l - sc_e
            total_exact += sc_e
            total_lemma += sc_l

            flag = "IMPROVED" if gain > 0 else ("same" if gain == 0 else "decreased")
            print(f"  [{pname}]")
            print(f"    Exact  — score: {sc_e:2d}  matched: {sorted(cm_e)}")
            print(f"    Lemma  — score: {sc_l:2d}  matched: {sorted(cm_l)}  [{flag}]")

    print(f"\n{'='*72}")
    print("  SUMMARY")
    print(f"{'='*72}")
    print(f"  Total match score  BEFORE lemmatization : {total_exact}")
    print(f"  Total match score  AFTER  lemmatization : {total_lemma}")
    improvement = ((total_lemma - total_exact) / max(total_exact, 1)) * 100
    print(f"  Overall improvement                     : +{improvement:.1f}%")
    print(f"{'='*72}")

    print("""
  EXPLANATION:
  - EXACT matching treats "running", "runs", "ran" as different words.
  - LEMMATIZATION reduces each word to its base form:
      running -> run,  technologies -> technology,
      behaviors -> behavior,  medications -> medication
  - After lemmatization, more question keywords match passage keywords,
    improving recall without hurting precision.
  - Lemmatization is a core step in NLP pipelines for search engines,
    chatbots, QA systems, and information retrieval.
""")


if __name__ == "__main__":
    run_demo()

  NLP LEMMATIZATION — QUESTION vs PASSAGE MATCHING DEMO
  Lemmatizer: NLTK WordNetLemmatizer

────────────────────────────────────────────────────────────────────────
Q01: What are the running techniques used by athletes?
  Keywords (exact) : ['athletes', 'running', 'techniques', 'used']
  Keywords (lemma) : ['athlete', 'running', 'technique', 'used']

  [P1 - Sports & Science]
    Exact  — score:  1  matched: ['athletes']
    Lemma  — score:  1  matched: ['athlete']  [same]
  [P2 - Environment & Health]
    Exact  — score:  1  matched: ['used']
    Lemma  — score:  1  matched: ['used']  [same]
  [P3 - Education & Finance]
    Exact  — score:  0  matched: []
    Lemma  — score:  0  matched: []  [same]

────────────────────────────────────────────────────────────────────────
Q02: How do scientists study the behaviors of wolves?
  Keywords (exact) : ['behaviors', 'scientists', 'study', 'wolves']
  Keywords (lemma) : ['behavior', 'scientist', 'study', 'wolf']

  [P1 - Sports & Science]
  