# This notebook demonstrates fundamental Natural Language Processing (NLP) tasks using popular Python libraries.
# Each exercise focuses on a specific NLP technique:
# 1. Tokenization with NLTK
# 2. Named Entity Recognition (NER) with SpaCy
# 3. Sentiment Analysis with TextBlob
# 4. Text Summarization with Sumy

# ------------------------------------------------------
# Imports and setup
# ------------------------------------------------------
import nltk
nltk.download('punkt')       # Required for NLTK's word tokenizer
nltk.download('punkt_tab')   # Sometimes needed for tokenizers used by Sumy
from nltk.tokenize import word_tokenize, wordpunct_tokenize, TreebankWordTokenizer

import spacy

from textblob import TextBlob

# Install sumy only when it is missing from the current environment.
# pip is run through the interpreter executing this code (sys.executable), so
# the package lands in the same environment the imports below use. Unlike the
# IPython-only "!pip" shell escape, this is plain Python and therefore also
# works when the file is run as a regular script outside a notebook.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("sumy") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "sumy"])
    # Make the import system re-scan so the fresh install is importable now.
    importlib.invalidate_caches()

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
nltk.download('stopwords')   # Sumy often uses NLTK stopwords for summarization

# ------------------------------------------------------
# Exercise 1: Tokenization with NLTK
# ------------------------------------------------------
print("\n--- Exercise 1: Tokenization with NLTK ---")
# One short sentence is split into word-level tokens with NLTK's
# word_tokenize; the input and the resulting token list are shown together.
text1 = "Natural Language Processing enables computers to understand human language."
tokens = word_tokenize(text1)
print(f"Original text: '{text1}'", f"Tokens: {tokens}", sep="\n")

# ------------------------------------------------------
# Challenge 1: Tokenization (different tokenizers)
# ------------------------------------------------------
print("\n--- Challenge 1: Comparing different NLTK tokenizers ---")

challenge_text = "Wow! NLP is fun, isn't it? Let's try different tokenizers."

# The same sentence goes through three NLTK tokenizers so their outputs can
# be compared side by side:
#   - word_tokenize: the default word tokenizer
#   - wordpunct_tokenize: splits punctuation into separate tokens
#   - TreebankWordTokenizer: has its own handling of contractions/punctuation
tokens_word = word_tokenize(challenge_text)
tokens_wordpunct = wordpunct_tokenize(challenge_text)
treebank_tokenizer = TreebankWordTokenizer()
tokens_treebank = treebank_tokenizer.tokenize(challenge_text)

print(f"Original text: {challenge_text}")

# Each entry pairs a heading with the token list printed under it.
for _heading, _token_list in (
    ("\nword_tokenize:", tokens_word),
    ("\nwordpunct_tokenize (note how punctuation is split):", tokens_wordpunct),
    ("\nTreebankWordTokenizer (note how it splits \"isn't\" and others):", tokens_treebank),
):
    print(_heading)
    print(_token_list)

# ------------------------------------------------------
# Exercise 2: Named Entity Recognition with SpaCy
# ------------------------------------------------------
print("\n--- Exercise 2: Named Entity Recognition with SpaCy ---")
# Obtain the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed; in that case it is downloaded once and the
# load is retried.
# (Manual alternative: !python -m spacy download en_core_web_sm)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading en_core_web_sm model for SpaCy...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

text2 = "Google was founded by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University."
# Running the pipeline on the sentence yields a Doc whose .ents holds the
# detected entity spans.
doc = nlp(text2)
print(f"Original text: '{text2}'", "Named Entities:", sep="\n")
for ent in doc.ents:
    # Entity text is left-aligned in a 25-character column, then its label.
    print("  {:<25} {}".format(ent.text, ent.label_))

# ------------------------------------------------------
# Challenge 2: Named Entity Recognition on a new sentence
# ------------------------------------------------------
print("\n--- Challenge 2: NER on a custom sentence ---")

# First custom sentence: mixes a company, a product, a date, a price and a
# location so the pipeline has several kinds of mentions to label.
challenge_text2 = "Apple released the iPhone 15 in September 2023 for $799. The event took place in Cupertino, California."
doc2 = nlp(challenge_text2)

print(f"Challenge text: '{challenge_text2}'", "Named Entities found:", sep="\n")
for ent in doc2.ents:
    print("  {:<25} {}".format(ent.text, ent.label_))

# Optional second sentence with less common entities.
challenge_text3 = "OpenAI signed a 3-year research partnership with the University of Leeds in 2025 for £2 million."
doc3 = nlp(challenge_text3)

print(
    "\nAdditional challenge sentence:",
    f"'{challenge_text3}'",
    "Named Entities found:",
    sep="\n",
)
for ent in doc3.ents:
    print("  {:<25} {}".format(ent.text, ent.label_))

# ------------------------------------------------------
# Exercise 3: Sentiment Analysis with TextBlob
# ------------------------------------------------------
print("\n--- Exercise 3: Sentiment Analysis with TextBlob ---")
text3 = "I am extremely happy with the service provided."
# Wrap the sentence in a TextBlob and read its sentiment, which carries
# polarity (-1 negative .. 1 positive) and subjectivity (0 objective ..
# 1 subjective).
blob = TextBlob(text3)
sentiment = blob.sentiment
print("\n".join([f"Original text: '{text3}'", f"Sentiment: {sentiment}"]))

# ------------------------------------------------------
# Challenge 3: Sentiment Analysis on a review paragraph
# ------------------------------------------------------
print("\n--- Challenge 3: Sentiment on a review paragraph ---")

# A mixed review: praise for design and screen, complaints about battery and
# camera, and a disappointed closing sentence.
review_text = (
    "The phone’s design is beautiful and the screen is bright, "
    "but the battery life is disappointing and the camera often crashes. "
    "Overall, I feel a bit let down after all the hype."
)
review_blob = TextBlob(review_text)
review_sentiment = review_blob.sentiment

print(f"Review text: '{review_text}'")
# Both scores are reported with three decimal places.
print("Polarity: {:.3f}".format(review_sentiment.polarity))
print("Subjectivity: {:.3f}".format(review_sentiment.subjectivity))
print("Interpretation: Polarity > 0 is overall positive, < 0 is negative, close to 0 is neutral.")

# Sarcastic sentence: the wording is positive while the intended meaning is
# not, which makes it a useful stress test for the sentiment scores.
sarcasm_text = "Yeah, waiting 3 hours for customer support was just amazing."
sarcasm_blob = TextBlob(sarcasm_text)
sarcasm_sentiment = sarcasm_blob.sentiment

print("\nSarcasm example:")
print(f"Text: '{sarcasm_text}'")
print(
    "Polarity: {:.3f}, Subjectivity: {:.3f}".format(
        sarcasm_sentiment.polarity, sarcasm_sentiment.subjectivity
    )
)
print("Note: Simple sentiment models often struggle with sarcasm and may misclassify it.")

# ------------------------------------------------------
# Exercise 4: Text Summarization with Sumy
# ------------------------------------------------------
print("\n--- Exercise 4: Text Summarization with Sumy ---")
# Passage to summarize. The adjacent string literals inside the parentheses
# are concatenated by Python into one single-paragraph string.
text4 = (
    "Natural Language Processing (NLP) is a fascinating field at the intersection of computer science, "
    "artificial intelligence, and linguistics. It enables machines to understand, interpret, and generate "
    "human language, opening up a world of possibilities for applications ranging from chatbots and translation "
    "services to sentiment analysis and beyond. This field involves various techniques, including machine learning, "
    "deep learning, and rule-based methods, to process and analyze large amounts of text data. The goal of NLP is "
    "to bridge the communication gap between humans and computers, allowing for more natural and intuitive "
    "interactions. Its applications are constantly expanding, making it a critical area of research and development "
    "in today's technologically driven world."
)

# Build a Sumy plaintext parser directly from the string, using Sumy's
# Tokenizer configured for "english".
parser = PlaintextParser.from_string(text4, Tokenizer("english"))
# Create the LSA summarizer with its default settings.
# NOTE(review): parser and summarizer are only constructed here; the code that
# actually produces the summary is expected to follow this section.
summarizer = LsaSummarizer()
