In [10]:
import nltk
from nltk.tokenize import word_tokenize

# Sample sentence containing capitals, brackets, a contraction and punctuation.
text = "Natural Language Processing (NLP) is exciting, isn't it?"

# Lowercase the sentence first so every resulting token is case-normalized.
tokens = word_tokenize(str.lower(text))

print(tokens)

['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'exciting', ',', 'is', "n't", 'it', '?']


In [9]:
import nltk
# Fetch the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): a cell that calls word_tokenize appears before this one in the
# notebook; on a fresh machine this download presumably has to run first — confirm.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /home/harsh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# All imports for this cell are gathered here so its dependencies are visible
# at a glance instead of being scattered between the processing steps.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# The stopword list below is read from the 'stopwords' corpus; fetch it up
# front so this cell does not rely on a download made outside the notebook.
nltk.download('stopwords', quiet=True)

# --- Tokenization ---
# Lowercase before tokenizing so the later stopword comparison is case-insensitive.
text = "Natural Language Processing (NLP) is exciting, isn't it?"
tokens = word_tokenize(text.lower())

print(tokens)

# --- Stopword removal ---
# Keep only purely alphanumeric tokens (this drops punctuation and fragments
# such as "n't") that are not English stopwords.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

print(filtered_tokens)

# --- Stemming vs. lemmatization ---
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print(stemmer.stem('playing'))   # play
print(lemmatizer.lemmatize('playing', pos='v'))  # play

['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'exciting', ',', 'is', "n't", 'it', '?']
['natural', 'language', 'processing', 'nlp', 'exciting']
play
play


In [3]:
# Fetch the WordNet corpus into the local nltk_data directory.
# NOTE(review): presumably required by the WordNetLemmatizer used in an earlier
# cell of this notebook; on a fresh machine run this download first — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/harsh/nltk_data...


True

In [None]:
"""
Comprehensive Text Preprocessing Script
Includes cleaning HTML, removing emojis, spelling correction,
lowercasing, punctuation removal, tokenization, stopword removal,
stemming, and lemmatization.

Make sure to install dependencies:
    pip install beautifulsoup4 emoji textblob nltk

Download the required NLTK data once before first use:
    import nltk
    nltk.download('punkt')
    nltk.download('punkt_tab')  # tokenizer data looked up by newer NLTK releases
    nltk.download('stopwords')
    nltk.download('wordnet')
"""

from bs4 import BeautifulSoup
import emoji
from textblob import TextBlob
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure required nltk data packages are downloaded.
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up for
# sent_tokenize/word_tokenize; 'punkt' is kept so older releases still work.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

def clean_html(raw_html: str) -> str:
    """Return the text content of ``raw_html`` with the markup stripped.

    The input is parsed with BeautifulSoup's built-in "html.parser" backend
    and the text of the parsed document is returned.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text()

def remove_emojis(text: str) -> str:
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

def correct_spelling(text: str) -> str:
    """Return ``text`` after running it through TextBlob's spelling correction."""
    blob = TextBlob(text)
    corrected = blob.correct()
    # correct() yields a TextBlob object; hand a plain string back to the caller.
    return str(corrected)

def lower_case(text: str) -> str:
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate() to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def sentence_tokenize(text: str) -> list:
    """Return the sentences of ``text`` as produced by NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences

def word_tokenize_text(text: str) -> list:
    """Return the word-level tokens of ``text`` as produced by NLTK's ``word_tokenize``."""
    word_tokens = word_tokenize(text)
    return word_tokens

def remove_stopwords(words: list) -> list:
    """Return the entries of ``words`` that are not English stopwords.

    Each word is lowercased for the comparison only; words that are kept
    are returned unchanged and in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in words:
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return kept

def stem_words(words: list) -> list:
    """Return the Porter stem of every entry in ``words``, in order."""
    porter = PorterStemmer()
    return list(map(porter.stem, words))

def lemmatize_word(word: str, pos: str='n') -> str:
    """Return the WordNet lemma of ``word``.

    ``pos`` is the part-of-speech tag handed to the lemmatizer; it defaults
    to 'n' (noun).
    """
    return WordNetLemmatizer().lemmatize(word, pos=pos)

def main():
    """Run each preprocessing helper on a small sample and print the result."""
    # Sample inputs for the demonstrations below
    html_text = "<p>Hello <b>world</b>! 😊</p>"
    emoji_text = "I ❤️ Python 🐍!"
    spelling_text = "I havv a sleeppingg errr"
    mixed_text = "Hello, world! It's awesome. NLP is fun and powerful!"
    words_list = ['this', 'is', 'a', 'sample', 'text', 'played', 'playing']

    # --- String-level cleaning ---
    print("=== Clean HTML ===")
    html_stripped = clean_html(html_text)
    print(html_stripped)  # Hello world! 😊

    print("\n=== Remove Emojis ===")
    emoji_free = remove_emojis(emoji_text)
    print(emoji_free)  # I  Python !

    print("\n=== Correct Spelling ===")
    spell_checked = correct_spelling(spelling_text)
    print(spell_checked)  # I have a sleeping error

    print("\n=== Lowercase ===")
    lowered = lower_case(mixed_text)
    print(lowered)  # hello, world! it's awesome. nlp is fun and powerful!

    print("\n=== Remove Punctuation ===")
    unpunctuated = remove_punctuation(mixed_text)
    print(unpunctuated)  # Hello world Its awesome NLP is fun and powerful

    # --- Tokenization ---
    print("\n=== Sentence Tokenize ===")
    sentences = sentence_tokenize(mixed_text)
    print(sentences)  # ['Hello, world!', "It's awesome.", 'NLP is fun and powerful!']

    print("\n=== Word Tokenize ===")
    word_tokens = word_tokenize_text(mixed_text)
    print(word_tokens)  # ['Hello', ',', 'world', '!', 'It', "'s", 'awesome', '.', 'NLP', 'is', 'fun', 'and', 'powerful', '!']

    # --- Word-list processing ---
    print("\n=== Remove Stopwords ===")
    filtered_words = remove_stopwords(words_list)
    print(filtered_words)  # ['sample', 'text', 'played', 'playing']

    print("\n=== Stemming ===")
    stems = stem_words(words_list)
    print(stems)  # e.g. ['thi', 'is', 'a', 'sampl', 'text', 'play', 'play']

    print("\n=== Lemmatization ===")
    # Every word is lemmatized with the default part of speech (noun)
    lemmatized_words = []
    for entry in words_list:
        lemmatized_words.append(lemmatize_word(entry))
    print(lemmatized_words)  # e.g. ['this', 'is', 'a', 'sample', 'text', 'played', 'playing']
    # The same word treated explicitly as a verb
    verb_lemma = lemmatize_word('playing', pos='v')
    print("Lemmatized 'playing' as verb:", verb_lemma)  # play

# Run the demonstration only when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()



IndentationError: expected an indented block after function definition on line 5 (1001748078.py, line 6)