In [20]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    """
    Preprocesses text by tokenization, removing stopwords, punctuation, and lemmatization.
    """
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Example text
text = "Natural Language Processing (NLP) is a subfield of artificial intelligence (AI) that deals with the interaction between computers and humans using natural language."

# Preprocess the text
tokens = preprocess_text(text)
print("Preprocessed tokens:", tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...


Preprocessed tokens: ['natural', 'language', 'processing', '', 'nlp', '', 'subfield', 'artificial', 'intelligence', '', 'ai', '', 'deal', 'interaction', 'computer', 'human', 'using', 'natural', 'language', '']
