# Assignment No: 01
## Aim: Write a program to pre-process a text document, including stop-word removal and stemming.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Download required NLTK resources.
# 'punkt_tab' is the tokenizer data that recent NLTK releases (3.9+) load for
# word_tokenize; 'punkt' is kept so older releases keep working.
# NOTE(review): on an NLTK version whose download index lacks 'punkt_tab',
# nltk.download is expected to report the miss and return False rather than
# raise -- confirm if very old versions must be supported.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Tokenize *text* and return its normalized, stemmed tokens.

    The steps run in this order: tokenize with ``word_tokenize``, lowercase,
    drop punctuation-only tokens, drop English stop words, then stem each
    remaining token with the Porter stemmer.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of stemmed tokens in document order. Duplicates are kept.
    """
    # 1. Tokenization
    tokens = word_tokenize(text)

    # 2. Convert to lowercase (must precede stop-word removal, because the
    #    stop-word list is lowercase)
    tokens = [word.lower() for word in tokens]

    # 3. Remove punctuation
    # Check every character against a set instead of `word in
    # string.punctuation`: the latter is a substring test on one string, so
    # multi-character punctuation tokens the tokenizer can emit (e.g. "...",
    # "--", or the "``" / "''" quote tokens) would slip through.
    punctuation = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation for char in word)
    ]

    # 4. Remove stop words (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]

    return tokens

# Example input: a short paragraph about NLP used to demonstrate the pipeline
text_document = """
Natural language processing (NLP) is a subfield of artificial intelligence (AI)
that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to read, decipher, understand, and make sense of human
language in a valuable way.
"""

# Run the pipeline on the example and show the resulting token list
processed_text = preprocess_text(text_document)
print(f"Processed Text: {processed_text}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Processed Text: ['natur', 'languag', 'process', 'nlp', 'subfield', 'artifici', 'intellig', 'ai', 'focus', 'interact', 'comput', 'human', 'natur', 'languag', 'ultim', 'goal', 'nlp', 'read', 'deciph', 'understand', 'make', 'sens', 'human', 'languag', 'valuabl', 'way']


# **Explanation**

**Tokenization:** Splits the text into individual words (tokens) using `word_tokenize`.

**Convert to lowercase:** Normalizes the tokens to lowercase for consistency.

**Remove punctuation:** Filters out punctuation marks from the tokens.

**Remove stop words:** Uses a set of English stop words to filter out common words that add little value (e.g., "is", "the", "and").

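**Stemming:** Reduces each word to its stem using the Porter Stemmer. A stem is a truncated base form and is not always a dictionary word (e.g., "natural" becomes "natur" and "language" becomes "languag").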