## First notebook : Text Preprocessing Pipeline

Objective :
Develop a text preprocessing pipeline that :
 - Tokenizes the input text
 - Removes stopwords
 - Converts text to lowercase
 - Applies either stemming or lemmatization
 - Returns the cleaned text ready for analysis

In [None]:
# Import libraries 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages used by the pipeline below
# (tokenizer models, stopword lists and the WordNet corpus).
for resource_name in ("punkt", "stopwords", "wordnet", "punkt_tab"):
    nltk.download(resource_name)



In [18]:
def preprocess_text(text, use_lemmatization=True):
    """Tokenize ``text``, drop English stopwords, lowercase, then lemmatize or stem.

    Parameters
    ----------
    text
        Raw input handed to ``word_tokenize``.
    use_lemmatization
        When truthy (the default), every remaining token goes through
        ``WordNetLemmatizer().lemmatize``; otherwise through
        ``PorterStemmer().stem``.

    Returns
    -------
    list
        The processed tokens in their original order. Nothing here filters
        punctuation, so punctuation tokens produced by the tokenizer are kept.
    """
    raw_tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words("english"))

    # Lowercase once up front: the stopword check compares lowercase forms
    # anyway, so filtering the lowered tokens yields the same survivors.
    lowered_tokens = [raw.lower() for raw in raw_tokens]
    kept_tokens = [tok for tok in lowered_tokens if tok not in english_stopwords]

    # Build only the normaliser that was asked for, then apply it in one pass.
    if use_lemmatization:
        normalise = WordNetLemmatizer().lemmatize
    else:
        normalise = PorterStemmer().stem

    return [normalise(tok) for tok in kept_tokens]

In [19]:
# Run one sample sentence through both normalisation modes and print each result

text = "The quick brown foxes are jumping over the lazy dogs!"
for label, lemmatize_flag in (("with lemmatization : ", True), ("with stemming : ", False)):
    print(label, preprocess_text(text, use_lemmatization=lemmatize_flag))


with lemmatization :  ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog', '!']
with stemming :  ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '!']


## Explanation of results 

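- lemmatization : 'foxes' becomes 'fox' and 'dogs' becomes 'dog', but 'jumping' stays 'jumping'. `WordNetLemmatizer` treats every token as a noun unless a part-of-speech tag is passed (e.g. `lemmatize('jumping', pos='v')` returns 'jump').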
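- stemming : 'jumping' becomes 'jump', but 'lazy' becomes 'lazi', which is not a real word. This is a drawback of stemming: it strips suffixes by rule and does not check the result against a dictionary.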