[Reference](https://nautilus06.medium.com/nlp-text-preprocessing-steps-for-machine-learning-algorithms-bc015ccf0173)

# 1. Tokenization

In [13]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used by the cells in this notebook, in the same
# order as before: POS tagger, stop-word list, tokenizer models, WordNet.
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Split the example sentence into word-level tokens; the trailing full stop
# comes out as its own token.
text = "This is a sample text for tokenization."
tokens = word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'text', 'for', 'tokenization', '.']


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. Stopword Removal

In [9]:
from nltk.corpus import stopwords
# Materialise the English stop-word list as a set for fast membership checks.
stop_words = set(stopwords.words("english"))
# Drop every token found in the stop-word set. The comparison is
# case-sensitive, which is why the capitalised 'This' is retained below.
tokens = [token for token in tokens if token not in stop_words]
print(tokens)

['This', 'sample', 'text', 'tokenization', '.']


# 3. Stemming

In [10]:
from nltk.stem import PorterStemmer
# Reduce each remaining token to its Porter stem; as the output shows, stems
# are lowercased and need not be dictionary words ('thi', 'sampl').
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
print(stemmed_words)

['thi', 'sampl', 'text', 'token', '.']


# 4. Lemmatization

In [11]:
from nltk.stem import WordNetLemmatizer
# Lemmatize each token with WordNet. No part-of-speech argument is supplied,
# and for this input every token comes back unchanged (see output).
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_words)

['This', 'sample', 'text', 'tokenization', '.']


# 5. Part-of-speech (POS) tagging

In [14]:
import nltk
# Example sentence to tag
sentence = "The quick brown fox jumps over the lazy dog"
# Break the sentence into word tokens; the tagger takes a token list, not a raw string
words = nltk.word_tokenize(sentence)
# Tag every token; the result is a list of (token, tag) tuples
pos_tags = nltk.pos_tag(words)
# Display the (token, tag) pairs
print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


# 6. Named Entity Recognition (NER)

In [15]:
import spacy
# Load the `en_core_web_sm` English pipeline (must already be installed).
nlp = spacy.load("en_core_web_sm")
# Example sentence containing an organisation, a place and a money amount.
text = "Apple is looking at buying U.K. startup for $1 billion"
# Run the pipeline; the detected entities are exposed on `doc.ents`.
doc = nlp(text)
# Emit one line per entity: its text, a space, then its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


# 7. Spell Checking and Correction

In [19]:
# # NOTE: every line of this cell is commented out, so the cell runs nothing
# # and produces no output. Uncomment one level to enable it.
# # Install step for the third-party spell checker:
# !pip install pyspellchecker

# from spellchecker import SpellChecker
# # initialize spell checker
# spell = SpellChecker()
# # example sentence with spelling errors
# sentence = "Ths sentnce hs spellng erors that nd to b corcted."
# # tokenize sentence
# # (whitespace split: the final token keeps its trailing full stop, "corcted.")
# tokens = sentence.split()
# # iterate over tokens and correct spelling errors
# for i in range(len(tokens)):
#     # check if token is misspelled
#     # NOTE(review): spell.correction() is evaluated twice per changed token
#     # (here and in the assignment below); consider storing the result once.
#     if not spell.correction(tokens[i]) == tokens[i]:
#         # replace misspelled token with corrected spelling
#         # NOTE(review): presumably correction() always returns a string; if it
#         # can return None the join below would fail - TODO confirm.
#         tokens[i] = spell.correction(tokens[i])
# # join corrected tokens back into sentence
# corrected_sentence = ' '.join(tokens)
# print(corrected_sentence)

# 8. Removing HTML tags, punctuation, and special characters

In [20]:
import re
import string

def remove_html_tags(text):
    """Return `text` with every `<...>` span removed.

    A span starts at a '<' and ends at the nearest following '>' (non-greedy
    match). Because '.' does not match a newline here, a span that contains a
    line break is left untouched. Text outside the spans is returned as-is.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def remove_punctuation(text):
    """Return `text` without any character listed in `string.punctuation`.

    Only the ASCII punctuation characters in that constant are dropped; every
    other character (letters, digits, whitespace, non-ASCII symbols) is kept
    in its original order.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    return ''.join(kept_chars)

def remove_special_characters(text):
    """Return `text` keeping only ASCII letters, ASCII digits and whitespace.

    Every character outside `a-z`, `A-Z`, `0-9` and `\\s` is deleted.

    The digit range is written with an ASCII hyphen. With an en dash (U+2013)
    in its place the class matches only the literal characters '0', the en
    dash and '9', so digits 1-8 are stripped and en dashes are kept.
    """
    # Raw string so that `\s` reaches the regex engine instead of being
    # treated as an (invalid) Python string escape.
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return clean_text

# Run the three cleaners in sequence on a small HTML snippet:
# tags first, then punctuation, then any remaining special characters.
text = "<p>Hello, world!</p>"
clean_text = remove_special_characters(
    remove_punctuation(
        remove_html_tags(text)
    )
)
print(clean_text)

Hello world


# 9. Converting to Lowercase

In [21]:
# Case-fold the sample sentence so that 'TEXT' and 'text' compare equal.
text = "This is a sample TEXT for preprocessing".lower()
print(text)

this is a sample text for preprocessing


# 10. Text Vectorization

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Four short documents that share most of their vocabulary.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Bag-of-words: learn the vocabulary from the corpus and count each term per
# document (one row per document, one column per vocabulary entry).
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(corpus)

print("BoW representation:", X_bow.toarray(), sep="\n")
print("Vocabulary:", vectorizer.get_feature_names_out(), sep="\n")

# TF-IDF on the same corpus. `vectorizer` is rebound here, so the count
# vectorizer fitted above is no longer reachable under that name.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(corpus)

print("TF-IDF representation:", X_tfidf.toarray(), sep="\n")

BoW representation:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
TF-IDF representation:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
