# Whitespace
Definition: Whitespace refers to spaces, tabs (\t), and newlines (\n) used to separate words or format text.

# **Punctuation**

Definition: Punctuation marks are symbols used in text for grammar, emphasis, or structure.
Examples: . , ? ! ; : ( ) " ' -

Why they matter:

Sometimes punctuation is noise (e.g., commas in a paragraph).

Sometimes punctuation carries meaning:

  “Let’s eat, Grandma!” vs “Let’s eat Grandma!”

  Sentiment: “I am happy!!!” vs “I am happy.”

Handling:

1. Often removed for topic modeling or classification.

2. Usually kept for sentiment analysis, QA, and modern transformer models.

# **Stopwords**

Definition: Stopwords are very common words in a language that usually don’t add much meaning to the text.
Example in English: the, is, in, at, and, of, on, a, to

Why they matter:

In classical NLP, stopwords are often removed to reduce noise and vocabulary size.
But in modern NLP (transformers), removing stopwords is not always recommended because context matters.

# **How much cleaning is needed?**

Classical ML (Naive Bayes, SVM, Logistic Regression, etc.):
Heavy cleaning (stopword removal, stemming, lemmatization) usually improves accuracy.

Deep learning (RNNs, LSTMs, CNNs):
Moderate cleaning (lowercasing, punctuation removal, normalization) is helpful, but too much cleaning (like removing stopwords) might remove useful context.

Transformers (BERT, GPT, T5, etc.):
Minimal cleaning is enough, since tokenizers handle punctuation, casing, and subword segmentation. Over-cleaning (e.g., removing stopwords) can actually hurt performance.

# CLEANING TEXT MANUALLY

In [7]:
# Load the whole book into memory as a single string.
filename = "/content/metamorphosis_clean.txt"
# The context manager closes the file even if read() raises, and the explicit
# encoding keeps the non-ASCII quotation marks in the text decoding the same
# way regardless of the platform's default encoding.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()


In [12]:
# Tokenize on runs of whitespace; punctuation stays attached to the
# neighbouring word (e.g. 'morning,').
words = text.split(sep=None)
print(words[0:100])

['One', 'morning,', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'He', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment.', 'His', 'many', 'legs,', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him,', 'waved', 'about', 'helplessly', 'as', 'he', 'looked.', '“What’s', 'happened', 'to', 'me?”', 'he', 'thought.', 'It', 'wasn’t', 'a', 'dream.', 'His', 'room,', 'a', 'proper', 'human']


In [13]:
# regex method
import re
# Split on runs of whitespace, hyphens and basic ASCII punctuation.
# re.split() returns an empty string when the text starts or ends with a
# delimiter (for example a final "." or a trailing newline), so those empty
# tokens are filtered out.
words = [word for word in re.split(r'[-\s.,;!?]+', text) if word]
print(words[:100])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', '“What’s', 'happened', 'to', 'me', '”', 'he', 'thought', 'It', 'wasn’t', 'a', 'dream', 'His', 'room', 'a']


In [14]:
# split by whitespace and remove punctuation
import string
import re
filename = "/content/metamorphosis_clean.txt"
# The context manager closes the file even if read() raises; the explicit
# encoding avoids depending on the platform default for the non-ASCII quotes.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()
words = text.split()
# Character class matching any single punctuation character.
# NOTE: string.punctuation only contains ASCII punctuation, so typographic
# quotes (“ ” ’) are left in the tokens.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
# Delete every punctuation character from each whitespace-separated token.
stripped = [re_punc.sub('', w) for w in words]
print(stripped[:100])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armourlike', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', '“What’s', 'happened', 'to', 'me”', 'he', 'thought', 'It', 'wasn’t', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human']


In [16]:
# normalizing the case
filename = "/content/metamorphosis_clean.txt"
# The context manager closes the file even if read() raises; the explicit
# encoding avoids depending on the platform default for the non-ASCII quotes.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()
words = text.split()
# Lowercase every token so that e.g. 'The' and 'the' become the same word.
words = [word.lower() for word in words]
print(words[:100])

['one', 'morning,', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'he', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'the', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment.', 'his', 'many', 'legs,', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him,', 'waved', 'about', 'helplessly', 'as', 'he', 'looked.', '“what’s', 'happened', 'to', 'me?”', 'he', 'thought.', 'it', 'wasn’t', 'a', 'dream.', 'his', 'room,', 'a', 'proper', 'human']


# CLEANING TEXT USING NLTK

In [24]:
! pip install nltk




In [26]:
# Download NLTK's Punkt tokenizer data packages into the local nltk_data
# directory.
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")   # NOTE(review): flagged as a new requirement — presumably needed by newer NLTK releases; confirm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [29]:
# split into sentences using nltk
import nltk
from nltk.tokenize import sent_tokenize

filename = "/content/metamorphosis_clean.txt"
# Explicit encoding: the text contains non-ASCII quotation marks, so do not
# rely on the platform's default encoding.
with open(filename, "rt", encoding="utf-8") as file:
    text = file.read()

# Sentence tokenization
sentences = sent_tokenize(text)
print(sentences[0])  # prints the first sentence



One morning, when Gregor Samsa woke from troubled dreams, he found
himself transformed in his bed into a horrible vermin.


In [32]:
# split into words using nltk
from nltk.tokenize import word_tokenize
filename = "/content/metamorphosis_clean.txt"
# Explicit encoding: the text contains non-ASCII quotation marks, so do not
# rely on the platform's default encoding.
with open(filename, "rt", encoding="utf-8") as file:
    text = file.read()

# Word tokenization: punctuation marks come out as separate tokens.
tokens = word_tokenize(text)
print(tokens[:100])

['One', 'morning', ',', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', ',', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', '.', 'He', 'lay', 'on', 'his', 'armour-like', 'back', ',', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', ',', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', '.', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', '.', 'His', 'many', 'legs', ',', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', ',', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', '.', '“', 'What', '’', 's', 'happened']


In [35]:
# filtering out punctuation using nltk
from nltk.tokenize import word_tokenize
filename = "/content/metamorphosis_clean.txt"
# Explicit encoding: the text contains non-ASCII quotation marks, so do not
# rely on the platform's default encoding.
with open(filename, "rt", encoding="utf-8") as file:
    text = file.read()

# Word tokenization
tokens = word_tokenize(text)
# Keep only purely alphabetic tokens. Besides punctuation this also drops
# hyphenated words such as 'armour-like' and leaves contractions in pieces
# ('What', 's').
words = [word for word in tokens if word.isalpha()]
print(words[:100])

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', 'What', 's', 'happened', 'to', 'me', 'he', 'thought', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper']


In [38]:
# filtering stopwords using nltk
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus before it is read below.
nltk.download("stopwords")

# English stopword list
stop_words = stopwords.words("english")
print(stop_words[0:20])  # preview of the first 20 entries


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
