# Text Preprocessing Techniques :

**1. Lowercase Conversion :**

In [1]:
def lowercase_conversion(text):
  return text.lower()

text = "Hello NLP!"
lowercase_conversion(text)

'hello nlp!'

**2. Stop Words Removal :**

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  tokens = word_tokenize(text)
  filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
  return filtered_tokens

text = "This is an example sentence with some stop words."
remove_stopwords(text)

['example', 'sentence', 'stop', 'words', '.']

**3. Remove Punctuation**

In [10]:
import re

def remove_punctuation(text):
  punctuation = r'[^\w\s]'
  return re.sub(punctuation, '', text)

text = "Hello! This is an example sentence with punctuation."
remove_punctuation(text)

'Hello This is an example sentence with punctuation'

***4. Regular Expressions (Regex)**

In [11]:
#Match Simple Text
text = "I LOVE NLP"
pattern = r"NLP"
re.search(pattern, text)

<re.Match object; span=(7, 10), match='NLP'>

In [12]:
# Number Removal/Normalization
def remove_numbers(text):
  return re.sub(r'\d+', '', text)

text = "This is an example sentence with 123 numbers."
remove_numbers(text)

'This is an example sentence with  numbers.'

In [14]:
# Number Normalization
def normalize_numbers(text, replacement='NUM'):
  return re.sub(r'\d+', replacement, text)

text = "This is an example sentence with 123 numbers."
normalize_numbers(text)

'This is an example sentence with NUM numbers.'

In [17]:
# # Noise Removal
def remove_noise(text):
  noise_pattern = r'[^a-zA-Z0-9\s]'
  text = re.sub(noise_pattern, '', text)

  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  return text
text = "Special @#! characters & unicode     like 你好 should be removed."
remove_noise(text)

'Special characters unicode like should be removed'

**5. Tokenization**

In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenize_text(text):
  word_tokens = word_tokenize(text)
  sentence_tokens = sent_tokenize(text)
  return word_tokens, sentence_tokens

text = "This is an example sentence. It has multiple sentences."
tokenize_text(text)

(['This',
  'is',
  'an',
  'example',
  'sentence',
  '.',
  'It',
  'has',
  'multiple',
  'sentences',
  '.'],
 ['This is an example sentence.', 'It has multiple sentences.'])

In [7]:
# Try bert tokenizer
from transformers import BertTokenizer

def bert_tokenizer(text):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  tokens = tokenizer.tokenize(text)
  return tokens

text = "This is an example sentence."
bert_tokenizer(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['this', 'is', 'an', 'example', 'sentence', '.']

In [8]:
#try GPT2 tokenizer
from transformers import GPT2Tokenizer

def gpt2_tokenizer(text):
  tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  tokens = tokenizer.tokenize(text)
  return tokens

text = "This is an example sentence."
gpt2_tokenizer(text)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

['This', 'Ġis', 'Ġan', 'Ġexample', 'Ġsentence', '.']

**6. Stemming**

In [9]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

def stemming_text(text):
  porter = PorterStemmer()
  lancaster = LancasterStemmer()
  snowball = SnowballStemmer('english')

  porter_stems = [porter.stem(word) for word in text]
  lancaster_stems = [lancaster.stem(word) for word in text]
  snowball_stems = [snowball.stem(word) for word in text]

  return porter_stems, lancaster_stems, snowball_stems

text = ["running", "runs", "ran", "run"]
stemming_text(text)


porter_stems, lancaster_stems, snowball_stems = stemming_text(text)
print("Porter Stems:", porter_stems)
print("Lancaster Stems:", lancaster_stems)
print("Snowball Stems:", snowball_stems)


Porter Stems: ['run', 'run', 'ran', 'run']
Lancaster Stems: ['run', 'run', 'ran', 'run']
Snowball Stems: ['run', 'run', 'ran', 'run']


**7. Lemmatization**

In [10]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
from nltk.stem import WordNetLemmatizer

def lemmatization_text(text):
  lemmatizer = WordNetLemmatizer()
  lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
  return lemmatized_words

text = ["running", "runs", "ran", "run"]
lemmatization_text(text)

['running', 'run', 'ran', 'run']

**8. Spell Correction**

In [14]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [16]:
from spellchecker import SpellChecker

def correct_spelling(text):
  spell = SpellChecker()
  tokens = word_tokenize(text)
  corrected_tokens = [spell.correction(token) for token in tokens]
  return corrected_tokens

text = "I havv a good speling!"
correct_spelling(text)

['I', 'have', 'a', 'good', 'spelling', '!']

**9. Text Normalization with TextBlob**

In [17]:
!pip install textblob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [20]:
from textblob import TextBlob

def normalize_text_with_textblob(text):
  blob = TextBlob(text)
  corrected_text = str(blob.correct())

  sentiment = blob.sentiment

  noun_phrases = blob.noun_phrases

  return corrected_text, sentiment, noun_phrases

text = "The quik brown fox jumpd over the lazzy dog."
# text = "guuod feeling"
corrected, sentiment, noun_phrases = normalize_text_with_textblob(text)

print(f"Corrected: {corrected}")
print(f"Sentiment: {sentiment}")
print(f"Noun phrases: {noun_phrases}")

Corrected: The quick brown fox jumped over the lazy dog.
Sentiment: Sentiment(polarity=0.0, subjectivity=0.0)
Noun phrases: ['brown fox jumpd', 'lazzy dog']


# Comprehensive Preprocessing Pipeline

In [26]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_text(text):
  """ Comprehensive text preprocessing pipeline"""
  #Apply Lowercasing
  text = text.lower()

  # Remove Punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))

  # Remove numbers
  text = re.sub(r'\d+', '', text)

  # Remove whitespace
  text = text.strip()
  text = re.sub(r'\s+', ' ', text)

  # Tokenization
  tokens = word_tokenize(text)

  # Remove Stopwords
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words]

  # Stemming
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(token) for token in tokens]

  return tokens

text = "This is an example! The preprocessing pipeline removes punctuation, numbers (123), and stopwords."
tokens = preprocess_text(text)
print(tokens)

['exampl', 'preprocess', 'pipelin', 'remov', 'punctuat', 'number', 'stopword']
