🔴Task 37-> NLP Preprocessing  

Go through some common text preprocessing techniques and demonstrate them by applying them to different datasets.


In [None]:
pip install nltk spacy




In [None]:
import spacy

# Load spaCy English model for lemmatization
nlp = spacy.load('en_core_web_sm')




Importing Libraries

In [None]:
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download required NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy English model for lemmatization
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Step 3: Defining the Pre processing functions

In [8]:
#1. lowercasing
def lowercasing(text):
    return text.lower()
#2. Removing Punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)
#3. Tokenization
def tokenize(text):
    return word_tokenize(text)
#4. Stopword Removal
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]
#5. Stemming
def stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]
#6. Lemmatization
def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
# 7. Removing Numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)
#8. Removing Special Characters
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
# 9. Expanding Contractions
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "it's": "it is",
    "i'm": "i am",
    # Add more contractions as needed
}
# 10. Removing URLs and HTML Tags
def remove_urls_and_html(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    return text

# 11. Handling Emoticons (Basic Example)
emoticons = {
    ":)": "smile",
    ":(": "sad",
    ":D": "laugh",
    # Add more emoticons as needed
}

def handle_emoticons(text, emoticons_dict=emoticons):
    for emoticon, replacement in emoticons_dict.items():
        text = text.replace(emoticon, replacement)
    return text

# 12. Normalizing Text (e.g., handling accents)
def normalize_text(text):
    return text.encode('ascii', 'ignore').decode('utf-8')


In [11]:
import pandas as pd

# Example 1: Movie Review
df = pd.read_csv('/content/spam_ham_dataset.csv')
movie_review = df['text'][0]

# Apply preprocessing
movie_review = lowercasing(movie_review)
movie_review = remove_punctuation(movie_review)
movie_review = remove_numbers(movie_review)
tokens = tokenize(movie_review)
tokens = remove_stopwords(tokens)
tokens_stemmed = stemming(tokens)
tokens_lemmatized = lemmatization(tokens)

print("Original Review:", movie_review)
print("Tokens:", tokens)
print("Stemmed Tokens:", tokens_stemmed)
print("Lemmatized Tokens:", tokens_lemmatized)




Original Review: subject enron methanol  meter   
this is a follow up to the note i gave you on monday        preliminary
flow data provided by daren  
please override pop  s daily volume  presently zero  to reflect daily
activity you can obtain from gas control 
this change is needed asap for economics purposes 
Tokens: ['subject', 'enron', 'methanol', 'meter', 'follow', 'note', 'gave', 'monday', 'preliminary', 'flow', 'data', 'provided', 'daren', 'please', 'override', 'pop', 'daily', 'volume', 'presently', 'zero', 'reflect', 'daily', 'activity', 'obtain', 'gas', 'control', 'change', 'needed', 'asap', 'economics', 'purposes']
Stemmed Tokens: ['subject', 'enron', 'methanol', 'meter', 'follow', 'note', 'gave', 'monday', 'preliminari', 'flow', 'data', 'provid', 'daren', 'pleas', 'overrid', 'pop', 'daili', 'volum', 'present', 'zero', 'reflect', 'daili', 'activ', 'obtain', 'ga', 'control', 'chang', 'need', 'asap', 'econom', 'purpos']
Lemmatized Tokens: ['subject', 'enron', 'methanol',

In [10]:
# Example 2: News Headline
news_headline = "Breaking: Apple's New iPhone 12 Pro Max features a 108MP Camera! #technology"

# Apply preprocessing
news_headline = lowercasing(news_headline)
news_headline = remove_punctuation(news_headline)
news_headline = remove_numbers(news_headline)
news_headline = handle_emoticons(news_headline)
tokens_headline = tokenize(news_headline)
tokens_headline = remove_stopwords(tokens_headline)

print("Original Headline:", news_headline)
print("Tokens:", tokens_headline)

Original Headline: breaking apples new iphone  pro max features a mp camera technology
Tokens: ['breaking', 'apples', 'new', 'iphone', 'pro', 'max', 'features', 'mp', 'camera', 'technology']
