1. Case Conversion

In [1]:
# Sample promotional message used as the running example for the steps below.
text = (
    "Hey there! We have a new offer for you. "
    "Get 50% discount on all products. Limited time offer."
)

In [2]:
# Re-create the sample message and echo it so the raw input is visible
# before any preprocessing is applied.
text = (
    "Hey there! We have a new offer for you. "
    "Get 50% discount on all products. Limited time offer."
)
print(text)

Hey there! We have a new offer for you. Get 50% discount on all products. Limited time offer.


In [None]:
# Case conversion: lower-case the whole message so that differently-cased
# spellings of a word (e.g. "Offer" / "offer") become identical.
# `text` is rebound, so the original casing is not kept.
text = text.lower()
print(text)


2. Punctuation Removal
Punctuation often doesn’t contribute to the meaning of a sentence in tasks like sentiment analysis or spam detection. 

In [None]:
import string

# Translation table whose "delete" set is every ASCII punctuation character
# in string.punctuation; nothing is substituted, characters are only dropped.
punctuation_table = str.maketrans('', '', string.punctuation)

# Apply the table in a single pass and show the cleaned message.
text = text.translate(punctuation_table)
print(text)

3. Tokenization
Tokenization breaks a stream of text into smaller units called tokens. These can be words, sentences, or subwords.

Word Tokenization: Separates a text into a list of words.
Sentence Tokenization: Separates a text into a list of sentences.


In [None]:
# Fetch the NLTK data packages this notebook relies on.
import nltk

# Both the older and the newer package names are requested for the tokenizer
# ('punkt' / 'punkt_tab') and the tagger ('averaged_perceptron_tagger' /
# '..._eng') -- presumably so the notebook works across NLTK versions.
for resource in (
    'punkt_tab',
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sentence Tokenization
# Sentence boundaries are found from punctuation, and the earlier cleaning
# cells have already lower-cased `text` and stripped its punctuation. Running
# sent_tokenize on the cleaned string would return the whole message as one
# "sentence", so the raw message is tokenized here instead.
raw_text = (
    "Hey there! We have a new offer for you. "
    "Get 50% discount on all products. Limited time offer."
)
sentence_tokens = sent_tokenize(raw_text)
print(sentence_tokens)

# Word Tokenization
# Words are taken from the cleaned `text` so that the following cells
# (stop-word removal, stemming, lemmatization) receive normalised tokens.
words_tokens = word_tokenize(text)
print(words_tokens)

4. Stop Word Removal
Stop words are common words like “the,” “is,” “a,” etc., that often don’t add much value to the meaning of the text. Removing them can reduce the feature space and speed up processing.

In [None]:
from nltk.corpus import stopwords

# Needs the NLTK 'stopwords' corpus; run nltk.download('stopwords') once if
# it is not installed yet.

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words, preserving order.
filtered_words = [
    token
    for token in words_tokens
    if token not in stop_words
]
print(filtered_words)

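5. Stemming and Lemmatization
Stemming reduces a word to a root form by stripping suffixes, while lemmatization maps a word to its dictionary form (lemma).

In [None]: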
from nltk.stem import PorterStemmer, WordNetLemmatizer

# The lemmatizer needs the NLTK 'wordnet' corpus; run nltk.download('wordnet')
# once if it is not installed yet.

# Stemming: apply the Porter stemmer to every remaining token.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print(stemmed_words)

# Lemmatization: apply the WordNet lemmatizer to the same tokens so the two
# results can be compared side by side.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print(lemmatized_words)

6. Part-of-Speech (POS) Tagging
Part-of-Speech (POS) tagging is the process of labeling each word in a sentence with its corresponding part of speech, such as noun, verb, adjective, or adverb.

For example:

Nouns (NN): Refer to a person, place, thing, or idea (e.g., “dog,” “New York,” “love”).
Verbs (VB): Describe an action or state of being (e.g., “run,” “is,” “think”).
Adjectives (JJ): Modify or describe nouns (e.g., “happy,” “blue,” “tall”).
Adverbs (RB): Modify verbs, adjectives, or other adverbs (e.g., “quickly,” “very,” “well”).

In [None]:
import nltk

# Sample sentence together with its already lower-cased word tokens
# (punctuation marks are kept as separate tokens).
text = "Hey there! We have a new offer for you."
words = [
    'hey', 'there', '!', 'we', 'have', 'a',
    'new', 'offer', 'for', 'you', '.',
]

# POS tagging: pos_tag pairs every token with a tag,
# e.g. ('new', 'JJ') or ('offer', 'NN').
tagged_words = nltk.pos_tag(words)
print("POS Tagged Words:", tagged_words)

# Chunk grammar for noun phrases (NP): an optional determiner (DT),
# any number of adjectives (JJ), then a noun (NN).
grammar = "NP: {<DT>?<JJ>*<NN>}"
parser = nltk.RegexpParser(grammar)

# Parsing groups the tagged tokens that match the grammar; a match such as
# 'a new offer' would show up as (NP a/DT new/JJ offer/NN).
result = parser.parse(tagged_words)
print("Parsed Result:", result, sep="\n")

POS Tagging with spaCy

In [None]:
import spacy

# Sentence to analyse.
text = "The quick brown fox jumps over the lazy dog."

# Load the small English pipeline and run it over the sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# One row per token in left-aligned columns: the token text, its pos_ value
# and its tag_ value.
for token in doc:
    print(f"{token.text:<10} {token.pos_:<10} {token.tag_}")

7. Chunking
Chunking, also known as shallow parsing, is an NLP technique that groups words into meaningful phrases, such as noun phrases, verb phrases, or prepositional phrases.

In [None]:
import spacy

# Sentence whose noun phrases we want to list.
text = "The quick brown fox jumps over the lazy dog."

# Load the small English pipeline and parse the sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# doc.noun_chunks yields the noun phrases found in the document;
# print the text of each one on its own line.
for chunk in doc.noun_chunks:
    print(chunk.text)

8. Named Entity Recognition (NER)
NER is a natural language processing (NLP) task that finds and classifies named entities in a text into predefined categories like people, organizations, locations, dates, and more.

In [None]:
import spacy

# Sample text mentioning a company, places and people.
text = (
    "Apple Inc. is a technology company headquartered in Cupertino, California. "
    "It was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne."
)

# Load the English pipeline and run it over the sample text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Report every recognised entity with its label; spacy.explain turns the
# label code into a short human-readable description.
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Explanation: {spacy.explain(ent.label_)}")

9. Relationship extraction
Relationship extraction is an NLP task that identifies and classifies semantic relationships between entities in a text.
For example, it can identify that a specific person “works for” a specific company or that a company is “headquartered in” a specific location.

In [None]:
# Relationship extraction using spaCy's dependency parsing
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Sample text for relationship extraction
text = "Steve Jobs was the CEO of Apple Inc. Apple is headquartered in Cupertino, California."

# Process the text
doc = nlp(text)

print("Relationship Extraction using Dependency Parsing:")
print("=" * 50)

# Dependency labels treated as the subject of a relation. "nsubjpass" covers
# passive clauses such as "Apple is headquartered in ...", which a plain
# "nsubj" check would skip entirely.
subject_deps = {"nsubj", "nsubjpass"}
# Labels that can complete a relation when attached directly to the verb.
object_deps = {"dobj", "attr", "pobj"}

# Extract relationships based on dependency patterns
for token in doc:
    # Look for subject-verb-object patterns
    if token.dep_ in subject_deps:
        subject = token.text
        verb = token.head.text

        # Look for objects related to this verb
        for child in token.head.children:
            if child.dep_ in object_deps:
                obj = child.text
                print(f"Relationship: {subject} -> {verb} -> {obj}")
            elif child.dep_ == "prep":
                # A prepositional object normally hangs off the preposition
                # rather than the verb, so step one level down to reach it
                # and report the preposition as part of the relation.
                for grandchild in child.children:
                    if grandchild.dep_ == "pobj":
                        obj = grandchild.text
                        print(f"Relationship: {subject} -> {verb} {child.text} -> {obj}")

    # Look for compound relationships (a modifier token joined to its head,
    # e.g. the two parts of a multi-word name)
    if token.dep_ == "compound":
        compound = token.text + " " + token.head.text
        print(f"Compound entity: {compound}")

print("\nNamed Entities and their relationships:")
print("-" * 30)

# Print named entities
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")

print("\nDependency tree visualization:")
print("-" * 30)

# Show dependency relationships: token <- dependency label <- head token
for token in doc:
    print(f"{token.text:12} <- {token.dep_:10} <- {token.head.text}")