In [None]:
# Import necessary libraries
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

In [None]:


# Download required NLTK data files (only the first time).
# Both the legacy resource names ('punkt', 'averaged_perceptron_tagger') and
# their newer counterparts ('punkt_tab', 'averaged_perceptron_tagger_eng')
# are requested, in the same order as before.
NLTK_RESOURCES = (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
)

# nltk.download() returns True on success and False on failure. Keep every
# status so that an early failure is not masked by a later success.
download_status = {name: nltk.download(name) for name in NLTK_RESOURCES}

failed_resources = [name for name, ok in download_status.items() if not ok]
if failed_resources:
    # Report rather than raise: the data may already be installed locally
    # even when the download itself could not be performed.
    print("NLTK resources that could not be downloaded:", failed_resources)

# Cell output: True only when every resource above was fetched successfully.
all(download_status.values())



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# spaCy's small English pipeline drives the NER / dependency-parsing section.
# The model must be installed beforehand: python -m spacy download en_core_web_sm
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Two-sentence sample that every later step of the notebook operates on.
first_sentence = "Apple Inc. is looking at buying U.K. startup for $1 billion. "
second_sentence = "This is an exciting development in the tech industry!"
text = first_sentence + second_sentence



In [None]:
# --------------------------
# 1. Text Normalization
# --------------------------
# Case-fold the sample so that later steps compare tokens in a single,
# standardized (lowercase) form.
text_lower = str.lower(text)

print("Lowercased Text:")
print(text_lower, "\n")



Lowercased Text:
apple inc. is looking at buying u.k. startup for $1 billion. this is an exciting development in the tech industry! 



In [None]:
# --------------------------
# 2. Tokenization
# --------------------------
# Split the lowercased text into word-level tokens with NLTK.
# The language is spelled out; "english" is also word_tokenize's default.
tokens = word_tokenize(text_lower, language="english")

print("Tokenized Words:")
print(tokens, "\n")



Tokenized Words:
['apple', 'inc.', 'is', 'looking', 'at', 'buying', 'u.k.', 'startup', 'for', '$', '1', 'billion', '.', 'this', 'is', 'an', 'exciting', 'development', 'in', 'the', 'tech', 'industry', '!'] 



In [None]:
# --------------------------
# 3. Punctuation Removal
# --------------------------
# Drop every token that consists solely of punctuation characters.
#
# `string.punctuation` is a plain str, so `token not in string.punctuation`
# is a *substring* test rather than a set-membership test: single characters
# such as '.' are removed, but multi-character punctuation tokens such as
# '...' or '--' are not substrings of it and would be kept. Checking each
# character against a real set handles both cases. Tokens that merely contain
# punctuation (e.g. 'inc.', 'u.k.') are still preserved.
punctuation_chars = set(string.punctuation)
tokens_no_punct = [
    token
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]
print("Tokens without Punctuation:")
print(tokens_no_punct, "\n")



Tokens without Punctuation:
['apple', 'inc.', 'is', 'looking', 'at', 'buying', 'u.k.', 'startup', 'for', '1', 'billion', 'this', 'is', 'an', 'exciting', 'development', 'in', 'the', 'tech', 'industry'] 



In [None]:
# --------------------------
# 4. Stopword Removal
# --------------------------
# Build a set of NLTK's English stopwords (set lookup keeps the filter cheap)
# and keep only the tokens that are not in it.
stop_words = {*stopwords.words('english')}
tokens_no_stopwords = list(
    filter(lambda word: word not in stop_words, tokens_no_punct)
)

print("Tokens without Stopwords:")
print(tokens_no_stopwords, "\n")


Tokens without Stopwords:
['apple', 'inc.', 'looking', 'buying', 'u.k.', 'startup', '1', 'billion', 'exciting', 'development', 'tech', 'industry'] 



In [None]:

# --------------------------
# 5. Stemming
# --------------------------
# Reduce each remaining token to its stem with NLTK's Porter stemmer.
# Stems are not necessarily dictionary words (e.g. 'industry' -> 'industri').
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, tokens_no_stopwords))

print("Stemmed Tokens:")
print(stemmed_tokens, "\n")



Stemmed Tokens:
['appl', 'inc.', 'look', 'buy', 'u.k.', 'startup', '1', 'billion', 'excit', 'develop', 'tech', 'industri'] 



In [None]:
# --------------------------
# 6. Part-of-Speech (POS) Tagging
# --------------------------
# Attach a part-of-speech tag to every token that survived punctuation and
# stopword removal. The arguments below are nltk.pos_tag's defaults, written
# out explicitly.
# NOTE(review): the tagger only sees lowercased, stopword-filtered tokens, so
# it has less sentence context than it would on the raw text (the recorded
# output tags 'u.k.' and 'startup' as JJ). Consider tagging the full token
# sequence instead -- confirm intent.
pos_tags = nltk.pos_tag(tokens_no_stopwords, tagset=None, lang="eng")

print("POS Tags:")
print(pos_tags, "\n")



POS Tags:
[('apple', 'NN'), ('inc.', 'NN'), ('looking', 'VBG'), ('buying', 'VBG'), ('u.k.', 'JJ'), ('startup', 'JJ'), ('1', 'CD'), ('billion', 'CD'), ('exciting', 'VBG'), ('development', 'NN'), ('tech', 'NN'), ('industry', 'NN')] 



In [None]:
# --------------------------
# 7. Named Entity Recognition (NER) and Dependency Parsing with spaCy
# --------------------------
# Run the original (un-normalized) text through the spaCy pipeline once;
# both reports below read from the resulting Doc.
doc = nlp(text)

# Report 1: every entity span spaCy found, with its label.
print("Named Entities:")
entities = doc.ents
for ent in entities:
    print(f"{ent.text} -> {ent.label_}")
print()

# Report 2: one row per token -- the token text, its dependency relation
# (token.dep_) and the text of its syntactic parent (token.head).
print("Dependency Parsing:")
for token in doc:
    print(f"{token.text:12} {token.dep_:10} -> {token.head.text}")
print()


Named Entities:
Apple Inc. -> ORG
U.K. -> GPE
$1 billion -> MONEY

Dependency Parsing:
Apple        compound   -> Inc.
Inc.         nsubj      -> looking
is           aux        -> looking
looking      ROOT       -> looking
at           prep       -> looking
buying       pcomp      -> at
U.K.         dobj       -> buying
startup      dep        -> looking
for          prep       -> startup
$            quantmod   -> billion
1            compound   -> billion
billion      pobj       -> for
.            punct      -> looking
This         nsubj      -> is
is           ROOT       -> is
an           det        -> development
exciting     amod       -> development
development  attr       -> is
in           prep       -> development
the          det        -> industry
tech         compound   -> industry
industry     pobj       -> in
!            punct      -> is



In [None]:

# --------------------------
# 8. Sentiment Analysis using TextBlob
# --------------------------
# Wrap the original text in a TextBlob and read its sentiment once; the
# result unpacks into (polarity, subjectivity).
blob = TextBlob(text)
polarity, subjectivity = blob.sentiment

print("Sentiment Analysis:")
# Polarity ranges from -1 (negative) to 1 (positive).
print("Polarity:", polarity)
# Subjectivity ranges from 0 (objective) to 1 (subjective).
print("Subjectivity:", subjectivity)

Sentiment Analysis:
Polarity: 0.375
Subjectivity: 0.8
