**Anjali Parwani D11AD 45 EXP7**

Aim: Perform the steps involved in Text Analysis in Python and R

Tasks to be performed:

Explore Top-5 Text Analytics Libraries in Python (w.r.t Features & Applications)

Explore Top-5 Text Analytics Libraries in R (w.r.t Features & Applications)

Perform the following experiments using Python & R

Exploring the NLTK library

In [None]:
#Tokenization
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize

# word_tokenize() relies on the Punkt tokenizer models, which are not bundled
# with NLTK. Fetch them here so the cell also runs on a fresh kernel. Newer
# NLTK releases look for 'punkt_tab' instead of 'punkt', so request both.
# quiet=True keeps the cell output limited to the token list printed below.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

text = "NLTK is a leading platform for building Python programs to work with human language data."

# Split the sentence into word and punctuation tokens; `token` is reused by
# the stop-word removal cell further down.
token = word_tokenize(text)
print(token)

['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.']


In [None]:
# Download the NLTK stop-word corpus read by stopwords.words('english') in the
# stop-word removal cell below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Stopwords
from nltk.corpus import stopwords

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension re-reads the list and scans it linearly for every token;
# a set gives constant-time membership tests and is created a single time.
english_stopwords = set(stopwords.words('english'))

# Keep every token (from the tokenization cell) whose lower-cased form is not
# an English stop word. Punctuation tokens are not stop words, so they stay.
filtered_tokens = [word for word in token if word.lower() not in english_stopwords]
print("Tokens after removing stopwords:")
print(filtered_tokens)

Tokens after removing stopwords:
['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


In [None]:
# Download the averaged-perceptron tagger model; the chunking cell below calls
# nltk.pos_tag(), which needs it.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
#chunking
import nltk
from nltk import RegexpParser, pos_tag
from nltk.tokenize import word_tokenize

text = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence, then label every token with its part-of-speech tag.
words = word_tokenize(text)
tags = pos_tag(words)

# Noun-phrase rule: an optional determiner, any number of adjectives, one noun.
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # Chunk sequences of DT, JJ, NN
"""

# Compile the grammar into a regexp chunker and run it over the tagged tokens.
chunk_parser = RegexpParser(chunk_grammar)
chunked = chunk_parser.parse(tags)

# Display the resulting shallow parse tree.
print(chunked)

(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)


In [None]:
#stemming
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()
words = ["running", "easily", "fairly", "fairness"]

# Pair each word with its Porter stem and print one "word -> stem" line per word.
for word, stemmed_word in zip(words, map(porter_stemmer.stem, words)):
    print(f"{word} -> {stemmed_word}")


running -> run
easily -> easili
fairly -> fairli
fairness -> fair


Exploring the spaCy library

In [None]:
import spacy

# Load the small English pipeline (tokenizer, tagger, parser, and NER).
nlp = spacy.load("en_core_web_sm")

text = "spaCy is an open-source software library for advanced natural language processing, written in Python and Cython."
doc = nlp(text)

# Tokenization: collect the surface text of every token in the parsed document.
tokens = [tok.text for tok in doc]
print("spaCy Tokenization:", tokens, sep="\n")

# Part-of-speech tagging: pair each token's text with its `pos_` label.
pos_tags = [(tok.text, tok.pos_) for tok in doc]
print("Part-of-speech tagging:", pos_tags, sep="\n")

spaCy Tokenization:
['spaCy', 'is', 'an', 'open', '-', 'source', 'software', 'library', 'for', 'advanced', 'natural', 'language', 'processing', ',', 'written', 'in', 'Python', 'and', 'Cython', '.']
Part-of-speech tagging:
[('spaCy', 'INTJ'), ('is', 'AUX'), ('an', 'DET'), ('open', 'ADJ'), ('-', 'PUNCT'), ('source', 'NOUN'), ('software', 'NOUN'), ('library', 'NOUN'), ('for', 'ADP'), ('advanced', 'ADJ'), ('natural', 'ADJ'), ('language', 'NOUN'), ('processing', 'NOUN'), (',', 'PUNCT'), ('written', 'VERB'), ('in', 'ADP'), ('Python', 'PROPN'), ('and', 'CCONJ'), ('Cython', 'PROPN'), ('.', 'PUNCT')]


In [None]:
#lemmatization

import spacy

nlp = spacy.load("en_core_web_sm")

text = "running easily fairly fairness"

doc = nlp(text)

# Print each word next to its lemma. The loop variable is deliberately not
# named `token`: a for-loop variable leaks into the notebook namespace and
# would overwrite the `token` list built in the NLTK tokenization cell, which
# breaks the stop-word removal cell if it is re-run after this one.
for tok in doc:
    print(tok.text, tok.lemma_)


running run
easily easily
fairly fairly
fairness fairness


In [None]:
#named entity recognition
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Apple is going to build a new factory in China."
doc = nlp(text)

# One line per detected entity: the matched text followed by its entity label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


Apple ORG
China GPE


Exploring the TextBlob library

In [None]:
import nltk
# Download the Brown corpus into the local NLTK data directory.
# NOTE(review): presumably needed by TextBlob's noun-phrase extraction in the
# next cell — confirm against the TextBlob corpora requirements.
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
from textblob import TextBlob

text = "TextBlob is a simple library for processing textual data."
blob = TextBlob(text)

# Sentiment analysis: show the blob's `sentiment` value under a heading.
print("TextBlob Sentiment Analysis:", blob.sentiment, sep="\n")

# Noun phrase extraction: show the blob's `noun_phrases` under a heading.
print("Noun phrases:", blob.noun_phrases, sep="\n")

TextBlob Sentiment Analysis:
Sentiment(polarity=0.0, subjectivity=0.35714285714285715)
Noun phrases:
['textblob', 'simple library', 'processing textual data']


**R Libraries**

In [None]:
#Stemming using SnowballC:

# Install SnowballC only when it is missing, so that re-running this cell does
# not download and reinstall the package every time.
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
library(SnowballC)

words <- c("running", "easily", "fairly", "fairness")

# wordStem() is applied to the whole character vector and returns one stem
# per input word.
stemmed_words <- wordStem(words)

print(stemmed_words)




Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



[1] "run"    "easili" "fairli" "fair"  


In [None]:
# Tokenization (Sentence & Word)
text <- "This is a sample sentence. Tokenization is important for NLP."

# Sentences: cut at every literal full stop (the stop itself is dropped, and
# the space that followed it stays at the start of the next piece).
sentences <- unlist(strsplit(text, "\\."))
# Words: cut at every run of whitespace; punctuation stays attached to words.
words <- strsplit(text, "\\s+")[[1]]

print("Sentences:")
print(sentences)
print("Words:")
print(words)



[1] "Sentences:"
[1] "This is a sample sentence"          " Tokenization is important for NLP"
[1] "Words:"
 [1] "This"         "is"           "a"            "sample"       "sentence."   
 [6] "Tokenization" "is"           "important"    "for"          "NLP."        


In [None]:
# Install tokenizers only when it is missing, so that re-running this cell
# does not download and reinstall the package every time.
if (!requireNamespace("tokenizers", quietly = TRUE)) {
  install.packages("tokenizers")
}
library(tokenizers)

text <- "This is a sample sentence. Tokenization is important for NLP."

# Both tokenizers return a list with one element per input text, so
# `sentences` and `words` are length-1 lists wrapping a character vector
# (hence the [[1]] in the printed output).
sentences <- tokenize_sentences(text)
words <- tokenize_words(text)

print("Sentences:")
print(sentences)
print("Words:")
print(words)



Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



[1] "Sentences:"
[[1]]
[1] "This is a sample sentence."         "Tokenization is important for NLP."

[1] "Words:"
[[1]]
 [1] "this"         "is"           "a"            "sample"       "sentence"    
 [6] "tokenization" "is"           "important"    "for"          "nlp"         



In [None]:
# Frequency Distribution
# Count how often each distinct token occurs in `words` (defined in an
# earlier tokenization cell) and print the resulting contingency table.
# The bare variable `words` is passed on purpose: table() takes the header of
# the printed table from the argument expression.
word_freq <- table(words)
print("Word frequency:")
print(word_freq)

[1] "Word frequency:"
words
           a          for    important           is          nlp       sample 
           1            1            1            2            1            1 
    sentence         this tokenization 
           1            1            1 


In [None]:
# Remove stopwords & punctuations
stop_words <- c("is", "a", "for")  # Example list of stopwords

# `words` may be the list returned by tokenize_words() (one element per input
# text). Applying tolower()/grepl() directly to that list coerces it into one
# deparsed string, so the filter tested a single value and the result was an
# empty list(). Flatten to a plain character vector first so that every token
# is checked individually; unlist() leaves a character vector unchanged.
word_vec <- unlist(words)

# Keep tokens that are neither in the stop-word list (case-insensitive) nor
# contain any punctuation character.
filtered_words <- word_vec[!tolower(word_vec) %in% stop_words & !grepl("[[:punct:]]", word_vec)]
print("Filtered words:")
print(filtered_words)

[1] "Filtered words:"
list()
