In [2]:
# Text Mining and NLP - Hands-on
#################################

sentence = "We are Learning TextMining "

'TextMining' in sentence # verify if the text is present in the text or not

sentence.index('Learning') # Check the index location

sentence.split().index('TextMining') # Split the sentences into words and present the position

sentence.split()[2] # 3rd word in the sentence 

sentence.split()[2][::-1] # Print the 3rd word in reverse order

words = sentence.split() # All the words in list format

first_word = words[0]

last_word = words[len(words)-1] # Index in the reverse order start with -1

concat_word = first_word + ' ' + last_word # join 2 words
print(concat_word)

[words[i] for i in range(len(words)) if i%2 == 0] # print the words at even index

sentence[-3:] # Index in reverse starts from -1

sentence[::-1] # Print entire sentence in reverse order

print(' '.join([word[::-1] for word in words])) # Select each word and print it in reverse


# Word Tokenization 
import nltk
nltk.download('punkt')
from nltk import word_tokenize

words = word_tokenize("I am reading NLP Fundamentals")
print(words)

nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(words) # Parts of Speech Tagging

nltk.download('stopwords')  # Stop Words from nltk library
from nltk.corpus import stopwords

stop_words = stopwords.words('English') # 179 pre-defined stop words
print(stop_words)

sentence1 = "I am learning NLP. It is one of the most popular library in Python"

sentence_words = word_tokenize(sentence1) # Tokenize the sentence
print(sentence_words)



# Stop Words
# Filtering stop words from the input string
sentence_no_stops = ' '.join([word for word in sentence_words if word not in stop_words]) 
print(sentence_no_stops)


# Text Normalization
# Replace words in string
sentence2 = "I visited MY from IND on 14-02-20"

normalized_sentence = sentence2.replace("MY", "Malaysia").replace("IND", "India").replace("-20", "-2020")
print(normalized_sentence)


# Spelling Corrections
# pip install autocorrect
from autocorrect import Speller # Library to check typos
spell = Speller(lang='en') # supported languages: en, pl, ru, uk, tr, es
help(Speller)


spell('Natureal') # Correct spelling is printed

sentence3 = word_tokenize("Ntural Luanguage Processin deals with the art of extracting insightes from Natural Languaes")
print(sentence3)

sentence_corrected = ' '.join([spell(word) for word in sentence3])
print(sentence_corrected)


# Stemming
stemmer = nltk.stem.PorterStemmer()

stemmer.stem("Programming")
stemmer.stem("Programs")

stemmer.stem("Jumping")
stemmer.stem("Jumper")

stemmer.stem("battling") # battl - stemming does not look into dictionary words
stemmer.stem("amazing")

# Lemmatization
# Lemmatization looks into dictionary words
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize('Programming')

lemmatizer.lemmatize('Programs')

lemmatizer.lemmatize('battling')

lemmatizer.lemmatize("amazing")


# Named Entity Recognition (NER)
# Chunking (Shallow Parsing) - Identifying named entities
nltk.download('maxent_ne_chunker')
nltk.download('words')
sentence4 = "We are learning nlp in Python by 360DigiTMG which is based out of India."

i = nltk.ne_chunk(nltk.pos_tag(word_tokenize(sentence4)), binary=True)
[a for a in i if len(a)==1]


# Sentence Tokenization
from nltk.tokenize import sent_tokenize
sent_tokenize("We are learning NLP in Python. Delivered by 360DigiTMG. Do you know where is it located? It is based out of India.")


# WSD
from nltk.wsd import lesk

sentence1 = "Keep your savings in the bank"
print(lesk(word_tokenize(sentence1), 'bank'))

sentence2 = "It's so risky to drive over the banks of the river"
print(lesk(word_tokenize(sentence2), 'bank'))

# "bank" as multiple meanings. 
# The definitions for "bank" can be seen here:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bank'): print(ss, ss.definition())


#######################################
1.	CC	Coordinating conjunction
2.	CD	Cardinal number
3.	DT	Determiner
4.	EX	Existential there
5.	FW	Foreign word
6.	IN	Preposition or subordinating conjunction
7.	JJ	Adjective
8.	JJR	Adjective, comparative
9.	JJS	Adjective, superlative
10.	LS	List item marker
11.	MD	Modal
12.	NN	Noun, singular or mass
13.	NNS	Noun, plural
14.	NNP	Proper noun, singular
15.	NNPS	Proper noun, plural
16.	PDT	Predeterminer
17.	POS	Possessive ending
18.	PRP	Personal pronoun
19.	PRP$	Possessive pronoun
20.	RB	Adverb
21.	RBR	Adverb, comparative
22.	RBS	Adverb, superlative
23.	RP	Particle
24.	SYM	Symbol
25.	TO	to
26.	UH	Interjection
27.	VB	Verb, base form
28.	VBD	Verb, past tense
29.	VBG	Verb, gerund or present participle
30.	VBN	Verb, past participle
31.	VBP	Verb, non-3rd person singular present
32.	VBZ	Verb, 3rd person singular present
33.	WDT	Wh-determiner
34.	WP	Wh-pronoun
35.	WP$	Possessive wh-pronoun
36.	WRB	Wh-adverb
###################################################

SyntaxError: invalid syntax (Temp/ipykernel_73312/3641687732.py, line 148)

In [3]:
# Tokenization

import re

sentence5 = 'Sharat tweeted, "Witnessing 70th Republic Day of India from Rajpath, \
New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official \
@indian_army #India #70thRepublic_Day. For more photos ping me sharat@photoking.com :)"'

sentence5.split()
re.sub(r'([^\s\w]|_)+', ' ', sentence5).split()


# Extracting n-grams
# n-grams can be extracted from 3 different techniques:
# listed below are:
# 1. Custom defined function
# 2. NLTK
# 3. TextBlob

# Extracting n-grams using customed defined function
import re
def n_gram_extractor(input_str, n):
    tokens = re.sub(r'([^\s\w]|_)+', ' ', input_str).split()
    for i in range(len(tokens)-n+1):
        print(tokens[i:i+n])

n_gram_extractor('The cute little boy is playing with the kitten.', 2)

n_gram_extractor('The cute little boy is playing with the kitten.', 3)


# Extracting n-grams with nltk
from nltk import ngrams
list(ngrams('The cute little boy is playing with the kitten.'.split(), 2))

list(ngrams('The cute little boy is playing with the kitten.'.split(), 3))


# Extracting n-grams using TextBlob
# TextBlob is a Python library for processing textual data.

# pip install textblob

from textblob import TextBlob
blob = TextBlob("The cute little boy is playing with the kitten.")

blob.ngrams(n=2)

blob.ngrams(n=3)


# Tokenizing texts with different packages: Keras, Textblob
sentence5 = 'Sharat tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sharat@photoking.com :)"'

# pip install tensorflow
# pip install keras

# Tokenization with Keras
from keras.preprocessing.text import text_to_word_sequence
text_to_word_sequence(sentence5)

# Tokenization with TextBlob
from textblob import TextBlob
blob = TextBlob(sentence5)
blob.words

# Tokenize sentences using other nltk tokenizers:
# 1. Tweet Tokenizer
# 2. MWE Tokenizer (Multi-Word Expression)
# 3. Regexp Tokenizer
# 4. Whitespace Tokenizer
# 5. Word Punct Tokenizer


# 1. Tweet tokenizer
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence5)

# 2. MWE Tokenizer (Multi-Word Expression)
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer([('Republic', 'Day')]) # Declaring set of words that are to be treated as one entity
mwe_tokenizer.add_mwe(('Indian', 'Army')) # Adding more words to the set

mwe_tokenizer.tokenize(sentence5.split()) #  Indian Army' should be treated as a single token. But here "Army!" is treated as a token. 

mwe_tokenizer.tokenize(sentence5.replace('!', '').split()) # "Army!" will be treated as Army 


# 3. Regexp Tokenizer
from nltk.tokenize import RegexpTokenizer
reg_tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence5)


# 4. Whitespace Tokenizer
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)


# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)


# Stemming
# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard in their innings ."
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')

' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])


# Porter Stemmer
sentence7 = "Before eating, it would be nice to sanitize your hands with a sanitizer"
from nltk.stem.porter import PorterStemmer
ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sentence7.split()])



# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

sentence8 = "The codes executed today are far better than what we execute generally."

' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(sentence8)])


# Singularize & Pluralize words
from textblob import TextBlob
sentence9 = TextBlob('She sells seashells on the seashore')
sentence9.words

sentence9.words[2].singularize()

sentence9.words[5].pluralize()


# Language Translation
# From Spanish to English

from textblob import TextBlob
en_blob = TextBlob(u'muy bien')
en_blob.translate(from_lang='es', to='en') 


# Custom Stop words removal
from nltk import word_tokenize
sentence9 = "She sells seashells on the seashore"
custom_stop_word_list = ['she', 'on', 'the', 'am', 'is', 'not']
' '.join([word for word in word_tokenize(sentence9) if word.lower() not in custom_stop_word_list])


# Extracting general features from raw texts

# Number of words
# Detect presence of wh words
# Polarity
# Subjectivity
# Language identification

import pandas as pd
df = pd.DataFrame([['The vaccine for covid-19 will be announced on 1st August.'],
                   ['Do you know how much expectation the world population is having from this research?'],
                   ['This risk of virus will end on 31st July.']])
df.columns = ['text']
df

# Number of words
from textblob import TextBlob
df['number_of_words'] = df['text'].apply(lambda x : len(TextBlob(x).words))
df['number_of_words']

# Detect presence of wh words
wh_words = set(['why', 'who', 'which', 'what', 'where', 'when', 'how'])
df['is_wh_words_present'] = df['text'].apply(lambda x : True if len(set(TextBlob(str(x)).words).intersection(wh_words)) > 0 else False)
df['is_wh_words_present']


# Polarity
df['polarity'] = df['text'].apply(lambda x : TextBlob(str(x)).sentiment.polarity)
df['polarity']

# Subjectivity
df['subjectivity'] = df['text'].apply(lambda x : TextBlob(str(x)).sentiment.subjectivity)
df['subjectivity']

# Language of the sentence
df['language'] = df['text'].apply(lambda x : TextBlob(str(x)).detect_language())
df['language']


# Bag of Words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['At least seven Indian pharma companies are working to develop a vaccine against coronavirus',
'the deadly virus that has already infected more than 14 million globally.',
'Bharat Biotech, Indian Immunologicals, are among the domestic pharma firms working on the coronavirus vaccines in India.'
]

bag_of_words_model = CountVectorizer()
print(bag_of_words_model.fit_transform(corpus).todense()) # bag of words

bag_of_word_df = pd.DataFrame(bag_of_words_model.fit_transform(corpus).todense())
bag_of_word_df.columns = sorted(bag_of_words_model.vocabulary_)
bag_of_word_df.head()

# Bag of word model for top 5 frequent terms
bag_of_words_model_small = CountVectorizer(max_features=5)
bag_of_word_df_small = pd.DataFrame(bag_of_words_model_small.fit_transform(corpus).todense())
bag_of_word_df_small.columns = sorted(bag_of_words_model_small.vocabulary_)
bag_of_word_df_small.head()

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer()
print(tfidf_model.fit_transform(corpus).todense())

tfidf_df = pd.DataFrame(tfidf_model.fit_transform(corpus).todense())
tfidf_df.columns = sorted(tfidf_model.vocabulary_)
tfidf_df.head()

# TFIDF for top 5 frequent terms
tfidf_model_small = TfidfVectorizer(max_features=5)
tfidf_df_small = pd.DataFrame(tfidf_model_small.fit_transform(corpus).todense())
tfidf_df_small.columns = sorted(tfidf_model_small.vocabulary_)
tfidf_df_small.head()


# Feature Engineering (Text Similarity)
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = WordNetLemmatizer()

pair1 = ["Do you have Covid-19","Your body temperature will tell you"]
pair2 = ["I travelled to Malaysia.", "Where did you travel?"]
pair3 = ["He is a programmer", "Is he not a programmer?"]

def extract_text_similarity_jaccard (text1, text2):
    words_text1 = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text1)]
    words_text2 = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text2)]
    nr = len(set(words_text1).intersection(set(words_text2)))
    dr = len(set(words_text1).union(set(words_text2)))
    jaccard_sim = nr/dr
    return jaccard_sim

extract_text_similarity_jaccard(pair1[0], pair1[1])
extract_text_similarity_jaccard(pair2[0], pair2[1])
extract_text_similarity_jaccard(pair3[0], pair3[1])

tfidf_model = TfidfVectorizer()

# Creating a corpus which will have texts of pair1, pair2 and pair 3 respectively
corpus = [pair1[0], pair1[1], pair2[0], pair2[1], pair3[0], pair3[1]]

tfidf_results = tfidf_model.fit_transform(corpus).todense()
# Note: Here tfidf_results will have tf-idf representation of 
# texts of pair1, pair2 and pair3 in the given order.

# tfidf_results[0], tfidf_results[1] represents pair1
# tfidf_results[2], tfidf_results[3] represents pair2
# tfidf_results[4], tfidf_results[5] represents pair3

#cosine similarity between texts of pair1
cosine_similarity(tfidf_results[0], tfidf_results[1])

#cosine similarity between texts of pair2
cosine_similarity(tfidf_results[2], tfidf_results[3])

#cosine similarity between texts of pair3
cosine_similarity(tfidf_results[4], tfidf_results[5])


['The', 'cute']
['cute', 'little']
['little', 'boy']
['boy', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']
['The', 'cute', 'little']
['cute', 'little', 'boy']
['little', 'boy', 'is']
['boy', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


ModuleNotFoundError: No module named 'textblob'

In [None]:
import requests   # Importing requests to extract content from a url
from bs4 import BeautifulSoup as bs # Beautifulsoup is for web scrapping...used to scrap specific content 
import re

from wordcloud import WordCloud
import matplotlib.pyplot as plt


# creating empty reviews list
oneplus_reviews=[]

for i in range(1,21):
  ip=[]  
  url="https://www.amazon.in/OnePlus-Silver-Storage-hands-free-capable/product-reviews/B09MQBRCSZ/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber="+str(i)  
  response = requests.get(url)
  soup = bs(response.content,"html.parser")# creating soup object to iterate over the extracted content 
  reviews = soup.find_all("span", attrs={"class","a-size-base review-text review-text-content"})# Extracting the content under specific tags  
  for i in range(len(reviews)):
    ip.append(reviews[i].text)  
 
  oneplus_reviews = oneplus_reviews + ip  # adding the reviews of one page to empty list which in future contains all the reviews

# writng reviews in a text file 
with open("oneplus.txt", "w", encoding='utf8') as output:
    output.write(str(oneplus_reviews))
	

# Joinining all the reviews into single paragraph 
ip_rev_string = " ".join(oneplus_reviews)

import nltk
# from nltk.corpus import stopwords

# Removing unwanted symbols incase if exists
ip_rev_string = re.sub("[^A-Za-z" "]+", " ", ip_rev_string).lower()
# ip_rev_string = re.sub("[0-9" "]+"," ", ip_rev_string)

# words that contained in the reviews
ip_reviews_words = ip_rev_string.split(" ")

ip_reviews_words = ip_reviews_words[1:]

#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1))
X = vectorizer.fit_transform(ip_reviews_words)


with open("C:\\Data\\textmining\\stop.txt", "r") as sw:
    stop_words = sw.read()
    
stop_words = stop_words.split("\n")

stop_words.extend(["oneplus","mobile","time","android","phone","device","product","day"])

ip_reviews_words = [w for w in ip_reviews_words if not w in stop_words]

# Joinining all the reviews into single paragraph 
ip_rev_string = " ".join(ip_reviews_words)

# WordCloud can be performed on the string inputs.
# Corpus level word cloud

wordcloud_ip = WordCloud(background_color='White',
                      width=1800,
                      height=1400
                     ).generate(ip_rev_string)
plt.imshow(wordcloud_ip)

# positive words # Choose the path for +ve words stored in system
with open("C:\\Data\\textmining\\positive-words.txt", "r") as pos:
  poswords = pos.read().split("\n")

# Positive word cloud
# Choosing the only words which are present in positive words
ip_pos_in_pos = " ".join ([w for w in ip_reviews_words if w in poswords])

wordcloud_pos_in_pos = WordCloud(
                      background_color='White',
                      width=1800,
                      height=1400
                     ).generate(ip_pos_in_pos)
plt.figure(2)
plt.imshow(wordcloud_pos_in_pos)

# negative words Choose path for -ve words stored in system
with open("C:\\Data\\textmining\\negative-words.txt", "r") as neg:
  negwords = neg.read().split("\n")

# negative word cloud
# Choosing the only words which are present in negwords
ip_neg_in_neg = " ".join ([w for w in ip_reviews_words if w in negwords])

wordcloud_neg_in_neg = WordCloud(
                      background_color='black',
                      width=1800,
                      height=1400
                     ).generate(ip_neg_in_neg)
plt.figure(3)
plt.imshow(wordcloud_neg_in_neg)

#################################################################
# Joinining all the reviews into single paragraph 
ip_rev_string = " ".join(oneplus_reviews)

# wordcloud with bigram
nltk.download('punkt')
from wordcloud import WordCloud, STOPWORDS

WNL = nltk.WordNetLemmatizer()

# Lowercase and tokenize
text = ip_rev_string.lower()

# Remove single quote early since it causes problems with the tokenizer.
text = text.replace("'", "")

tokens = nltk.word_tokenize(text)
text1 = nltk.Text(tokens)

# Remove extra chars and remove stop words.
text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word)) for word in text1]

# Create a set of stopwords
stopwords_wc = set(STOPWORDS)
customised_words = ['price', 'great', '9rt'] # If you want to remove any particular word form text which does not contribute much in meaning

new_stopwords = stopwords_wc.union(customised_words)

# Remove stop words
text_content = [word for word in text_content if word not in new_stopwords]

# Take only non-empty entries
text_content = [s for s in text_content if len(s) != 0]

# Best to get the lemmas of each word to reduce the number of similar words
text_content = [WNL.lemmatize(t) for t in text_content]

# nltk_tokens = nltk.word_tokenize(text)  
bigrams_list = list(nltk.bigrams(text_content))
print(bigrams_list)

dictionary2 = [' '.join(tup) for tup in bigrams_list]
print (dictionary2)

# Using count vectoriser to view the frequency of bigrams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2, 2))
bag_of_words = vectorizer.fit_transform(dictionary2)
vectorizer.vocabulary_

sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
print(words_freq[:100])

# Generating wordcloud
words_dict = dict(words_freq)
WC_height = 1000
WC_width = 1500
WC_max_words = 100
wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width, stopwords=new_stopwords)

wordCloud.generate_from_frequencies(words_dict)
plt.figure(4)
plt.title('Most frequently occurring bigrams connected by same colour and font size')
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis("off")
plt.show()
