# **Import Regex Library**

In [1]:
import re
print(re.__version__)

2.2.1


# **Detect and Remove HTML Tags**

In [2]:
text = "<p>Follow this <b>website</b> for more details. </p>"
x = re.findall("<.*?>", text)
print(x)
z = re.sub("<.*?>", "", text)
print(z)

['<p>', '<b>', '</b>', '</p>']
Follow this website for more details. 


# **Removing special characters and keeping only alphabets and numbers**

In [3]:
text = "2022 #Partner of the Year Finalist Learning #Award%%"
x = re.sub("[^a-zA-Z0-9]+", " ", text)
print(x)

2022 Partner of the Year Finalist Learning Award 


# **Keeping only alphabets**

In [4]:
text = "2022 #Partner of the Year Finalist Learning #Award%%"
x = re.sub("[^a-zA-Z]+", " ", text)
print(x)

 Partner of the Year Finalist Learning Award 


# **Detect and Remove URLs**

In [5]:
text = "Visit www.cloudthat.com and login to https://www.skillpipe.com/#/account/login"
x = re.findall("https?://\S+|www\.\S+", text)
print(x)
z = re.sub("https?://\S+|www\.\S+", "", text)
print(z)

['www.cloudthat.com', 'https://www.skillpipe.com/#/account/login']
Visit  and login to 


# **Detect and Remove Email IDs**

In [6]:
text = "Please send your feedback to ctml@cloudthat.com or ctml123@gmail.com"
x = re.findall("[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}", text)
print(x)
z = re.sub("[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}", "", text)
print(z)

['ctml@cloudthat.com', 'ctml123@gmail.com']
Please send your feedback to  or 


# **Replacing Multi-Spaces**

In [7]:
text = "2022 Partner              of the Year          Finalist Learning Award"
x = re.sub("\s+", " ", text)
print(x)

2022 Partner of the Year Finalist Learning Award


# **Import NLTK**

In [8]:
import nltk
print(nltk.__version__)

3.8.1


# **Tokenization**

In [9]:
nltk.download('punkt')

paragraph = """Diversity is an important issue for any modern business, but it’s not enough to simply hire people of different nationalities, races, genders and sexual orientations. 
                Everyone needs to feel welcome, safe and free to be themselves in the workplace. 
                If you focus on diversity, equity and inclusion (DEI) in your workplace, your business’s culture and bottom line will benefit. 
                Inclusive workplaces go the extra mile to consider the safety and comfortability of all employees, especially those in marginalized groups. 
                For example, gendered bathrooms have the potential to make transgender and gender-nonconforming employees uncomfortable, especially in light of controversial “bathroom bills” in multiple states that could or already do impact transgender people’s rights. 
                On a broader level, inclusive spaces can be created simply by spending time with one another. 
                Consider hosting team lunches and other informal events where employees can casually connect with each other. 
                If your company is bigger, creating an in-office support group or network for diverse employees can help them connect with others who share their experiences."""
               
sentences = nltk.sent_tokenize(paragraph)
print(sentences)
print(len(sentences))

words = nltk.word_tokenize(paragraph)
print(words)
print(len(words))

[nltk_data] Downloading package punkt to /home/amits/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Diversity is an important issue for any modern business, but it’s not enough to simply hire people of different nationalities, races, genders and sexual orientations.', 'Everyone needs to feel welcome, safe and free to be themselves in the workplace.', 'If you focus on diversity, equity and inclusion (DEI) in your workplace, your business’s culture and bottom line will benefit.', 'Inclusive workplaces go the extra mile to consider the safety and comfortability of all employees, especially those in marginalized groups.', 'For example, gendered bathrooms have the potential to make transgender and gender-nonconforming employees uncomfortable, especially in light of controversial “bathroom bills” in multiple states that could or already do impact transgender people’s rights.', 'On a broader level, inclusive spaces can be created simply by spending time with one another.', 'Consider hosting team lunches and other informal events where employees can casually connect with each other.', 'If 

In [10]:
from nltk.util import ngrams

n = 3
sentence = 'You will face many defeats in life, but never let yourself be defeated.'
bigrams = ngrams(sentence.split(), n)

for item in bigrams:
    print(item)

('You', 'will', 'face')
('will', 'face', 'many')
('face', 'many', 'defeats')
('many', 'defeats', 'in')
('defeats', 'in', 'life,')
('in', 'life,', 'but')
('life,', 'but', 'never')
('but', 'never', 'let')
('never', 'let', 'yourself')
('let', 'yourself', 'be')
('yourself', 'be', 'defeated.')


# **Printing Stop Words in English**

In [11]:
from nltk.corpus import stopwords
nltk.download('stopwords')

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/amits/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Stemming**

In [12]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

paragraph = """Diversity is an important issue for any modern business, but it’s not enough to simply hire people of different nationalities, races, genders and sexual orientations. 
                Everyone needs to feel welcome, safe and free to be themselves in the workplace. 
                If you focus on diversity, equity and inclusion (DEI) in your workplace, your business’s culture and bottom line will benefit. 
                Inclusive workplaces go the extra mile to consider the safety and comfortability of all employees, especially those in marginalized groups. 
                For example, gendered bathrooms have the potential to make transgender and gender-nonconforming employees uncomfortable, especially in light of controversial “bathroom bills” in multiple states that could or already do impact transgender people’s rights. 
                On a broader level, inclusive spaces can be created simply by spending time with one another. 
                Consider hosting team lunches and other informal events where employees can casually connect with each other. 
                If your company is bigger, creating an in-office support group or network for diverse employees can help them connect with others who share their experiences."""              

sentences = nltk.sent_tokenize(paragraph)
stemmer = PorterStemmer()
corpus = []

for i in range(len(sentences)):
    sentence = re.sub("[^a-zA-Z]", " ", sentences[i])
    sentence = sentence.lower()
    words = nltk.word_tokenize(sentence)
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentence = ' '.join(words)
    corpus.append(sentence)  

print(corpus)    

['divers import issu modern busi enough simpli hire peopl differ nation race gender sexual orient', 'everyon need feel welcom safe free workplac', 'focu divers equiti inclus dei workplac busi cultur bottom line benefit', 'inclus workplac go extra mile consid safeti comfort employe especi margin group', 'exampl gender bathroom potenti make transgend gender nonconform employe uncomfort especi light controversi bathroom bill multipl state could alreadi impact transgend peopl right', 'broader level inclus space creat simpli spend time one anoth', 'consid host team lunch inform event employe casual connect', 'compani bigger creat offic support group network divers employe help connect other share experi']


# **Lemmatization**

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')

paragraph = """Thank you all so very much. Thank you to the Academy. 
               Thank you to all of you in this room. I have to congratulate 
               the other incredible nominees this year. The Revenant was 
               the product of the tireless efforts of an unbelievable cast
               and crew. First off, to my brother in this endeavor, Mr. Tom 
               Hardy. Tom, your talent on screen can only be surpassed by 
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience. Thank you to everybody at 
               Fox and New Regency … my entire team. I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you. And to my 
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(sentences)):
    sentence = re.sub("[^a-zA-Z]", " ", sentences[i])
    sentence = sentence.lower()
    words = nltk.word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentence = ' '.join(words)
    corpus.append(sentence)  

print(corpus)  

[nltk_data] Downloading package wordnet to /home/amits/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/amits/nltk_data...


['thank much', 'thank academy', 'thank room', 'congratulate incredible nominee year', 'revenant product tireless effort unbelievable cast crew', 'first brother endeavor mr tom hardy', 'tom talent screen surpassed friendship screen thank creating ranscendent cinematic experience', 'thank everybody fox new regency entire team', 'thank everyone onset career parent none would possible without', 'friend love dearly know', 'lastly want say making revenant man relationship natural world', 'world collectively felt hottest year recorded history', 'production needed move southern tip planet able find snow', 'climate change real happening right', 'urgent threat facing entire specie need work collectively together stop procrastinating', 'need support leader around world speak big polluter speak humanity indigenous people world billion billion underprivileged people would affected', 'child child people whose voice drowned politics greed', 'thank amazing award tonight', 'let u take planet granted', 