**Install Dependencies**

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

**Case Conversion**

In [None]:
text = 'The quick brown fox jumped over The Big Dog'
text

'The quick brown fox jumped over The Big Dog'

In [None]:
text.lower()

'the quick brown fox jumped over the big dog'

In [None]:
text.upper()

'THE QUICK BROWN FOX JUMPED OVER THE BIG DOG'

In [None]:
text.title()

'The Quick Brown Fox Jumped Over The Big Dog'

**Tokenization**

In [None]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. "
               "The US has unveiled the world's most powerful supercomputer called 'Summit', "
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

In [None]:
#In this code it seperated the lines where full stop (.) ends
import nltk

nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [None]:
#It seperates every word
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


In [None]:
#1-This line imports the spaCy library, which is a powerful and efficient NLP library in Python.
import spacy
#2-Load English Model:
#This line loads the English language model "en_core_web_sm" provided by spaCy. The model includes pre-trained statistical models and word vectors for
#the English language. "sm" stands for "small," indicating that it's a smaller model suitable for most general-purpose NLP tasks.
nlp = spacy.load("en_core_web_sm")
#3-Process Text:
#This line processes the sample_text using the loaded spaCy model. It tokenizes the text, performs part-of-speech tagging, entity recognition,
#and other linguistic annotations. The result is stored in the text_spacy object, which is a spaCy Doc object.
text_spacy = nlp(sample_text)

In [None]:
#This line of code extracts the individual sentences from the spaCy processed text (text_spacy). Let's break down how this code achieves that
[obj.text for obj in text_spacy.sents]

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [13]:
print([obj.text for obj in text_spacy])

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'", 'Summit', "'", ',', 'beating', 'the', 'previous', 'record', '-', 'holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


**Removing HTML tags & noise**

In [14]:
import requests

data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html')
content = data.text
print(content[2745:3948])

ne spacing ("leading") */
    }
body > p {
    /* paras at <body> level - not in <div> or <table>  */
    text-align: justify;
    /* or left?? */
    text-indent: 1em;
    /* first-line indent */
    }
/* suppress indentation on paragraphs following heads  */
h2 + p, h3 + p, h4 + p {
    text-indent: 0
    }
/* tighter spacing for list item paragraphs */
dd, li {
    margin-top: 0.25em;
    margin-bottom: 0;
    line-height: 1.2em;
    /* a bit closer than p's */
    }
/* ************************************************************************
 * Head 2 is for chapter heads. 
 * ********************************************************************** */
h2 {
    /* text-align:center;  left-aligned by default. */
    margin-top: 3em;
    /* extra space above.. */
    margin-bottom: 2em;
    /* ..and below */
    clear: both;
    /* don't let sidebars overlap */
    }
/* ************************************************************************
 * Head 3 is 

In [15]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.
01:001:006 And God said, Let there be a firmament in the midst of the
           waters, and let it divide the waters from the waters.
01:001:007 And God made the firmament, and divided the waters which were
           under the firmament from the waters which were above the
           firmament: and it was so.
01:001:008 And God called the 


**Removing Accented Characters**

In [16]:
import unicodedata

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [17]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [18]:
remove_accented_chars(s)

'Some Accented text'

**Removing Special Characters, Numbers and Symbols**

In [19]:
import re

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

In [20]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [21]:
remove_special_characters(s, remove_digits=True)

'Well this was fun See you at  What do you think  '

In [22]:
remove_special_characters(s)

'Well this was fun See you at 730 What do you think 9318 '

**Expanding Contractions**

In [23]:
!pip install contractions
!pip install textsearch

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [24]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [25]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [26]:
contractions.fix(s)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

**Stemming**

In [27]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [28]:
ps.stem('lying')

'lie'

In [29]:
ps.stem('strange')

'strang'

**Lemmatization**

In [30]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [31]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [32]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

car
box


In [33]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [34]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy


In [35]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


In [36]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

**Tokenize**

In [37]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [38]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

**POS Tagging**

In [39]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


**Tag conversion to WordNet Tags**

In [40]:
from nltk.corpus import wordnet

def pos_tag_wordnet(tagged_tokens):
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    new_tagged_tokens = [(word, tag_map.get(tag[0].lower(), wordnet.NOUN))
                            for word, tag in tagged_tokens]
    return new_tagged_tokens

In [41]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

[('The', 'n'), ('brown', 'a'), ('foxes', 'n'), ('are', 'v'), ('quick', 'a'), ('and', 'n'), ('they', 'n'), ('are', 'v'), ('jumping', 'v'), ('over', 'n'), ('the', 'n'), ('sleeping', 'v'), ('lazy', 'a'), ('dogs', 'n'), ('!', 'n')]


**Effective Lemmatization**


In [42]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

'The brown fox be quick and they be jump over the sleep lazy dog !'

**Your turn: Define a function such that you put all the above steps together so that it does the following**

*Function name is wordnet_lemmatize_text(...)

Input is a variable text which should take in a document (bunch of words)

Call the earlier defined functions and utilize them

Return lemmatized text as the output (as a string)*

In [43]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    wordnet_tokens = pos_tag_wordnet(tagged_tokens)
    lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
    return lemmatized_text

**Your Turn: Now call the function on the below sentence and test it**

In [44]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [45]:
wordnet_lemmatize_text(s)

'The brown fox be quick and they be jump over the sleep lazy dog !'

**Lemmatization with Spacy**

In [49]:
import spacy

# Load the spaCy model without disabling components
nlp = spacy.load("en_core_web_sm")

# Disable parsing, tagging, and entity recognition
nlp.disable_pipe("parser")
nlp.disable_pipe("tagger")
nlp.disable_pipe("ner")

def spacy_lemmatize_text(text):
    # Process the text using the modified spaCy model
    text = nlp(text)

    # Lemmatize the text
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])

    return text


In [50]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [51]:
spacy_lemmatize_text(s)



'the brown foxes are quick and they are jumping over the sleeping lazy dogs !'

**Stopword Removal**

In [52]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]

    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [53]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [54]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

**Your turn: Remove the words 'the' and 'brown' from the stop_words list and call the function with this new list**

In [55]:
stop_words.remove('the')
stop_words.append('brown')

In [56]:
remove_stopwords(s, is_lower_case=False, stopwords=stop_words)

'The foxes quick jumping the sleeping lazy dogs !'