Imports

In [1]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

Cleaning HTML

In [2]:
def clean_html(html):

    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

In [3]:
a = """<p>My favorite color is <del>blue</del> <ins>red</ins>.</p>"""
clean_html(a)

'My favorite color is blue red .'



1.   Make the text lowercase. As you probably know, NLP is case-sensitive.
2.   Remove line breaks. Again, depending on your source, you might have encoded line breaks.
3.   Remove punctuation. This is using the string library. Other punctuation can be added as needed.
4.   Remove stop words using the NLTK library. There is a list in the next line to add additional stop words to the function as needed. These might be noisy domain words or anything else that makes the contextless clear.
5.  Removing numbers. Optional depending on your data.
6.  Stemming or Lemmatization. This process is an argument in the function. You can choose either one via with Stem or Lem. The default is to use none.


<H3>punctuation</H3>
INPUT:
`hey amazon - my package never arrived https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first please fix asap! @amazonhelp`
<br>
OUTPUT:
`hey amazon my package never arrived please fix asap`

In [None]:
import string

text = "I had such high hopes for this dress 15 size or (my usual size) to work for me."
PUNCT_TO_REMOVE = string.punctuation
ans = text.translate(str.maketrans(",", PUNCT_TO_REMOVE))
print(ans)

Replacing the Repetitions of Punctuations

In [None]:
text1 = "I had such... high hopes for this dress!!!!"
ans = re.sub(r'(!)1+', ' ', text1)
print(ans)

I had such... high hopes for this dress!!!!


Normalizing Text

In [None]:
text = "Hey Amazon - my package never arrived https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first FIX THIS ASAP! @AmazonHelp"

text = text.lower()

print(text)

hey amazon - my package never arrived https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first fix this asap! @amazonhelp


Removing Unicode Characters

In [None]:
import re

text = "hey amazon - my package never arrived https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first please fix asap! @amazonhelp"

text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

print(text)

hey amazon  my package never arrived  please fix asap amazonhelp


Removing Extra Space

In [None]:
text = "I had such high hopes for         this dress 15 size     or    (my          usual            size) to work for me."
ans = " ".join(text.split())
print(ans)

I had such high hopes for this dress 15 size or (my usual size) to work for me.


Removing Numbers

In [None]:
text = "I had such high hopes for this dress 15 size or (my usual size) to work for me."
ans = ''.join([i for i in text if not i.isdigit()])
print(ans)

I had such high hopes for this dress  size or (my usual size) to work for me.


Remove Stopwords

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stop = stopwords.words('english')
text = "my package from amazon never arrived fix this asap"
text = " ".join([word for word in text.split() if word not in (stop)])

print(text)

package amazon never arrived fix asap


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
text = "I had such high hopes for this dress 1-5 size to work for me."
STOPWORDS = set(stopwords.words('english'))
ans = " ".join([word for word in str(text).split() if word not in STOPWORDS])
ans

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'I high hopes dress 1-5 size work me.'

**Stemming**, the simpler of the two, groups words by their root stem. This allows us to recognize that ‘jumping’ ‘jumps’ and ‘jumped’ are all rooted to the same verb (jump) and thus are referring to similar problems.
<Br>
**Lemmatization**, on the other hand, groups words based on root definition, and allows us to differentiate between present, past, and indefinite.
So, ‘jumps’ and ‘jump’ are grouped into the present ‘jump’, as different from all uses of ‘jumped’ which are grouped together as past tense, and all instances of ‘jumping’ which are grouped together as the indefinite (meaning continuing/continuous).



Stemming

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

words = ["jump", "jumped", "jumps", "jumping"]
stemmer = PorterStemmer()
for word in words:
  print(word + " = " + stemmer.stem(word))

jump = jump
jumped = jump
jumps = jump
jumping = jump


Lemmatazing

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

words = ["jump", "jumped", "jumps", "jumping"]
lemmatizer = WordNetLemmatizer()
for word in words:
  print(word + " = " + lemmatizer.lemmatize(word))

jump = jump
jumped = jumped
jumps = jump
jumping = jumping


Removing Emojis

In [None]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"U0001F600-U0001F64F"  # emoticons
                           u"U0001F300-U0001F5FF"  # symbols & pictographs
                           u"U0001F680-U0001F6FF"  # transport & map symbols
                           u"U0001F1E0-U0001F1FF"  # flags (iOS)
                           u"U00002702-U000027B0"
                           u"U000024C2-U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
remove_emoji("game is on 🔥🔥")

Removing Emoticons


In [None]:
EMOTICONS = {
u”:‑)”:”Happy face or smiley”,
u”:)”:”Happy face or smiley”,
u”:-]”:”Happy face or smiley”,
u”:]”:”Happy face or smiley”,
u”:-3″:”Happy face smiley”,
u”:3″:”Happy face smiley”,
u”:->”:”Happy face smiley”,
u”:>”:”Happy face smiley”,
u”8-)”:”Happy face smiley”,
u”:o)”:”Happy face smiley”,
u”:-}”:”Happy face smiley”,
u”:}”:”Happy face smiley”,
u”:-)”:”Happy face smiley”,
u”:c)”:”Happy face smiley”,
u”:^)”:”Happy face smiley”,
u”=]”:”Happy face smiley”
}
text = "‘"I had such high hopes for this dress 15 size really wanted it to work for me :-)’
ans = re.compile(u'(‘ + u’|’.join(k for k in EMOTICONS) + u’)’)
ans = ans.sub(r”,text)
-----------

def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text
text = "Hello :-)"
convert_emoticons(text)

Removing Contractions

In [None]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [None]:
import contractions
text = "She'd like to know how I'd do that!"
contractions.fix(text)

'She would like to know how I would do that!'

Removing HTML Tags

In [None]:
import re
text = "I had such high hopes for this dress 15 size or (my usual size) to work for me."

without_html = re.sub(pattern=r'', repl='', string=text)
print(f"{without_html}")


I had such high hopes for this dress 15 size or (my usual size) to work for me.


Removing & Finding URL

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is http://abcgmail.com'
doc = nlp(text)
for token in doc:
    if token.like_url:
        print(token)

http://abcgmail.com


In [None]:
text = 'Look at this link http://abcgmail.com for work purpose https://abd.com'
text_sp = text.split()
ans = ' '.join([i for i in text_sp if 'ht' not in i])
ans

'Look at this link for work purpose'

Removing & Finding Email id

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is abc@gmail.com'
doc = nlp(text)
for token in doc:
    if token.like_email:
        print(token)

abc@gmail.com


In [None]:
text = 'My email is abc@gmail.com for work purpose'
text_sp = text.split()
ans = ' '.join([i for i in text_sp if '@' not in i])
ans

'My email is for work purpose'

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = 'My email is abc@gmail.com'
doc = nlp(text)
for token in doc:
    if not token.like_email:
        print(token)

My
email
is


Standardizing and Spell Check (autocorrect library)

In [None]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=c6fc3094972ed0a017b7586b034ea8093464799406d6b8081e7998acfb88e389
  Stored in directory: /root/.cache/pip/wheels/b5/7b/6d/b76b29ce11ff8e2521c8c7dd0e5bfee4fb1789d76193124343
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [None]:
import itertools
from autocorrect import Speller
text="A farmmer will lovdd this food"
#One letter in a word should not be present more than twice in continuation
text_correction = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
print("Normal Text:n{}".format(text_correction))
spell = Speller(lang='en')
ans = spell(text_correction)
print("After correcting text:n{}".format(ans))

Normal Text:nA farmmer will lovdd this food
After correcting text:nA farmer will loved this food


Chat Words Conversion

In [None]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace"""

In [None]:
chat_words_map_dict = {}
chat_words_list = []
for line in chat_words_str.split("n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

Text prepration with **Spacy**

In [None]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

def clean_string(text, stem="None"):

    final_string = ""

    # Make lower
    text = text.lower()

    # Remove line breaks
    text = re.sub(r'\n', '', text)

    # Remove puncuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']

    text_filtered = [word for word in text if not word in useless_words]

    # Remove numbers
    text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text_filtered]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    return final_string

Example

In [None]:
exs = """<p><a href="https://forge.autodesk.com/en/docs/data/v2/tutorials/download-file/#step-6-download-the-item" rel="nofollow noreferrer">https://forge.autodesk.com/en/docs/data/v2/tutorials/download-file/#step-6-download-the-item</a></p>\n\n<p>I have followed the tutorial and have successfully obtained the contents of the file, but where is the file being downloaded. In addition, how do I specify the location of where I want to download the file?</p>\n\n<p>Result on Postman\n<a href="https://i.stack.imgur.com/VrdqP.png" rel="nofollow noreferrer"><img src="https://i.stack.imgur.com/VrdqP.png" alt="enter image description here"></a></p>\n"""

In [None]:
x = clean_html(exs)
clean_string(x, stem='Stem')

Our Resource:<br>
https://www.dataknowsall.com/textcleaning.html<br>
https://monkeylearn.com/blog/text-cleaning/<br>
https://www.analyticsvidhya.com/blog/2022/01/text-cleaning-methods-in-nlp/<br>
https://www.analyticsvidhya.com/blog/2022/02/text-cleaning-methods-in-nlp-part-2/