## Nlp Data preprocessing

In [1]:
import re # importing regex

In [2]:
def html_remove(text):
    html_tag = re.sub(r'<.*?>|\d+', '', text)
    return html_tag

data = """<p>Hello, this is a small paragraph with HTML tags. I am using the <strong>strong</strong> tag to make some text bold, and the <em>em</em> tag to make some text italic. I can also use the <br> tag to insert a line break.</p>"""
result = html_remove(data)
result

'Hello, this is a small paragraph with HTML tags. I am using the strong tag to make some text bold, and the em tag to make some text italic. I can also use the  tag to insert a line break.'

In [3]:
def url_remove(text):
    url_tag = re.sub(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*", '', text)
    return url_tag
txt_data = """https://www.w3schools.com/"Hi this is test of remove url link!"""
result2 = url_remove(txt_data)
result2

'"Hi this is test of remove url link!'

In [4]:
def punc_remove(text):
    punc_tags = re.sub(r"[^\w\s]", '', text)
    return punc_tags
punc_data = """"The bustling city?, with its towering skyscrapers and crowded streets, hums with energy and excitement!. People rush to and fro, immersed in the rhythm of urban life. The honking of cars, the chatter of pedestrians, and the occasional sirens create a symphony of city sounds."""
punc_result = punc_remove(punc_data)
punc_result

'The bustling city with its towering skyscrapers and crowded streets hums with energy and excitement People rush to and fro immersed in the rhythm of urban life The honking of cars the chatter of pedestrians and the occasional sirens create a symphony of city sounds'

## Tokenization

- Sentence tokenize using `nltk` and `spacy`

In [5]:
from nltk.tokenize import sent_tokenize

**Sentence tokenize using Nltk**

In [6]:
def sentence_tokenize(sent):
    sentence = sent_tokenize(sent)
    return sentence
sent_data = "This is example of sentence tokenize. It is use to convert corpus into meaningfull sentence. it is a part of text preprocessing."
sent_result = sent_tokenize(sent_data)
sent_result

['This is example of sentence tokenize.',
 'It is use to convert corpus into meaningfull sentence.',
 'it is a part of text preprocessing.']

**sentence Tokenization using Spacy**

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_sen_token(text):
    doc = nlp(text)
    sentence = [sent.text for sent in doc.sents]
    return sentence
spacy_sen_txt = "It is use to convert corpus into meaningfull sentence. it is a part of text preprocessing. This is example of sentence tokenize."
spacy_result = spacy_sen_token(spacy_sen_txt)
spacy_result

['It is use to convert corpus into meaningfull sentence.',
 'it is a part of text preprocessing.',
 'This is example of sentence tokenize.']

**word tokenize**
- word tokenize using `nltk` and `spacy`

In [8]:
from nltk.tokenize import word_tokenize

In [9]:
def word_token(text):
    word = word_tokenize(text)
    return word
word_token_data = "This is example of word tokenization using nltk"
result_word_tokenize = word_token(word_token_data)
result_word_tokenize

['This', 'is', 'example', 'of', 'word', 'tokenization', 'using', 'nltk']

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')
def word_spacy(text):
    doc = nlp(text)
    words = [word.text for word in doc]
    return words
spacy_word_data = "This is example of word tokenize using spacy and it is a powerfull techinique of word tokenize"
spacy_word_token_result = word_spacy(spacy_word_data)
spacy_word_token_result

['This',
 'is',
 'example',
 'of',
 'word',
 'tokenize',
 'using',
 'spacy',
 'and',
 'it',
 'is',
 'a',
 'powerfull',
 'techinique',
 'of',
 'word',
 'tokenize']

## Remove stop words

**Remove stop words using nltk**

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ak352\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ak352\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def remove_stop_words(words):
    stop_words = (stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

original_tokens = ['This','is','example','of','word','tokenize','using','spacy','and','it','is','a','powerfull','techinique','of','word','tokenize']

result_of_stopword = remove_stop_words(original_tokens)
print(result_of_stopword)

['example', 'word', 'tokenize', 'using', 'spacy', 'powerfull', 'techinique', 'word', 'tokenize']


**Remove stop words using spacy**

In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

def remove_stopword_spacy(text):
    doc = nlp(text)
    filtered_words = [token.text for token in doc if not token.is_stop]
    return filtered_words

original_data = "The bustling city with its towering skyscrapers and crowded streets hums with energy and excitement People rush to and fro immersed in the rhythm of urban life The honking of cars the chatter of pedestrians and the occasional sirens create a symphony of city sounds"
spacy_stop_word = remove_stopword_spacy(original_data)
spacy_stop_word

['bustling',
 'city',
 'towering',
 'skyscrapers',
 'crowded',
 'streets',
 'hums',
 'energy',
 'excitement',
 'People',
 'rush',
 'fro',
 'immersed',
 'rhythm',
 'urban',
 'life',
 'honking',
 'cars',
 'chatter',
 'pedestrians',
 'occasional',
 'sirens',
 'create',
 'symphony',
 'city',
 'sounds']

**Remove stopwords using custom stopwords list**

In [14]:
def custom_stop_word(text, custom_stopword):
    words = word_tokenize(text)
    word_filter = [word for word in words if word.lower() not in custom_stopword]
    return word_filter

custom_data = ['a', 'in', 'and', 'not', 'is']

data_text = 'This is not a custom data'
result_custom = custom_stop_word(data_text, custom_data)
result_custom

['This', 'custom', 'data']

## Stemming
- It will give root word

**Stemming using Porterstemmer**

In [15]:
from nltk.stem import PorterStemmer
def port_stem(text):
    stems = PorterStemmer()
    stemm_word = stems.stem(text)
    return stemm_word

stem_text = ['history', 'stories', 'stemming', 'going', 'runing', 'cried']
for i in stem_text:
    print(port_stem(i))
# stem_result = port_stem(stem_text)
# stem_result

histori
stori
stem
go
rune
cri


**Porterstemmer using spacy**

In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

def spacy_port_stem(text):
    doc = nlp(text)
    port_stem = PorterStemmer()
    stemm_sen = ' '.join([port_stem.stem(token.text) for token in doc])
    return stemm_sen
stem_text = ['loving','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for word in stem_text:
    print(spacy_port_stem(word))

love
contribut
histori
stori
stem
go
rune
cri
gone


**Stemming using Lancaster Stemmer**

In [17]:
from nltk.stem import LancasterStemmer
def lanc_stemmer(text):
    lanc_stem = LancasterStemmer()
    word_lancstem = lanc_stem.stem(text)
    return word_lancstem

lanc_text = ['loving','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for j in stem_text:
    print(lanc_stemmer(j))

lov
contribut
hist
story
stem
going
run
cri
gon


**Stemming using Lancaster Stemmer with spacy**

In [18]:
nlp = spacy.load('en_core_web_sm')
def lanc_stem_spacy(sentence):
    doc = nlp(sentence)
    lanc_stem = LancasterStemmer()
    lanc_sentence = ' '.join([lanc_stem.stem(token.text) for token in doc])
    return lanc_sentence

lancstem_text = ['loving','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for k in lancstem_text:
    print(lanc_stem_spacy(k))

lov
contribut
hist
story
stem
going
run
cri
gon


**Stemming using Snowball Stemmer**

In [19]:
from nltk.stem import SnowballStemmer

def snow_ball_stemmer(sent, language='english'):
    snow_ball = SnowballStemmer(language)
    sent_snowball = snow_ball.stem(sent)
    return sent_snowball


data = ['babies','loved','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for a in data:
    print(lanc_stem_spacy(a))

baby
lov
contribut
hist
story
stem
going
run
cri
gon


In [20]:
from nltk.stem import SnowballStemmer

def snow_ball_stemmer(sent, language='english'):
    snow_ball = SnowballStemmer(language)
    sent_snowball = snow_ball.stem(sent)
    return sent_snowball

data = ['babies','loved','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for word in data:
    print(snow_ball_stemmer(word))

babi
love
contribut
histori
stori
stem
go
rune
cri
gone


In [21]:
# import pandas as pd
# from nltk.stem import SnowballStemmer

# def snow_ball_stemmer(sent, language='english'):
#     snow_ball = SnowballStemmer(language)
#     sent_snowball = snow_ball.stem(sent)
#     return sent_snowball

# data = ['kites', 'babies', 'dogs', 'flying', 'smiling', 'driving', 'died', 'tried', 'feet']

# df = pd.DataFrame({'original_word': data, 'stem_word': [snow_ball_stemmer(word) for word in data]})

# print(df)

**Stemming using Snowball Stemmer with spacy**

In [22]:
import spacy
nlp = spacy.load('en_core_web_sm')

def snow_ball_stemmer_spacy(sent):
    doc = nlp(sent)
    snow_ball = SnowballStemmer('english')
    snow_ball_stem_sent = ' '.join([snow_ball.stem(token.text) for token in doc])
    return snow_ball_stem_sent

data_sent = ['babies','loved','contributed','history', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for b in data_sent:
    print(snow_ball_stemmer_spacy(b))

babi
love
contribut
histori
stori
stem
go
rune
cri
gone


## Lemmatization

**many techinique of lemmatization**
   - WordNet Lemmatizer
   - Spacy Lemmatization
   - NLTK Lemmatization
   - Stanford Lemmatizer etc

**wordnet lemma**

In [23]:
from nltk.stem import WordNetLemmatizer

def word_lemma(word):
    wordnet = WordNetLemmatizer()
    sent_lemma = wordnet.lemmatize(word)
    return sent_lemma
data = ['babies','countries','loved','contributed','histories', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']

final_result = [word_lemma(word) for word in data]
for result in final_result:
    print(word_lemma(result))

baby
country
loved
contributed
history
story
stemming
going
runing
cried
gone


In [24]:
from nltk.stem import WordNetLemmatizer

def word_lemma(sentence):
    wordnet = WordNetLemmatizer()
    words = sentence.split()
    sent_lemma = ' '.join([wordnet.lemmatize(word) for word in words])
    return sent_lemma

norm_data = ['glasses','babies','countries','histories', 'stories', 'words', 'goes', 'runs', 'carries', 'goals']
for c in norm_data:
    print(word_lemma(c))

glass
baby
country
history
story
word
go
run
carry
goal


**Lemma using spacy**

In [25]:
import spacy
nlp = spacy.load('en_core_web_sm')
def lemma_using_spacy(sent):
    doc = nlp(sent)
    lemma_sent = ' '.join([token.lemma_ for token in doc])
    return lemma_sent

sentences = ['babies','countries','loved','contributed','histories', 'stories', 'stemming', 'going', 'runing', 'cried', 'gone']
for i in sentences:
    print(lemma_using_spacy(i))

baby
country
love
contribute
history
story
stem
go
run
cry
go


**Lemma using textbolo**

In [26]:
from textblob import TextBlob
def lemma_using_blob(text):
    blob_obj = TextBlob(text)
    blob_lemma_text = ' '.join([i.lemmatize() for i in blob_obj.words])
    return blob_lemma_text

sent = ['babies','countries','loves','contributed','histories', 'stories', 'stemming', 'going', 'runing', 'carries', 'gone']
for i in sent:
    print(lemma_using_blob(i))

baby
country
love
contributed
history
story
stemming
going
runing
carry
gone


**Lemma using gensim, simple_preprocess and spacy**

In [41]:
from gensim.utils import simple_preprocess
import spacy
nlp = spacy.load('en_core_web_sm')
def gen_lemma(text):
    doc = nlp(text)
    gensim_word_lemma = simple_preprocess(text)
    gensim_word_lemma = ' '.join([token.lemma_ for token in doc])
    return gensim_word_lemma

sent = ['babies','countries','loves','contributed','histories', 'stories', 'stemming', 'going', 'runing', 'carries', 'gone']
for i in sent:
    print(gen_lemma(i))

baby
country
love
contribute
history
story
stem
go
run
carry
go
