# Processing Text

In [55]:
#Upgrade dependencies
!pip install --upgrade pip
!pip install --upgrade scikit-learn
!pip install --upgrade sagemaker



## 1. <a name="1">Working with simple text-cleaning processes</a>

In [56]:
# Deliberately messy sample sentence: it has leading/trailing whitespace,
# mixed case, an HTML tag, punctuation, and runs of adjacent spaces.
text = (
    "   This is a message to be cleaned. It might involve some things like: <br>, ?, :, ''  adjacent spaces, and tabs     .  "
)
print(text)

   This is a message to be cleaned. It might involve some things like: <br>, ?, :, ''  adjacent spaces, and tabs     .  


In [57]:
# Lowercase the whole string so that later comparisons are case-insensitive
# (the stopword list used further down is all lowercase).
text = text.lower()
print(text)

   this is a message to be cleaned. it might involve some things like: <br>, ?, :, ''  adjacent spaces, and tabs     .  


In [58]:
# Remove leading and trailing whitespace only; runs of spaces inside the
# sentence are left untouched by strip().
text = text.strip()
print(text)

this is a message to be cleaned. it might involve some things like: <br>, ?, :, ''  adjacent spaces, and tabs     .


In [59]:
import re

# Delete HTML-like tags: a '<', the shortest possible run of characters
# (non-greedy '.*?'), then the closing '>'.
text = re.sub('<.*?>', '', text)
print(text)

this is a message to be cleaned. it might involve some things like: , ?, :, ''  adjacent spaces, and tabs     .


In [60]:
import re
import string

# Character class matching any single character from string.punctuation;
# re.escape keeps characters that are special inside a regex literal.
punctuation_pattern = '[%s]' % re.escape(string.punctuation)

# Replace each punctuation mark with a space (rather than deleting it) so that
# words separated only by punctuation do not get glued together.
text = re.sub(punctuation_pattern, ' ', text)
print(text)

this is a message to be cleaned  it might involve some things like              adjacent spaces  and tabs      


In [61]:
import re

# Collapse every run of whitespace characters into a single space.
# The pattern is written as a raw string: '\s' inside a normal string literal
# is an invalid escape sequence (DeprecationWarning since Python 3.6,
# SyntaxWarning from Python 3.12).
text = re.sub(r'\s+', ' ', text)
print(text)

this is a message to be cleaned it might involve some things like adjacent spaces and tabs 


## 2. Working with lexicon-based text processing

In [62]:
import nltk

# Resources used by the cells below:
#   punkt / punkt_tab           - tokenizer models behind word_tokenize
#   averaged_perceptron_tagger  - part-of-speech tagger model
#   wordnet                     - lexical database used by WordNetLemmatizer
# Recent NLTK releases (3.9+) load 'punkt_tab' instead of the pickled 'punkt'
# model, so both are fetched to keep word_tokenize working on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Stopword removal

In [63]:
# The tokenizer comes from the NLTK library
import nltk
from nltk.tokenize import word_tokenize

# Words to drop from the sentence; adjust this list for your problem
stopwords = ["a", "an", "the", "this", "that", "is", "it", "to", "and"]

# Split the sentence into tokens and keep only those that are not stopwords
words = word_tokenize(text)
filtered_sentence = [token for token in words if token not in stopwords]

# Rebuild the sentence from the surviving tokens
text = " ".join(filtered_sentence)

In [64]:
# Show the current value of `text`
print(text)

message be cleaned might involve some things like adjacent spaces tabs


#### Stemming words

In [65]:
# Tokenizer and stemmer both come from the NLTK library
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# Snowball stemmer configured for English
snow = SnowballStemmer('english')

# Tokenize the sentence, then reduce every token to its stem
words = word_tokenize(text)
stemmed_sentence = [snow.stem(token) for token in words]

# Join the stems back into a single space-separated string
stemmed_text = " ".join(stemmed_sentence)

In [66]:
# Show the stemmed sentence; stems are not necessarily dictionary words
print(stemmed_text)

messag be clean might involv some thing like adjac space tab


#### Lemmatizing words

In [67]:
# NLTK pieces needed for lemmatization
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the additional NLTK resources used in this section
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

# Lemmatizer instance used for every token below
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map an NLTK part-of-speech tag to the matching WordNet POS constant.

    The decision is made on the tag's first letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.

    Full tag list:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns
    return wordnet.NOUN

# Tokenize the sentence
words = word_tokenize(text)
# Tag every token with its part of speech; the result is a list of
# (word, tag) pairs
word_pos_tags = nltk.pos_tag(words)
# Lemmatize each word, passing the WordNet category derived from its tag so
# the lemmatizer treats it as the right part of speech
lemmatized_sentence = [
    wl.lemmatize(word, get_wordnet_pos(pos_tag))
    for word, pos_tag in word_pos_tags
]

lemmatized_text = " ".join(lemmatized_sentence)

[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [68]:
# Show the lemmatized sentence
print(lemmatized_text)

message be clean might involve some thing like adjacent space tabs
