# Natural Language Processing with Python 

## Tokenizing

In [4]:
from nltk import sent_tokenize, word_tokenize

In [2]:
example_text = "Hello Mr. Smith, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue"

In [5]:
print(sent_tokenize(example_text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great and python is awesome.', 'The sky is pinkish-blue']


In [6]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue']


## Stop-Words

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [8]:
example_sentence = "This is an example showing off stop word filtration."

In [9]:
stop_words = set(stopwords.words("english"))

In [10]:
words = word_tokenize(example_sentence)

In [11]:
# Keep every token that is not in the stop-word set. Set membership is an
# exact string comparison, so a token only counts as a stop word when it
# matches an entry character for character.
filtered_sentence = [token for token in words if token not in stop_words]

In [12]:
print(filtered_sentence)

['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']


## Stemming

In [13]:
from nltk.stem import PorterStemmer

In [14]:
ps = PorterStemmer()

In [15]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [16]:
# Print the Porter stem of each example word, one per line.
for stem in map(ps.stem, example_words):
    print(stem)

python
python
python
python
pythonli


In [17]:
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned badly at least once"

In [18]:
words = word_tokenize(new_text)

In [19]:
# Print the Porter stem of every token from the sentence tokenized above.
for stem in map(ps.stem, words):
    print(stem)

It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
All
python
have
python
badli
at
least
onc


## Part of Speech Tagging

In [20]:
from nltk.corpus import state_union

In [21]:
from nltk.tokenize import PunktSentenceTokenizer # unsupervised ML tokenizer

In [25]:
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")

In [26]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [27]:
# Split the 2006 address (test_text) into sentences with the tokenizer that
# was built from the 2005 address. The original referenced `sample_text`,
# which is never defined in this notebook.
tokenized = custom_sent_tokenizer.tokenize(test_text)

In [28]:
def process_content():
    """Print the part-of-speech tagging of every sentence in ``tokenized``.

    Reads the module-level ``tokenized`` collection, splits each sentence
    into word tokens, tags the tokens with ``nltk.pos_tag`` and prints the
    tagged result for that sentence.

    Any exception raised while tokenizing or tagging is caught and its
    message is printed instead of propagating; because the ``try`` wraps the
    whole loop, the first failure also ends the iteration.
    """
    # The cells above import individual names from ``nltk`` but never bind
    # the package name itself, so ``nltk.word_tokenize`` / ``nltk.pos_tag``
    # would raise NameError. Import exactly what is needed here.
    from nltk import pos_tag, word_tokenize

    try:
        for sentence in tokenized:
            tokens = word_tokenize(sentence)
            tagged = pos_tag(tokens)

            print(tagged)
    except Exception as e:
        # Best-effort demo cell: report the problem rather than raising.
        print(str(e))

In [30]:
#process_content()