In [1]:
# Silence every warning for the rest of the session. Note that this is a blanket
# filter: it also hides any deprecation notices raised by later cells.
import warnings
warnings.filterwarnings('ignore')
import nltk # Imports the library
# First run only: uncomment the next line to download the necessary datasets.
#nltk.download()

I. Preprocessing
=========

The objective is to **clean** and **standardize** your input data so that it can be manipulated easily afterward.

1. Tokenization
---------------

Split your input into **tokens**, according to specific rules.

In [2]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import RegexpTokenizer
import nltk.data

**Word tokenizer**

In [3]:
# Name the input first, then tokenize it; punctuation and the "'ll" contraction
# come out as separate tokens (see the output below).
welcome_message = "Welcome to the Data For Good NLP workshops! Hope you'll enjoy it :)"
word_tokenize(welcome_message)

['Welcome',
 'to',
 'the',
 'Data',
 'For',
 'Good',
 'NLP',
 'workshops',
 '!',
 'Hope',
 'you',
 "'ll",
 'enjoy',
 'it',
 ':',
 ')']

**Tweet tokenizer**

In [4]:
# Sample tweet mixing a hashtag, emoticons and ASCII arrows.
tweet = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"

# Default settings: the hashtag, the emoticons and the arrows each stay in one piece.
tokenizer = TweetTokenizer()
tokenizer.tokenize(tweet)

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [5]:
excited_tweet = '@BZU: This workshop is sooooooooo cool!!!!!!'

# reduce_len shortens long runs of a repeated character and strip_handles drops
# the leading @handle, as the output below shows.
tokenizer = TweetTokenizer(
    strip_handles=True,
    reduce_len=True,
)
tokenizer.tokenize(excited_tweet)

[':', 'This', 'workshop', 'is', 'sooo', 'cool', '!', '!', '!']

**Multi-word Expression tokenizer**

In [6]:
# Multi-word expressions are declared as tuples of tokens, either when the
# tokenizer is built or afterwards through add_mwe(); a matching run of tokens
# is merged into a single token joined by underscores.
tokenizer = MWETokenizer([('Data', 'For', 'Good'), ('natural', 'language', 'processing')])
tokenizer.add_mwe(('wednesday', 'evening'))
# The sentence must actually contain 'wednesday evening'; otherwise the
# expression added above never matches and add_mwe() appears to do nothing.
tokenizer.tokenize('BZU is hosting Data For Good natural language processing workshops on wednesday evening'.split())

['BZU',
 'is',
 'hosting',
 'Data_For_Good',
 'natural_language_processing',
 'workshops',
 'on',
 'wednesday_evening']

**Sentence tokenizer**

In [7]:
# Sample paragraph with abbreviations (Mr., S.) and a sentence that starts in lower case.
text = '''
This sentence tokenizer knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.  And sometimes sentences can start with non-capitalized 
words.  i is a good variable name.
'''

# Load the English Punkt model from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Strip the surrounding newlines before splitting, then print the sentences
# separated by a dashed line.
sentences = sent_detector.tokenize(text.strip())
separator = '\n-----\n'
print(separator.join(sentences))

This sentence tokenizer knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
-----
And sometimes sentences can start with non-capitalized 
words.
-----
i is a good variable name.


**Regexp tokenizer**

In [8]:
# Keep only the words that start with an upper-case ASCII letter followed by at
# least one word character. The pattern is a raw string so that \w reaches the
# regex engine untouched: in a plain string literal '\w' is an invalid escape
# sequence, which recent Python versions report with a warning.
tokenizer = RegexpTokenizer(r'[A-Z]\w+')
tokenizer.tokenize('BZU is hosting Data For Good natural language processing Workshops on wednesday evening')

['BZU', 'Data', 'For', 'Good', 'Workshops']

2. Stemming
-----------

Normalize each token by **reducing** it to its linguistic root or **stem**.

In [9]:
from nltk.stem import SnowballStemmer

In [10]:
sentence = '''Stemming is the term used in linguistic morphology and information retrieval to describe
the process for reducing inflected words to their word stem'''
stemmer = SnowballStemmer("english")

# Print each token next to the stem the English Snowball stemmer produces for it.
for token in word_tokenize(sentence):
    root = stemmer.stem(token)
    print(' --> '.join((token, root)))

Stemming --> stem
is --> is
the --> the
term --> term
used --> use
in --> in
linguistic --> linguist
morphology --> morpholog
and --> and
information --> inform
retrieval --> retriev
to --> to
describe --> describ
the --> the
process --> process
for --> for
reducing --> reduc
inflected --> inflect
words --> word
to --> to
their --> their
word --> word
stem --> stem


II. Features
=======

The idea is to build a representation of each token that a learning algorithm can easily understand and manipulate.

Part-Of-Speech (POS) Tagging
--------------------------

The meaning of each tag is listed in the Penn Treebank tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [11]:
from nltk import pos_tag

In [12]:
# The sentence is split into tokens first, and the token list is what gets tagged.
sentence = "this is the postagger in nltk for python users"
tokens = word_tokenize(sentence)
# Call the pos_tag name imported in the previous cell instead of reaching it
# through the nltk namespace, so that import is actually used.
pos_tag(tokens)

[('this', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('postagger', 'NN'),
 ('in', 'IN'),
 ('nltk', 'NN'),
 ('for', 'IN'),
 ('python', 'NN'),
 ('users', 'NNS')]