In [1]:
import numpy as np
import string
import re

In [2]:
import nltk
#nltk.download('gutenberg')
#nltk.download('punkt')
nltk.download('europarl_raw')
from nltk.corpus import gutenberg
from pprint import pprint

[nltk_data] Downloading package europarl_raw to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package europarl_raw is already up-to-date!


In [3]:
alice = gutenberg.raw(fileids='carroll-alice.txt')
sample_text = '''We will discuss briefly about the basic syntax, structure and 
design philosophies. There is a defined hierarchical syntax for Python code 
which you should remember when writing code! Python is a really powerful 
programming language!'''

In [4]:
print(len(alice))

144395


In [5]:
print(alice[0:100])

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


## Tokenization

Default sentence tokenizer

In [6]:
# Alias NLTK's sent_tokenize as the default sentence tokenizer for this notebook.
default_st =nltk.sent_tokenize
# Sentence-split both the Alice text and the short sample paragraph.
alice_sentences = default_st(text= alice)
sample_sentences = default_st(text= sample_text)

In [7]:
alice_sentences

["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.",
 "Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'",
 'So she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.',
 "There was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!',
 "I shall be late!'",
 '(when she thought it over afterwards, it\noccurred to her that she ought to have wondered at this, but at the 

In [8]:
sample_text

'We will discuss briefly about the basic syntax, structure and \ndesign philosophies. There is a defined hierarchical syntax for Python code \nwhich you should remember when writing code! Python is a really powerful \nprogramming language!'

Sentence tokenization for other languages

In [9]:
# Europarl raw corpus reader (the package is downloaded via nltk.download('europarl_raw')).
from nltk.corpus import europarl_raw

# Raw text of a single German Europarl file, selected by its file id.
german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')
# Total number of characters in the loaded file
print(len(german_text))
# First 100 characters of the loaded file
print(german_text[0:100])

157171
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit


In [10]:
# Option 1: the default sentence tokenizer, told explicitly that the text is German.
german_sentences_def = default_st(text=german_text, language='german')

# Option 2: load the German Punkt model directly; the result is a
# PunktSentenceTokenizer instance whose tokenize() splits text into sentences.
# NOTE(review): this relies on the pickled 'punkt' resource being installed;
# newer NLTK releases may ship the models in a different format - confirm the
# path is valid for the installed version.
german_tokenizer = nltk.data.load(resource_url='tokenizers/punkt/german.pickle')
german_sentences = german_tokenizer.tokenize(german_text)

In [11]:
print(type(german_tokenizer))

<class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>


In [12]:
print(np.array(german_sentences[:5]))

[' \nWiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .'
 'Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .'
 'Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .'
 'Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .'
 'Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der Stürme , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schweigeminute zu gedenken .']


In [13]:
# Build a PunktSentenceTokenizer with no arguments (no training text supplied here).
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
# Rebinds sample_sentences with this tokenizer's result for the sample paragraph.
sample_sentences = punkt_st.tokenize(sample_text)
pprint(np.array(sample_sentences))

array(['We will discuss briefly about the basic syntax, structure and \ndesign philosophies.',
       'There is a defined hierarchical syntax for Python code \nwhich you should remember when writing code!',
       'Python is a really powerful \nprogramming language!'],
      dtype='<U100')


Using RegexpTokenizer for sentence tokenization

In [14]:
# Sentence boundary = one whitespace character (\s) that directly follows
# '.', '?' or '!' (the positive lookbehind), unless the text just before it
# matches one of the negative lookbehinds:
#   (?<!\w\.\w.)       word char, dot, word char, any char  (e.g. "i.e.")
#   (?<![A-Z][a-z]\.)  capital + lowercase letter + dot     (e.g. "Mr.")
#   (?<![A-Z]\.)       single capital letter + dot          (e.g. an initial "J.")
SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
# gaps=True: the pattern describes the separators between tokens, not the tokens.
regex_st = nltk.tokenize.RegexpTokenizer(
            pattern=SENTENCE_TOKENS_PATTERN,
            gaps=True)
# Rebinds sample_sentences with the regex-based split of the sample paragraph.
sample_sentences = regex_st.tokenize(sample_text)
print(np.array(sample_sentences)) 

['We will discuss briefly about the basic syntax, structure and \ndesign philosophies.'
 'There is a defined hierarchical syntax for Python code \nwhich you should remember when writing code!'
 'Python is a really powerful \nprogramming language!']


##  Word Tokenization

Default word tokenizer

In [15]:
# Example sentence with contractions, reused by the word-tokenizer cells below.
sentence = "The brown fox wasn't that quick and he couldn't win the race"
# Alias NLTK's word_tokenize as the default word tokenizer.
default_wt = nltk.word_tokenize
# Contractions are split into two tokens, e.g. "wasn't" -> 'was', "n't".
words = default_wt(sentence)
print(words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


Treebank word tokenizer

In [16]:
# Tokenize the same sentence with an explicit TreebankWordTokenizer instance.
treebank_wt = nltk.TreebankWordTokenizer()
# For this sentence the printed tokens match the default word tokenizer's output.
words = treebank_wt.tokenize(sentence)
print(words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


Using RegexpTokenizer for word tokenization

In [17]:
# Pattern describing the tokens themselves: runs of word characters.
TOKEN_PATTERN = r'\w+'        
# gaps=False: text matched by the pattern is returned as the tokens.
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
                                gaps=False)
# Apostrophes are not word characters, so "wasn't" comes out as 'wasn', 't'.
words = regex_wt.tokenize(sentence)
np.array(words)

array(['The', 'brown', 'fox', 'wasn', 't', 'that', 'quick', 'and', 'he',
       'couldn', 't', 'win', 'the', 'race'], dtype='<U6')

In [18]:
# Pattern describing the gaps between tokens: runs of whitespace.
GAP_PATTERN = r'\s+'        
# gaps=True: the pattern matches separators. This rebinds regex_wt, replacing
# the token-pattern tokenizer built with TOKEN_PATTERN.
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,
                                gaps=True)
# Splitting on whitespace only keeps contractions such as "wasn't" intact.
words = regex_wt.tokenize(sentence)
np.array(words)

array(['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he',
       "couldn't", 'win', 'the', 'race'], dtype='<U8')

In [19]:
# span_tokenize yields (start, end) character offsets into `sentence` instead of
# the token strings. regex_wt is whichever tokenizer was bound last (the
# whitespace-gap tokenizer when the cells are run in order).
word_indices = list(regex_wt.span_tokenize(sentence))
print(word_indices)

[(0, 3), (4, 9), (10, 13), (14, 20), (21, 25), (26, 31), (32, 35), (36, 38), (39, 47), (48, 51), (52, 55), (56, 60)]


In [20]:
print([sentence[start:end] for start, end in word_indices])

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


Derived regex tokenizers

In [21]:
# WordPunctTokenizer emits runs of word characters and runs of punctuation as
# separate tokens, so "wasn't" becomes 'wasn', "'", 't'.
wordpunkt_wt = nltk.WordPunctTokenizer()  # uses the pattern r'\w+|[^\w\s]+'
words = wordpunkt_wt.tokenize(sentence)
print(words)

['The', 'brown', 'fox', 'wasn', "'", 't', 'that', 'quick', 'and', 'he', 'couldn', "'", 't', 'win', 'the', 'race']


In [22]:
# WhitespaceTokenizer leaves punctuation attached to words ("wasn't" stays whole).
whitespace_wt = nltk.WhitespaceTokenizer()  # splits on whitespace: spaces, tabs and newlines
words = whitespace_wt.tokenize(sentence)
print(words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


## Text Normalization

In [23]:
# Three small example documents for the normalization steps: contractions,
# a currency amount with digits, and stray symbols (@, *, parentheses).
corpus = ["The brown fox wasn't that quick and he couldn't win the race", 
          "Hey that's a great deal! I just bought a phone for $199", 
          "@@You'll (learn) a **lot** in the book. Python is an amazing language  !@@"]

Cleaning text

Tokenizing text

In [24]:
def tokenize_text(text):
    """Split ``text`` into sentences, then split each sentence into word tokens.

    Sentences come from ``nltk.sent_tokenize`` and tokens from
    ``nltk.word_tokenize``.

    Returns:
        A list with one entry per sentence; each entry is that sentence's
        list of word tokens.
    """
    tokens_per_sentence = []
    for sent in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sent))
    return tokens_per_sentence

# One entry per document in `corpus`; each entry is that document's list of
# sentences, and each sentence is a list of word tokens.
token_list = [tokenize_text(text) for text in corpus]
pprint(token_list)

[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '*',
   '*',
   'lot',
   '*',
   '*',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]


Removing Special Characters

In [25]:
import re
def remove_special_characters(text, remove_digits=True):
    """Remove every character that is not an ASCII letter or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When true (the default), digits are removed as well;
            when false, the digits 0-9 are kept.

    Returns:
        ``text`` with all disallowed characters deleted.
    """
    # The upper-case range must be ``A-Z``. Writing ``A-z`` also covers the
    # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then
    # be treated as letters and survive the cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("9Wel254l t#his was f1un! What do you think? 123#@!", remove_digits=False)

'9Wel254l this was f1un What do you think 123'