[Reference](https://admantium.medium.com/python-nlp-libary-nltk-5fbc6166b48a)

In [8]:
!python3 -m pip install nltk



In [9]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('reuters')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

# Tokenization

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Source: Wikipedia, Artificial Intelligence, https://en.wikipedia.org/wiki/Artificial_intelligence
paragraph = '''Artificial intelligence was founded as an academic discipline in 1956, and in the years since it has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success, and renewed funding. AI research has tried and discarded many different approaches, including simulating the brain, modeling human problem solving, formal logic, large databases of knowledge, and imitating animal behavior. In the first decades of the 21st century, highly mathematical and statistical machine learning has dominated the field, and this technique has proved highly successful, helping to solve many challenging problems throughout industry and academia.'''
sentences = []

for sent in sent_tokenize(paragraph):
  sentences.append(word_tokenize(sent))

sentences[0]

['Artificial',
 'intelligence',
 'was',
 'founded',
 'as',
 'an',
 'academic',
 'discipline',
 'in',
 '1956',
 ',',
 'and',
 'in',
 'the',
 'years',
 'since',
 'it',
 'has',
 'experienced',
 'several',
 'waves',
 'of',
 'optimism',
 ',',
 'followed',
 'by',
 'disappointment',
 'and',
 'the',
 'loss',
 'of',
 'funding',
 '(',
 'known',
 'as',
 'an',
 '``',
 'AI',
 'winter',
 "''",
 ')',
 ',',
 'followed',
 'by',
 'new',
 'approaches',
 ',',
 'success',
 ',',
 'and',
 'renewed',
 'funding',
 '.']

# Stemming and Lemmatization

In [11]:
from nltk.stem import LancasterStemmer

sent = 'Artificial intelligence was founded as an academic discipline in 1956, and in the years since it has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success, and renewed funding.'
stemmer = LancasterStemmer()
stemmed_sent = [stemmer.stem(word) for word in word_tokenize(sent)]

print(stemmed_sent)

['art', 'intellig', 'was', 'found', 'as', 'an', 'academ', 'disciplin', 'in', '1956', ',', 'and', 'in', 'the', 'year', 'sint', 'it', 'has', 'expery', 'sev', 'wav', 'of', 'optim', ',', 'follow', 'by', 'disappoint', 'and', 'the', 'loss', 'of', 'fund', '(', 'known', 'as', 'an', '``', 'ai', 'wint', "''", ')', ',', 'follow', 'by', 'new', 'approach', ',', 'success', ',', 'and', 'renew', 'fund', '.']


In [12]:
from nltk.stem import WordNetLemmatizer

sent = 'Artificial intelligence was founded as an academic discipline in 1956, and in the years since it has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success, and renewed funding.'
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(sent)]

print(lemmas)

['Artificial', 'intelligence', 'wa', 'founded', 'a', 'an', 'academic', 'discipline', 'in', '1956', ',', 'and', 'in', 'the', 'year', 'since', 'it', 'ha', 'experienced', 'several', 'wave', 'of', 'optimism', ',', 'followed', 'by', 'disappointment', 'and', 'the', 'loss', 'of', 'funding', '(', 'known', 'a', 'an', '``', 'AI', 'winter', "''", ')', ',', 'followed', 'by', 'new', 'approach', ',', 'success', ',', 'and', 'renewed', 'funding', '.']


# Part-of-Speech Tagging

In [13]:
from nltk import pos_tag

sent = 'Artificial intelligence was founded as an academic discipline in 1956, and in the years since it has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success, and renewed funding.'
pos_tag(sentences[0])

[('Artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('was', 'VBD'),
 ('founded', 'VBN'),
 ('as', 'IN'),
 ('an', 'DT'),
 ('academic', 'JJ'),
 ('discipline', 'NN'),
 ('in', 'IN'),
 ('1956', 'CD'),
 (',', ','),
 ('and', 'CC'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('years', 'NNS'),
 ('since', 'IN'),
 ('it', 'PRP'),
 ('has', 'VBZ'),
 ('experienced', 'VBN'),
 ('several', 'JJ'),
 ('waves', 'NNS'),
 ('of', 'IN'),
 ('optimism', 'NN'),
 (',', ','),
 ('followed', 'VBN'),
 ('by', 'IN'),
 ('disappointment', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('loss', 'NN'),
 ('of', 'IN'),
 ('funding', 'NN'),
 ('(', '('),
 ('known', 'VBN'),
 ('as', 'IN'),
 ('an', 'DT'),
 ('``', '``'),
 ('AI', 'NNP'),
 ('winter', 'NN'),
 ("''", "''"),
 (')', ')'),
 (',', ','),
 ('followed', 'VBN'),
 ('by', 'IN'),
 ('new', 'JJ'),
 ('approaches', 'NNS'),
 (',', ','),
 ('success', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('renewed', 'VBN'),
 ('funding', 'NN'),
 ('.', '.')]

# Named Entity Recognition

In [14]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [15]:
from nltk.tokenize import sent_tokenize

# Source: Wikipedia, Artificial Intelligence, https://en.wikipedia.org/wiki/Artificial_intelligence
sentence= '''
In 2011, in a Jeopardy! quiz show exhibition match, IBM's question answering system, Watson, defeated the two greatest Jeopardy! champions, Brad Rutter and Ken Jennings, by a significant margin.
'''

tagged_sentence = nltk.pos_tag(word_tokenize(sentence))
tagged_sentence

[('In', 'IN'),
 ('2011', 'CD'),
 (',', ','),
 ('in', 'IN'),
 ('a', 'DT'),
 ('Jeopardy', 'NN'),
 ('!', '.'),
 ('quiz', 'NN'),
 ('show', 'NN'),
 ('exhibition', 'NN'),
 ('match', 'NN'),
 (',', ','),
 ('IBM', 'NNP'),
 ("'s", 'POS'),
 ('question', 'NN'),
 ('answering', 'NN'),
 ('system', 'NN'),
 (',', ','),
 ('Watson', 'NNP'),
 (',', ','),
 ('defeated', 'VBD'),
 ('the', 'DT'),
 ('two', 'CD'),
 ('greatest', 'JJS'),
 ('Jeopardy', 'NN'),
 ('!', '.'),
 ('champions', 'NNS'),
 (',', ','),
 ('Brad', 'NNP'),
 ('Rutter', 'NNP'),
 ('and', 'CC'),
 ('Ken', 'NNP'),
 ('Jennings', 'NNP'),
 (',', ','),
 ('by', 'IN'),
 ('a', 'DT'),
 ('significant', 'JJ'),
 ('margin', 'NN'),
 ('.', '.')]

In [16]:
print(nltk.ne_chunk(tagged_sentence))

(S
  In/IN
  2011/CD
  ,/,
  in/IN
  a/DT
  Jeopardy/NN
  !/.
  quiz/NN
  show/NN
  exhibition/NN
  match/NN
  ,/,
  (ORGANIZATION IBM/NNP)
  's/POS
  question/NN
  answering/NN
  system/NN
  ,/,
  (PERSON Watson/NNP)
  ,/,
  defeated/VBD
  the/DT
  two/CD
  greatest/JJS
  Jeopardy/NN
  !/.
  champions/NNS
  ,/,
  (PERSON Brad/NNP Rutter/NNP)
  and/CC
  (PERSON Ken/NNP Jennings/NNP)
  ,/,
  by/IN
  a/DT
  significant/JJ
  margin/NN
  ./.)


# Datasets

In [17]:
from nltk.corpus import reuters

print(reuters.categories()[:10])
#['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']

print(reuters.fileids()[:10])
# ['test/14826', 'test/14828', 'test/14829', 'test/14832', 'test/14833', 'test/14839', 'test/14840', 'test/14841', 'test/14842', 'test/14843']

sample = 'test/14829'
categories = reuters.categories(sample)

print(categories)
# ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']

content = ""
with reuters.open(sample) as stream:
    content = stream.read()

print(f"Categories #{categories} / file #{sample}")
# Categories #['crude', 'nat-gas'] / file #test/14829

print(f"Content:\#{content}")

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']
['test/14826', 'test/14828', 'test/14829', 'test/14832', 'test/14833', 'test/14839', 'test/14840', 'test/14841', 'test/14842', 'test/14843']
['crude', 'nat-gas']
Categories #['crude', 'nat-gas'] / file #test/14829
Content:\#JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through d

# Corpus Management

In [19]:
from  nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpus = PlaintextCorpusReader('wikipedia_articles', r'.*\.txt')

print(corpus.fileids())
print(corpus.sents())

In [20]:
from  nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import TextCollection

corpus = PlaintextCorpusReader('wikipedia_articles', r'.*\.txt')
col = TextCollection(corpus.sents())
print(col.count('the'))
# 973
print(col.common_contexts(['intelligence']))

In [21]:
from  nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpus = PlaintextCorpusReader('wikipedia_articles', r'.*\.txt')

vocab = nltk.FreqDist(w.lower() for w in corpus.words())
#  FreqDist({'the': 65590, ',': 63310, '.': 52247, 'of': 39000, 'and': 30868, 'a': 30130, 'to': 27881, 'in': 24501, '-': 19867, '(': 18243, ...})
all_words = nltk.FreqDist(w.lower() for w in corpus.words())
word_features = list(all_words)