<a href="https://colab.research.google.com/gist/1UC1F3R616/87126d413efcb6d4097f48bfb5f77b9d/nlpcourse1-5hours.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Tokenize
- word
- sentence





In [0]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [0]:
dataset = 'Hi My dataset is created by Me. Lol I know'
dataset2 = """Hello Mr. Watson, how are you doing today?
             The weather is awsome. The garden is green.
             We should go out for a walk."""

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
print(sent_tokenize(dataset))
print(sent_tokenize(dataset2))

['Hi My dataset is created by Me.', 'Lol I know']
['Hello Mr. Watson, how are you doing today?', 'The weather is awsome.', 'The garden is green.', 'We should go out for a walk.']


In [0]:
print(word_tokenize(dataset))
print(word_tokenize(dataset2))

## Stop Words
- search engines are programmed to ignore the Stop Words

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [0]:
nltk.download('stopwords')

In [0]:
stop_words = set(stopwords.words('english'))
print(stop_words)

In [0]:
words = word_tokenize(dataset)
filtered_words = []
for word in word_tokenize(dataset):
  if word not in stop_words:
    filtered_words.append(word)

print(filtered_words)

['Hi', 'My', 'dataset', 'created', 'Me', '.', 'Lol', 'I', 'know']


## Stemming -
The process of removing prefixes, suffixes from the words and reduce them to their stem form.

For example, the word "computation" might be stemmed to "comput"

- Porter Stemming - The most common algorithm used for stemming in English,
It consists of several phases of word reductions that are applied sequentially.

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

In [0]:
dataset3 = ['love', 'lover', 'loving', 'loved', 'lovingly']

In [0]:
for w in dataset3:
  print(ps.stem(w))

## Tagging
- Automatic assignment of descriptors to the given tokens is called Tagging
- Tagging is a kind of classification

### POS (Parts-Of-Speech) Tagging
- The process of assigning one of the parts of a speech to the given word is POS tagging [noun, pronoun, verb, adverb, preposition, conjective, adjective, interjection] 
- eg. word:paper, tag:noun

- POS tagger is a program that does the job of POS tagging
- Taggers use several kinds of information: dictionaries, lexicons, rules
There are mainly two types of taggers rule-based and stochastic
  - Rule-bases tagger:
     - Hand-writtem rules to distinguish
  - Stochastic taggers
    - HMM based
    - likelihood of the word, tag sequence probability
    - decision trees and maximum entropy

In [0]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [0]:
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

In [12]:
pos_tag(word_tokenize(dataset))

[('Hi', 'NNP'),
 ('My', 'NNP'),
 ('dataset', 'NN'),
 ('is', 'VBZ'),
 ('created', 'VBN'),
 ('by', 'IN'),
 ('Me', 'NNP'),
 ('.', '.'),
 ('Lol', 'NNP'),
 ('I', 'PRP'),
 ('know', 'VBP')]

In [0]:
nltk.help.upenn_tagset()

## Chunking
- like we don't memorise a phone nnumber as seprate individual numbers, we group them together to memorise easily
- chunking is grouping of information

In [0]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

In [0]:
tokenized_data = word_tokenize(dataset)
pos_tagging = pos_tag(tokenized_data)

In [0]:
chunk_sequence = """
chunk:
{<NNPS>+}
{<NNP>+}
{<NN>}"""

In [0]:
chunk = RegexpParser(chunk_sequence)

In [21]:
chunked_data = chunk.parse(pos_tagging)
print(chunked_data)

(S
  (chunk Hi/NNP My/NNP)
  (chunk dataset/NN)
  is/VBZ
  created/VBN
  by/IN
  (chunk Me/NNP)
  ./.
  (chunk Lol/NNP)
  I/PRP
  know/VBP)


## Named Entity Recognition
- Also known as
  - Entity Identification
  - Entity Chunking
  - Entity Extraction
- It is a subtask of information extraction that classify named entities into pre-defined categories such as names of persons, organizations, locations
- Tesla: Organization, Elon Musk: Person

### Applications
- classify the contents to news providers
- Efficent search Algorithms
- Content recommendation
- Question and Answer systems
- Automatic Forwarding
- Online document searching

In [0]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [0]:
tagged_data = pos_tag(word_tokenize(dataset))

In [0]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [30]:
# Applying Named Entity Recognization with ne_chunk
ne_data = ne_chunk(tagged_data)
print(ne_data)

(S
  Hi/NNP
  My/NNP
  dataset/NN
  is/VBZ
  created/VBN
  by/IN
  Me/NNP
  ./.
  Lol/NNP
  I/PRP
  know/VBP)


In [0]:
# won't work on GC
# ne_data.draw()

## Lemmatization
- Process of converting the words of a sentence into it's dictionary form.
word: Feet, Lemma: Foot
  - we get a meaningful name
  
## Stemming
- Process of converting the words of a sentence to it's non-changing portions
  - we may or may not get a meaningful name

In [0]:
from nltk.stem import WordNetLemmatizer

In [0]:
wnl = WordNetLemmatizer()

In [35]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [36]:
words = ['dogs', 'cars', 'feet', 'people']
for word in words:
  print(wnl.lemmatize(word))

print(wnl.lemmatize('fantasized', 'v'))

dog
car
foot
people
fantasize


## Corpus
- Large collection of text
- spoken material on which liguistic analysis is based

In [42]:
import nltk

nltk.download('state_union')
from nltk.corpus import state_union

dataset = state_union.raw('2001-GWBush-1.txt')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Package state_union is already up-to-date!


## Wordnet
- Lexical database in english language
- It group english words in antonyms and synonyms
- Also provides short examples and words


In [0]:
from nltk.corpus import wordnet

In [44]:
syns = wordnet.synsets('program')
print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [49]:
print(syns[0].lemmas())
print(syns[0].lemmas()[0].name()) # Getting the name of it
print(syns[0].lemmas()[1].name())

[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]
plan
program


In [48]:
# Getting the meaning of that word
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [52]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [55]:
# Antonyms
antonyms = []
synonyms = []

for syn in wordnet.synsets('good'):
  for l in syn.lemmas():
    synonyms.append(l.name())
    if l.antonyms():
      antonyms.append(l.antonyms()[0].name())

print(antonyms)
print(synonyms)

['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']
['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good']
