# PoS tagging

In [14]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [17]:
# Download NLTK's "popular" data collection into the local nltk_data directory;
# the log below lists each package as it is fetched and unzipped.
# NOTE(review): presumably this provides the tokenizer, tagger, chunker and
# WordNet data the later cells rely on -- confirm against the collection contents.
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/ynjn/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /home/y

True

In [18]:
# Example sentence; its tagged form is reused by the chunking and chinking cells.
sentence = "A very beautiful young lady is walking on the beach"
# Split the sentence into individual word tokens.
tokenized_words = word_tokenize(sentence)

# Pair every token with its part-of-speech tag, giving a list of (word, tag) tuples.
tagged_words = nltk.pos_tag(tokenized_words)
# Bare expression so the tagged list is shown as the cell output.
tagged_words

[('A', 'DT'),
 ('very', 'RB'),
 ('beautiful', 'JJ'),
 ('young', 'JJ'),
 ('lady', 'NN'),
 ('is', 'VBZ'),
 ('walking', 'VBG'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('beach', 'NN')]

# Extracting noun phrases from text

In [19]:
# Chunk grammar: a noun phrase (NP) is an optional determiner, followed by any
# number of adjectives, followed by a singular noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"  # DT: determiner  JJ: adjective  NN: singular noun

# Build a regular-expression chunk parser from the grammar.
parser = nltk.RegexpParser(grammar)

# Print the chunk tree for the tagged sentence; matched spans appear as (NP ...).
print(parser.parse(tagged_words))

(S
  A/DT
  very/RB
  (NP beautiful/JJ young/JJ lady/NN)
  is/VBZ
  walking/VBG
  on/IN
  (NP the/DT beach/NN))


# Chinking

In [22]:
# Chinking: chunk everything first, then carve the unwanted tags back out.
#   {<.*>+}   chunk rule: put every token into a single NP chunk
#   }<JJ>+{   chink rule: remove runs of adjectives (JJ) from that chunk
grammar = r""" NP: {<.*>+}
                }<JJ>+{"""
parser = nltk.RegexpParser(grammar)
# The adjectives end up outside any chunk, which splits the NP into two pieces.
print(parser.parse(tagged_words))

(S
  (NP A/DT very/RB)
  beautiful/JJ
  young/JJ
  (NP lady/NN is/VBZ walking/VBG on/IN the/DT beach/NN))


# Named Entity Recognition

In [23]:
# New example sentence mentioning a person, a place and an organization.
# NOTE: this rebinds `sentence`, `tokenized_words` and `tagged_words`, so the
# chunking cells above will see this sentence if they are re-run afterwards.
sentence = "Mr. Smith made a deal on a beach of Switzerland near WHO."
tokenized_words = word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokenized_words)

# Named-entity chunking works on the POS-tagged tokens. With binary=False the
# resulting tree carries typed labels (PERSON, GPE, ORGANIZATION in the output).
# NOTE(review): presumably binary=True would collapse these into one generic
# entity label -- confirm in the NLTK documentation.
ner = nltk.ne_chunk(tagged_words,binary=False)
print(ner)

(S
  (PERSON Mr./NNP)
  (PERSON Smith/NNP)
  made/VBD
  a/DT
  deal/NN
  on/IN
  a/DT
  beach/NN
  of/IN
  (GPE Switzerland/NNP)
  near/IN
  (ORGANIZATION WHO/NNP)
  ./.)


# WordNet

In [24]:
from nltk.corpus import wordnet

# Look up every WordNet synset (word sense) recorded for "Fun" and print each one.
fun_synsets = wordnet.synsets("Fun")
for synset in fun_synsets:
    print(synset)

Synset('fun.n.01')
Synset('fun.n.02')
Synset('fun.n.03')
Synset('playfulness.n.02')


In [28]:
# For each sense of "Fun", print its name, definition, usage examples and
# lemmas, then a 50-character rule to separate it from the next sense.
separator = "=" * 50
for synset in wordnet.synsets("Fun"):
    print(synset.name())
    print(synset.definition())
    print(synset.examples())

    for lemma in synset.lemmas():
        print(lemma)
    print(separator)

fun.n.01
activities that are enjoyable or amusing
['I do it for the fun of it', 'he is fun to have around']
Lemma('fun.n.01.fun')
Lemma('fun.n.01.merriment')
Lemma('fun.n.01.playfulness')
fun.n.02
verbal wit or mockery (often at another's expense but not to be taken seriously)
['he became a figure of fun', 'he said it in sport']
Lemma('fun.n.02.fun')
Lemma('fun.n.02.play')
Lemma('fun.n.02.sport')
fun.n.03
violent and excited activity
['she asked for money and then the fun began', 'they began to fight like fun']
Lemma('fun.n.03.fun')
playfulness.n.02
a disposition to find (or make) causes for amusement
['her playfulness surprised me', 'he was fun to be with']
Lemma('playfulness.n.02.playfulness')
Lemma('playfulness.n.02.fun')


# Bag of Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Toy corpus for the bag-of-words demo: three short documents.
# Punctuation is left as-is here; the vocabulary printed further down shows it
# does not end up in the extracted terms (e.g. 'bus', 'full', 'travelling').
sentences = [
    "Jim and Pam travelled by the bus:",
    "The train was late",
    "The flight was full.Travelling by flight is expensive",
]

In [9]:
# Create a CountVectorizer with its default settings.
cv = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per sentence;
# .toarray() turns the result into the plain array printed in a later cell.
bow = cv.fit_transform(sentences).toarray()

In [10]:
# Learned vocabulary: maps each lower-cased term to its column index in `bow`.
print(cv.vocabulary_)

{'jim': 7, 'and': 0, 'pam': 9, 'travelled': 12, 'by': 2, 'the': 10, 'bus': 1, 'train': 11, 'was': 14, 'late': 8, 'flight': 4, 'full': 5, 'travelling': 13, 'is': 6, 'expensive': 3}


In [11]:
# Feature (term) names, ordered by their column index in the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    # .tolist() turns the NumPy array into plain str items so the printed list
    # looks the same as it did with the old API.
    bow_feature_names = cv.get_feature_names_out().tolist()
else:
    bow_feature_names = cv.get_feature_names()
print(bow_feature_names)

['and', 'bus', 'by', 'expensive', 'flight', 'full', 'is', 'jim', 'late', 'pam', 'the', 'train', 'travelled', 'travelling', 'was']


In [12]:
print(bow)

[[1 1 1 0 0 0 0 1 0 1 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 1 0 0 1]
 [0 0 1 1 2 1 1 0 0 0 1 0 0 1 1]]


# TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Sentences to analyze: two short documents that share most of their words.
sentences = [
    'This is the first document',
    'This document is the second document',
]

In [3]:
# Create the vectorizer with normalization switched off (norm=None) so the
# printed scores are the raw TF-IDF products rather than unit-length rows.
vectorizer = TfidfVectorizer(norm=None) # norm may be 'l1' or 'l2'; the default is 'l2'
# Learn the vocabulary and compute the TF-IDF matrix; .toarray() turns the
# result into the plain array printed in a later cell.
X = vectorizer.fit_transform(sentences).toarray()

In [4]:
# Learned vocabulary: maps each lower-cased term to its column index in `X`.
print(vectorizer.vocabulary_)

{'this': 5, 'is': 2, 'the': 4, 'first': 1, 'document': 0, 'second': 3}


In [5]:
# Feature (term) names, ordered by their column index in the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() turns the NumPy array into plain str items so the printed list
    # looks the same as it did with the old API.
    tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = vectorizer.get_feature_names()
print(tfidf_feature_names)

['document', 'first', 'is', 'second', 'the', 'this']


In [6]:
# The TF-IDF matrix: one row per sentence, one column per term.
# A term scores higher the more often it occurs in a sentence and the fewer
# sentences contain it: 'first' and 'second' (one sentence each) score 1.405,
# while 'document' scores 2.0 in the second row because it occurs twice there.
print(X)

[[1.         1.40546511 1.         0.         1.         1.        ]
 [2.         0.         1.         1.40546511 1.         1.        ]]
