In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk

# Fetch the NLTK packages used by the cells below (tokenizer models, WordNet, stop words).
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
# Kept as the final bare expression so the cell still displays the call's return value.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Sentence Tokenization

In [None]:
text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."

# Split the paragraph into sentences, then show each one followed by an empty line.
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence, end="\n\n")

Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



# Word Tokenization

In [None]:
# Walk over the sentences produced above and break each one into word tokens.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    # Show the token list for this sentence, followed by an empty line.
    print(words, end="\n\n")
  

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



# Stemming and Lemmatization

## For educational purposes only

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# NOTE(review): presumably fetched because the WordNet lemmatizer used below needs the
# 'omw-1.4' data package in some NLTK releases — confirm against the installed version.
nltk.download('omw-1.4')

def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """
    Print the stemmed and the lemmatized form of a word, one per line, then a blank line.

    stemmer:    object exposing ``stem(word)``
    lemmatizer: object exposing ``lemmatize(word, pos)``
    word:       the word to normalize
    pos:        part of speech, forwarded to the lemmatizer only
    """
    stem = stemmer.stem(word)
    print("Stemmer:", stem)
    lemma = lemmatizer.lemmatize(word, pos)
    # The extra newline reproduces the blank separator line between comparisons.
    print("Lemmatizer:", lemma, end="\n\n")

# Build both normalizers once; later cells reuse `lemmatizer`.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# (word, part of speech) pairs to compare, in the order they are printed.
examples = [
    ("seen", wordnet.VERB),
    ("drove", wordnet.VERB),
    ("better", wordnet.ADJ),
    ("improvised", wordnet.VERB),
]
for example_word, example_pos in examples:
    compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word=example_word, pos=example_pos)


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Stemmer: seen
Lemmatizer: see

Stemmer: drove
Lemmatizer: drive

Stemmer: better
Lemmatizer: good

Stemmer: improvis
Lemmatizer: improvise



## Stop words

In [None]:
from nltk.corpus import stopwords
import os
# Load the English stop words as a set for the membership tests done in later cells.
# (The former os.listdir() call on a hard-coded /root/nltk_data path was dropped: its
# result was discarded and it raised FileNotFoundError wherever NLTK data lives elsewhere.)
stop_words = set(stopwords.words("english"))
print(stop_words)

{'you', "it's", 're', 'himself', 'do', 'are', 'down', 'if', "shan't", 'above', 'why', 'a', 'only', 'weren', 'ma', 'needn', 'more', 'doesn', 'through', 'mustn', 'mightn', 'had', "weren't", 'hadn', "doesn't", 'about', 'some', "couldn't", 'hasn', 'where', 'other', 'who', 'once', 'themselves', "you've", 'so', "needn't", 'now', "hadn't", 'again', "mightn't", 'will', 'up', 'couldn', 'me', 've', 'all', 'with', 'then', 'very', 'whom', 'm', "aren't", 'below', 'just', "you'd", 'few', 'has', 'no', 'how', 'should', 'won', 'of', 'them', "she's", 'while', 'own', 'i', 'he', 'into', 'been', 'myself', "don't", 'too', "you'll", "hasn't", 'hers', 'both', 'which', 'your', "mustn't", 'that', "that'll", 'before', 'aren', 'having', 'most', 'here', "isn't", 'after', 'until', "didn't", 'his', 'during', 'on', 'off', 'wouldn', 'when', 'because', 'ain', "you're", 'such', 's', 'same', 'o', 'the', 'am', 'this', 'have', 'our', 'for', 'they', 'didn', 'isn', 'being', 'can', 'its', 'under', 'him', 'at', 'and', 'their',

In [None]:
# Re-created here so this cell does not depend on the previous one having been run.
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence)
print(words)
# The stop-word list is all lowercase, so compare the lowercased token; otherwise
# capitalized stop words (e.g. a sentence-initial "The" or "Its") would be kept.
# Punctuation tokens such as '.' are not stop words and therefore remain.
without_stop_words = [word for word in words if word.lower() not in stop_words]
print(without_stop_words)

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']
['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']


In [None]:
# Lemmatize a plural noun; the part of speech is passed explicitly by keyword.
lemmatizer.lemmatize("games", pos=wordnet.NOUN)

'game'

## Punctuation Removal

In [None]:
# Alternative way to tokenize into words: a regular-expression tokenizer returns only
# the text matched by the pattern, so punctuation marks are dropped from the result.

from nltk.tokenize import RegexpTokenizer

# The pattern is a raw string: '\w' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
# \w  -> a single word character (letter, digit or underscore)
# \w+ -> one or more consecutive word characters, i.e. one token
tokenizer = RegexpTokenizer(r'\w+')
# The hyphen is not a word character, so 'Eight-seven' comes back as two tokens.
tokenizer.tokenize('Eight-seven miles to go, yet. Onward!')


['Eight', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']

## Parts of speech tagging

In [None]:
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence) # split the sentence into a list of word tokens
tagged = nltk.tag.pos_tag(words)  # nltk.tag.pos_tag(list_of_words) -> list of (token, tag) pairs
print(tagged)

[('Backgammon', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('oldest', 'JJS'), ('known', 'VBN'), ('board', 'NN'), ('games', 'NNS'), ('.', '.')]


In [None]:
# Download the 'tagsets' help package, then print the description and example words
# for the tags of the Penn Treebank ("upenn") tag set used in the tagging output above.
nltk.download('tagsets')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


https://colab.research.google.com/drive/1fWXb6nOXoXIZvdbjPG8USYHpm1_hBxOv#scrollTo=MfbaMbNXtR0K


https://colab.research.google.com/drive/1P-LwrUB5ungPgpjFueiOPnfdKcsxSziq#scrollTo=fcPAvR0-kkpQ
