## Introduction to NLP


In [12]:
!pip install nltk



In [13]:
import nltk

In [14]:
text = "On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."


In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
from nltk.tokenize import sent_tokenize
sent_tk = sent_tokenize(text)
print("Sentence tokenizing the text: \n")
print(sent_tk)

Sentence tokenizing the text: 

['On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks.', 'The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share.']


In [17]:
from nltk.tokenize import word_tokenize
word_tk = word_tokenize(text)
print("Word tokenizing the text:\n")
print(word_tk)

Word tokenizing the text:

['On', 'Wednesday', ',', 'the', 'Association', 'for', 'Computing', 'Machinery', ',', 'the', 'world', '’', 's', 'largest', 'society', 'of', 'computing', 'professionals', ',', 'announced', 'that', 'Hinton', ',', 'LeCun', 'and', 'Bengio', 'had', 'won', 'this', 'year', '’', 's', 'Turing', 'Award', 'for', 'their', 'work', 'on', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'which', 'was', 'introduced', 'in', '1966', ',', 'is', 'often', 'called', 'the', 'Nobel', 'Prize', 'of', 'computing', ',', 'and', 'it', 'includes', 'a', '$', '1', 'million', 'prize', ',', 'which', 'the', 'three', 'scientists', 'will', 'share', '.']


In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords

sw = set(stopwords.words("english"))
print("Stop words in English language are: \n")
print(sw)

Stop words in English language are: 

{'y', 'myself', 's', "won't", 'them', 'having', 'there', 'do', "she's", 'who', 'have', "shan't", 'ourselves', 'after', 'they', 'those', 'out', "doesn't", 'few', 'our', 'not', 'about', "wouldn't", "hadn't", 'yourself', 'further', 'just', 'hasn', 'as', 'while', 'up', 'if', 'being', 'did', "isn't", 'each', 'through', 'd', 'couldn', 'himself', 'with', 'some', 'your', 'which', "should've", "wasn't", 't', 'he', 'own', 'off', 'i', 'o', 'is', 'shouldn', 'wouldn', 'am', 'wasn', 'but', 'where', 'herself', 'themselves', 'on', 'that', 'above', 'mustn', 'no', 'should', 've', 'now', "mustn't", 'won', 'were', 'was', 'their', 'weren', 'or', 'because', 'both', "you're", 'whom', 'isn', 'she', 'we', 'to', 'll', 'are', 'when', 'only', 'under', 'ma', "it's", 'during', 'below', "couldn't", "don't", 'by', "hasn't", "that'll", 'until', 'against', 'more', 'doesn', 'again', "you'll", 'a', 'so', 'doing', 're', 'into', 'in', 'nor', 'ain', 'aren', 'here', 'between', 'very', 't

In [20]:
filtered_words = [w for w in word_tk if not w in sw]

print("The text after removing stop words \n")
print(filtered_words)

The text after removing stop words 

['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.']


In [21]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

port_stem = PorterStemmer()

In [22]:
stemmed_words = []

for w in filtered_words:
    stemmed_words.append(port_stem.stem(w))
    
print("Filtered Sentence: \n", filtered_words, "\n")
print("Stemmed Sentence: \n", stemmed_words)

Filtered Sentence: 
 ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.'] 

Stemmed Sentence: 
 ['on', 'wednesday', ',', 'associ', 'comput', 'machineri', ',', 'world', '’', 'largest', 'societi', 'comput', 'profession', ',', 'announc', 'hinton', ',', 'lecun', 'bengio', 'year', '’', 'ture', 'award', 'work', 'neural', 'network', '.', 'the', 'ture', 'award', ',', 'introduc', '1966', ',', 'often', 'call', 'nobel', 'prize', 'comput', ',', 'includ', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']


Sentence Tokenization: Breaking down paragraphs into individual sentences.

Word Tokenization: Breaking down sentences into individual words.

Stop Words: Common words that add little meaning and are often removed from text analysis.

Stemming and Lemmatizing: Reducing words to their root or base forms for analysis.

Part of Speech Tagging: Identifying the grammatical parts of speech in a text.

Frequency Distribution Plotting: Counting and plotting the frequency of words to understand text distribution and perform sentiment analysis.

In [23]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
lemm_words=[]



In [25]:
for i in range(len(filtered_words)):
    lemm_words.append(lem.lemmatize(filtered_words[i]))
    
print(lemm_words)

['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professional', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'network', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']


In [27]:
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [28]:
from nltk import pos_tag

In [30]:
pos_tagged_words = pos_tag(word_tk)
print(pos_tagged_words)

[('On', 'IN'), ('Wednesday', 'NNP'), (',', ','), ('the', 'DT'), ('Association', 'NNP'), ('for', 'IN'), ('Computing', 'VBG'), ('Machinery', 'NNP'), (',', ','), ('the', 'DT'), ('world', 'NN'), ('’', 'NNP'), ('s', 'RB'), ('largest', 'JJS'), ('society', 'NN'), ('of', 'IN'), ('computing', 'VBG'), ('professionals', 'NNS'), (',', ','), ('announced', 'VBD'), ('that', 'IN'), ('Hinton', 'NNP'), (',', ','), ('LeCun', 'NNP'), ('and', 'CC'), ('Bengio', 'NNP'), ('had', 'VBD'), ('won', 'VBN'), ('this', 'DT'), ('year', 'NN'), ('’', 'VBZ'), ('s', 'JJ'), ('Turing', 'NNP'), ('Award', 'NNP'), ('for', 'IN'), ('their', 'PRP$'), ('work', 'NN'), ('on', 'IN'), ('neural', 'JJ'), ('networks', 'NNS'), ('.', '.'), ('The', 'DT'), ('Turing', 'NNP'), ('Award', 'NNP'), (',', ','), ('which', 'WDT'), ('was', 'VBD'), ('introduced', 'VBN'), ('in', 'IN'), ('1966', 'CD'), (',', ','), ('is', 'VBZ'), ('often', 'RB'), ('called', 'VBN'), ('the', 'DT'), ('Nobel', 'NNP'), ('Prize', 'NNP'), ('of', 'IN'), ('computing', 'NN'), (',',

In [None]:
from nltk.probability import FreqDist
fd = FreqDist(word_tk)
print