In [1]:
import nltk # natural language toolkit

In [2]:
from nltk.tokenize import word_tokenize # NLP has two tokenizers
from nltk.tokenize import sent_tokenize

In [3]:
# Tokenization is breaking the sentences 
a = "Hello and welcome friends to the first NLP workshop. My name is Aryan Mishra. I will teaching you NLP from scratch"

In [4]:
A = word_tokenize(a) # word_tokenize converts words into tokens .
A

['Hello',
 'and',
 'welcome',
 'friends',
 'to',
 'the',
 'first',
 'NLP',
 'workshop',
 '.',
 'My',
 'name',
 'is',
 'Aryan',
 'Mishra',
 '.',
 'I',
 'will',
 'teaching',
 'you',
 'NLP',
 'from',
 'scratch']

In [5]:
S = sent_tokenize(a) # sentence tokenizer breaks sentences into parts when it sees a full stop then the sentence is complete.
S

['Hello and welcome friends to the first NLP workshop.',
 'My name is Aryan Mishra.',
 'I will teaching you NLP from scratch']

### Type, Length and Frequency Checking

In [6]:
type(A),len(A)

(list, 23)

In [7]:
from nltk.probability import FreqDist # 
frequency = FreqDist()

In [8]:
for i in A:
    frequency[i] = frequency[i]+1 # elements has occured

frequency

FreqDist({'NLP': 2, '.': 2, 'Hello': 1, 'and': 1, 'welcome': 1, 'friends': 1, 'to': 1, 'the': 1, 'first': 1, 'workshop': 1, ...})

### Stemming

In [9]:
from nltk.stem import PorterStemmer # converting a word into root word - Stemming just cut the words
pst = PorterStemmer()

In [10]:
pst.stem('Making')

'make'

In [11]:
for i in A:
    print(pst.stem(i))

hello
and
welcom
friend
to
the
first
nlp
workshop
.
my
name
is
aryan
mishra
.
i
will
teach
you
nlp
from
scratch


In [12]:
pst.stem('universal')

'univers'

In [13]:
pst.stem('universe')

'univers'

In [14]:
pst.stem('university')

'univers'

In [15]:
pst.stem('alumni') # Problem in stemming

'alumni'

In [16]:
pst.stem('alumus')

'alumu'

### Lemmatization 

In [17]:
import nltk # Lemmatization is used to prevent the drawback of stemming they do not give wrong words for different meanings
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [18]:
lemmatizer = WordNetLemmatizer()

In [19]:
pst.stem('trouble')

'troubl'

In [20]:
lemmatizer.lemmatize('trouble')

'trouble'

In [21]:
for i in A:
    print(lemmatizer.lemmatize(i))

Hello
and
welcome
friend
to
the
first
NLP
workshop
.
My
name
is
Aryan
Mishra
.
I
will
teaching
you
NLP
from
scratch


In [22]:
lemmatizer.lemmatize('alumnus')

'alumnus'

In [23]:
lemmatizer.lemmatize('alumni')

'alumnus'

In [24]:
# Lemmatization not only cut or split the words but also matches its meaning with the dictionary but in stemming it just cuts the words they 
# do not match the meaning with the dictionary this is the difference between lemmatization and stemming.

# Lemmatization takes more time than stemming

### Pos_tag

In [25]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


False

In [26]:
for i in A:
    print(nltk.pos_tag([i]))

[('Hello', 'NN')]
[('and', 'CC')]
[('welcome', 'NN')]
[('friends', 'NNS')]
[('to', 'TO')]
[('the', 'DT')]
[('first', 'RB')]
[('NLP', 'NN')]
[('workshop', 'NN')]
[('.', '.')]
[('My', 'PRP$')]
[('name', 'NN')]
[('is', 'VBZ')]
[('Aryan', 'NN')]
[('Mishra', 'NN')]
[('.', '.')]
[('I', 'PRP')]
[('will', 'MD')]
[('teaching', 'VBG')]
[('you', 'PRP')]
[('NLP', 'NN')]
[('from', 'IN')]
[('scratch', 'NN')]


### Name Entity Recognition - 

In [27]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [28]:
text = 'Harry Lives in New York'
words = word_tokenize(text)
postags = pos_tag(words)

In [29]:
tree = nltk.ne_chunk(postags)
print(tree)

LookupError: 
**********************************************************************
  Resource [93mmaxent_ne_chunker[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('maxent_ne_chunker')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mchunkers/maxent_ne_chunker/english_ace_multiclass.pickle[0m

  Searched in:
    - 'C:\\Users\\user/nltk_data'
    - 'c:\\Users\\user\\miniconda3\\nltk_data'
    - 'c:\\Users\\user\\miniconda3\\share\\nltk_data'
    - 'c:\\Users\\user\\miniconda3\\lib\\nltk_data'
    - 'C:\\Users\\user\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


### Stopwords - 

In [30]:
from nltk.corpus import stopwords

In [31]:
stop_words = set(stopwords.words('english'))

In [32]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [33]:
msg = "My name is Aryan Mishra, I love making videos and watching cricket. My speciality is making things easy"

words = word_tokenize(msg)

filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
    
print(words)
print(filtered_sentence) # stopwords are removed from the sentence in the output.

['My', 'name', 'is', 'Aryan', 'Mishra', ',', 'I', 'love', 'making', 'videos', 'and', 'watching', 'cricket', '.', 'My', 'speciality', 'is', 'making', 'things', 'easy']
['My', 'name', 'Aryan', 'Mishra', ',', 'I', 'love', 'making', 'videos', 'watching', 'cricket', '.', 'My', 'speciality', 'making', 'things', 'easy']


### BOW - Back of Word

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [35]:
A1 = 'hello and welcome dosto' # A sentence is document and the collection of sentence is known as corpus.
A2 = 'shri love NLP' # Vocabulary - unique words 
A3 = 'shri is good boy'

In [36]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform([A1,A2,A3])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()

result = pd.DataFrame(dense,columns=feature_names)

print(result)

   boy  dosto  good  hello  love  nlp  shri  welcome
0    0      1     0      1     0    0     0        1
1    0      0     0      0     1    1     1        0
2    1      0     1      0     0    0     1        0
