<a href="https://colab.research.google.com/github/11239m006/Natural_Language_Processing/blob/main/Nlp_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Tokenization**



In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data used by word_tokenize; 'punkt_tab' is the
# resource required by the latest NLTK releases.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = "I love Natural Language Processing."
tokens = word_tokenize(text)
print(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['I', 'love', 'Natural', 'Language', 'Processing', '.']


### **Stemming**

In [None]:
from nltk.stem import PorterStemmer

# Stem each inflected form with the Porter stemmer and show the results.
ps = PorterStemmer()
words = ["playing", "played", "plays"]
stems = []
for w in words:
    stems.append(ps.stem(w))
print(stems)


['play', 'play', 'play']


### **Lemmatization**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data before lemmatizing.
nltk.download('wordnet')

lm = WordNetLemmatizer()
lemma = lm.lemmatize("running", pos='v')  # pos='v': lemmatize as a verb
print(lemma)


run


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### **Morphology (Prefix/Suffix Split Example)**

In [None]:
# Split the word by fixed character positions: the first two characters
# are the prefix, the last four are the suffix, and the rest is the root.
word = "unkindness"
prefix = word[:2]
suffix = word[-4:]
root = word[len(prefix):len(word) - len(suffix)]
print(prefix, root, suffix)   # un + kind + ness


un kind ness


### **Normalization**


In [None]:
import re

# Lower-case the text first, then strip every character that is neither
# an ASCII letter nor whitespace. Only the printed string is cleaned;
# `text` keeps its punctuation.
text = "Hello!!! NLP, is FUN.".lower()
cleaned = re.sub(r'[^a-zA-Z\s]', '', text)
print(cleaned)


hello nlp is fun


### **N-Gram (Unigram, Bigram, Trigram)**

In [None]:
from nltk import ngrams

text = "Natural language processing is amazing".split()
# Build unigrams, bigrams and trigrams over the same token list and
# print the three lists on a single line, separated by spaces.
print(*(list(ngrams(text, size)) for size in (1, 2, 3)))


[('Natural',), ('language',), ('processing',), ('is',), ('amazing',)] [('Natural', 'language'), ('language', 'processing'), ('processing', 'is'), ('is', 'amazing')] [('Natural', 'language', 'processing'), ('language', 'processing', 'is'), ('processing', 'is', 'amazing')]


### **N-Gram Smoothing (Add-1 Laplace)**

In [None]:
from collections import Counter

words = "a a b a b c".split()
freq = Counter(words)


def laplace_prob(word, counts):
    """Return the add-1 (Laplace) smoothed unigram probability of `word`.

    P(word) = (count(word) + 1) / (N + V), where N is the total number of
    tokens counted in `counts` and V is the number of distinct word types
    in `counts`. A word absent from `counts` is treated as having count 0.

    Note: V only counts observed types, so the probabilities of the
    observed words already sum to 1; an unseen word gets extra mass on top.
    """
    total_tokens = sum(counts.values())
    vocab_size = len(counts)
    return (counts.get(word, 0) + 1) / (total_tokens + vocab_size)


# The original cell looked up 'm' (an unseen word) while labelling the
# result P(a); print both cases with labels that match what is computed.
print(laplace_prob('a', freq))  # P(a), seen word:   (3 + 1) / (6 + 3)
print(laplace_prob('m', freq))  # P(m), unseen word: (0 + 1) / (6 + 3)


0.1111111111111111


### **POS Tagging**

In [None]:
import nltk

# Download the tokenizer and perceptron-tagger resources; the '_eng'
# tagger package is the newly required file.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

text = nltk.word_tokenize("He is playing football")
tagged = nltk.pos_tag(text)
print(tagged)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('He', 'PRP'), ('is', 'VBZ'), ('playing', 'VBG'), ('football', 'NN')]


### **Hidden Markov Model (Simple POS Example using NLTK)**

In [None]:
import nltk
from nltk.tag import hmm

# train_supervised expects a list of sentences, each sentence being a list
# of (word, tag) pairs. The previous version passed the words and the tags
# as two parallel lists, so the trainer read ('I', 'eat') and ('PRP', 'VB')
# as (word, tag) pairs and learned 'eat' as a tag.
train_data = [[('I', 'PRP'), ('eat', 'VB')]]

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)
print(tagger.tag(['I', 'eat']))  # expected: [('I', 'PRP'), ('eat', 'VB')]


[('I', 'eat'), ('eat', 'eat')]


  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


### **Brill/Bidirectional POS Tagger (Bending POS Tagger)**

In [None]:
from nltk.tag.brill import brill24
from nltk.tag import RegexpTagger, BrillTaggerTrainer

# Baseline tagger: labels every token 'NN'. The Brill trainer then tries to
# learn transformation rules that correct the baseline's mistakes.
basic = RegexpTagger([(r'.*', 'NN')])
trainer = BrillTaggerTrainer(basic, brill24())

# Training data must be a list of sentences, each a list of (word, tag)
# pairs. The previous version passed words and tags as two parallel lists,
# which the trainer read as the pairs ('He', 'runs') and ('PRP', 'VBZ').
train_sents = [[('He', 'PRP'), ('runs', 'VBZ')]]

# min_score=1 allows a rule that fixes a single token to be kept; on a
# two-token corpus no rule can score higher than that.
# NOTE(review): assumes the default threshold is above 1 - confirm in the
# BrillTaggerTrainer.train documentation.
tagger = trainer.train(train_sents, min_score=1)
print(tagger.tag(['He', 'runs']))


[('He', 'NN'), ('runs', 'NN')]
