<a href="https://colab.research.google.com/github/11239A086/NLP_RECORD/blob/main/NLP_RECORD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TOKENIZATION**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
text = "I love Natural Language Processing."
print(word_tokenize(text))

['I', 'love', 'Natural', 'Language', 'Processing', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**LEMMATIZATION**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lm = WordNetLemmatizer()
print(lm.lemmatize("swimming", pos='v'))


swim


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**STEMMING**

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
words = ["playing", "played", "plays"]
print([ps.stem(w) for w in words])



['play', 'play', 'play']


**SPELL** **CORRECTION**

In [None]:
import difflib

words = ["apple", "banana", "orange", "grape", "mango"]  # Dictionary
word = "magno"       # Misspelled word
correct = difflib.get_close_matches(word, words, n=1)[0]
print(correct)


mango


**DEDUCTION**

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

ps = PorterStemmer()

def stem(text):
    return [ps.stem(w) for w in word_tokenize(text.lower())]

def deduce(p, h):
    return "entailment" if h in p else "no entailment"

print("Stems:", stem("running runner runs easily fairer"))
print("Deduction:", deduce("All men are mortal Socrates is a man", "Socrates is mortal"))

Stems: ['run', 'runner', 'run', 'easili', 'fairer']
Deduction: no entailment


**MORPHOLOGY**

In [None]:
import spacy

# Load the English tokenizer and tagger
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "The children are playing in the park."

# Process the text
doc = nlp(text)

# Display morphological features for each token
for token in doc:
    print(f"Word: {token.text}")
    print(f"  Lemma: {token.lemma_}")
    print(f"  POS: {token.pos_}")
    print(f"  Morph: {token.morph}")
    print("-" * 30)

Word: The
  Lemma: the
  POS: DET
  Morph: Definite=Def|PronType=Art
------------------------------
Word: children
  Lemma: child
  POS: NOUN
  Morph: Number=Plur
------------------------------
Word: are
  Lemma: be
  POS: AUX
  Morph: Mood=Ind|Tense=Pres|VerbForm=Fin
------------------------------
Word: playing
  Lemma: play
  POS: VERB
  Morph: Aspect=Prog|Tense=Pres|VerbForm=Part
------------------------------
Word: in
  Lemma: in
  POS: ADP
  Morph: 
------------------------------
Word: the
  Lemma: the
  POS: DET
  Morph: Definite=Def|PronType=Art
------------------------------
Word: park
  Lemma: park
  POS: NOUN
  Morph: Number=Sing
------------------------------
Word: .
  Lemma: .
  POS: PUNCT
  Morph: PunctType=Peri
------------------------------


# N-GRAMS

In [9]:
import nltk
from nltk import ngrams, FreqDist
from nltk.tokenize import word_tokenize

def ngram_probability_nltk(text, n):
    # Tokenize the text into words
    words = word_tokenize(text.lower())

    # Generate n-grams and (n-1)-grams
    ngrams_list = list(ngrams(words, n))
    n_minus_1_list = list(ngrams(words, n-1)) if n > 1 else None

    # Frequency counts
    ngram_freq = FreqDist(ngrams_list)
    if n > 1:
        n_minus_1_freq = FreqDist(n_minus_1_list)

    probabilities = {}
    for gram in ngram_freq:
        if n == 1:
            # Unigram probability
            probabilities[gram] = ngram_freq[gram] / len(words)
        else:
            # Bigram/Trigram probability
            probabilities[gram] = ngram_freq[gram] / n_minus_1_freq[gram[:-1]]

    return probabilities

# Example usage
text = "I love natural language processing and I love machine learning"
n = 3  # Change to 1 for unigram, 2 for bigram, 3 for trigram

prob = ngram_probability_nltk(text, n)
for gram, p in prob.items():
    print(f"{gram}: {p:.4f}")


('i', 'love', 'natural'): 0.5000
('love', 'natural', 'language'): 1.0000
('natural', 'language', 'processing'): 1.0000
('language', 'processing', 'and'): 1.0000
('processing', 'and', 'i'): 1.0000
('and', 'i', 'love'): 1.0000
('i', 'love', 'machine'): 0.5000
('love', 'machine', 'learning'): 1.0000


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# **SMOOTHING** **N-GRAM** (**LAPLACE** **SMOOTHING**)

In [10]:
def ngram_smooth(text, n=2):
    words = text.split()
    N = len(words) - n + 1
    ngrams = [tuple(words[i:i+n]) for i in range(N)]
    V = len(set(ngrams))
    for ng in sorted(set(ngrams)):
        count = ngrams.count(ng)
        print(ng, "->", round((count+1)/(N+V), 4))

text = "I love NLP and I love Python"
ngram_smooth(text, 2)

('I', 'love') -> 0.2727
('NLP', 'and') -> 0.1818
('and', 'I') -> 0.1818
('love', 'NLP') -> 0.1818
('love', 'Python') -> 0.1818


**POS** **TAGGING**

In [11]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Input sentence
text = "The quick brown fox jumps over the lazy dog."

# Process the text
doc = nlp(text)

# Print word and its POS tag
for token in doc:
    print(token.text, "→", token.pos_)

The → DET
quick → ADJ
brown → ADJ
fox → NOUN
jumps → VERB
over → ADP
the → DET
lazy → ADJ
dog → NOUN
. → PUNCT


**BENDING** **POS** **TAGGER**

In [16]:
import nltk
from nltk.tag import brill, BrillTaggerTrainer

# Training data (small example)
train_data = [
    [('The','DT'),('dog','NN'),('barks','VBZ')],
    [('A','DT'),('cat','NN'),('meows','VBZ')]
]

# Base tagger (Unigram)
base_tagger = nltk.UnigramTagger(train_data)

# Brill trainer with fntbl37 templates
trainer = BrillTaggerTrainer(base_tagger, brill.fntbl37())
brill_tagger = trainer.train(train_data)

# Test
sentence = ['The','cat','barks']
print(brill_tagger.tag(sentence))


[('The', 'DT'), ('cat', 'NN'), ('barks', 'VBZ')]


**Hidden** **Markov** **Model**

In [15]:
import nltk
from nltk.corpus import treebank

# Training data
tagged_sentences = treebank.tagged_sents()[:3000]
test_sentences = treebank.tagged_sents()[3000:]

# Train Hidden Markov Model
from nltk.tag import hmm
trainer = hmm.HiddenMarkovModelTrainer()
hmm_model = trainer.train_supervised(tagged_sentences)

# Test the model
print("Predicted tags:", hmm_model.tag(["This", "is", "a", "test"]))
print("Accuracy:", hmm_model.evaluate(test_sentences))


Predicted tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('test', 'NN')]


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Accuracy:", hmm_model.evaluate(test_sentences))


Accuracy: 0.36844377293330455


In [13]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

**NORMALIZATION**

In [17]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

lem = WordNetLemmatizer()

text = "running runners ran easily fairer"
tokens = word_tokenize(text.lower())
norm = [lem.lemmatize(w) for w in tokens]

print("Original:", tokens)
print("Normalized:", norm)

Original: ['running', 'runners', 'ran', 'easily', 'fairer']
Normalized: ['running', 'runner', 'ran', 'easily', 'fairer']
