<a href="https://colab.research.google.com/github/Akhila-Karuvaje/NLP_lab/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install necessary packages (run only once in your environment)
!pip install nltk
!pip install langdetect

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from langdetect import detect

# Download necessary resources:
#   'punkt_tab' - tokenizer data, fetched before word_tokenize is used below
#   'stopwords' - word lists read through stopwords.words('english')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Clean ``text`` in five printed stages and return the stemmed tokens.

    Stages, in order:
      1. tokenization with ``word_tokenize``
      2. filtration - keep tokens for which ``str.isalnum()`` is true
      3. script validation - keep tokens made only of the letters A-Z / a-z
      4. stop-word removal against the English stop-word list (case-insensitive)
      5. stemming with ``PorterStemmer``

    The result of every stage is printed before the next stage runs.
    """
    ascii_letters_only = re.compile("^[A-Za-z]+$")

    def show(label, values):
        # Print one stage's result and hand it back for the next stage.
        print(label, values)
        return values

    raw = show("Tokenized Words:", word_tokenize(text))

    alnum_only = show(
        "Filtered Tokens:",
        [tok for tok in raw if tok.isalnum()],
    )

    latin_only = show(
        "Script Validated Tokens:",
        [tok for tok in alnum_only if ascii_letters_only.match(tok)],
    )

    english_stops = set(stopwords.words('english'))
    content_words = show(
        "Tokens after Stop Word Removal:",
        [tok for tok in latin_only if tok.lower() not in english_stops],
    )

    stem = PorterStemmer().stem
    return show("Stemmed Tokens:", [stem(tok) for tok in content_words])

# Example usage
text = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence!"
preprocessed_text = preprocess_text(text)

# Language detection example
# langdetect's detector is non-deterministic by default; fixing the seed before
# the first detect() call makes the reported language reproducible across runs.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

text_for_detection = "नमस्ते, आप कैसे हैं?"
lang = detect(text_for_detection)
print(f"The language of the text is: {lang}")




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tokenized Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '!']
Filtered Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence']
Script Validated Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence']
Tokens after Stop Word Removal: ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'Artificial', 'Intelligence']
Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'fascin', 'field', 'artifici', 'intellig']
The language of the text is: hi


In [4]:
# Install required package (only once)
!pip install nltk

import nltk
from nltk import word_tokenize, ngrams
from collections import Counter
import math

# Download tokenizer resources ('punkt_tab') before word_tokenize is called below
nltk.download('punkt_tab')

# Function to generate n-grams
def generate_ngrams(text, n):
    """Return the word n-grams of ``text`` as a list.

    The text is lower-cased, tokenized with ``word_tokenize`` and then passed
    to ``ngrams`` with window size ``n``.
    """
    lowered = text.lower()
    words = word_tokenize(lowered)
    windows = ngrams(words, n)
    return list(windows)

# Function to calculate probability distribution for n-grams
def calculate_ngram_probabilities(ngrams_list):
    """Map every distinct n-gram to its relative frequency.

    The value for an n-gram is (occurrences of that n-gram) divided by the
    total number of n-grams in ``ngrams_list``. An empty input yields ``{}``.
    """
    tally = Counter(ngrams_list)
    denominator = sum(tally.values())

    distribution = {}
    for gram, occurrences in tally.items():
        distribution[gram] = occurrences / denominator
    return distribution

# Corpus of sentences
sentences = [
    "I love programming.",
    "Programming is fun.",
    "I love coding and programming."
]
text = " ".join(sentences)  # Combine all sentences into one training corpus

# Generate n-grams
unigrams = generate_ngrams(text, 1)
bigrams = generate_ngrams(text, 2)
trigrams = generate_ngrams(text, 3)

# Calculate n-gram probabilities (relative frequency of each n-gram)
unigram_prob = calculate_ngram_probabilities(unigrams)
bigram_prob = calculate_ngram_probabilities(bigrams)
trigram_prob = calculate_ngram_probabilities(trigrams)

# Display the three tables with one loop instead of three copy-pasted ones;
# the printed text is the same as before.
for table_title, table in (
    ("Unigram", unigram_prob),
    ("Bigram", bigram_prob),
    ("Trigram", trigram_prob),
):
    print(f"\n{table_title} Probabilities:")
    for gram, prob in table.items():
        print(f"{gram}: {prob:.4f}")

# Function to compute total bigram and trigram probability of a test sentence
def compute_total_probability(test_sentence, bigram_prob, trigram_prob, unseen_prob=1e-10):
    """Multiply the table probabilities of every bigram and trigram in a sentence.

    Args:
        test_sentence: Sentence to score; it is lower-cased and tokenized with
            ``nltk.word_tokenize``.
        bigram_prob: Mapping from a 2-tuple of words to a probability.
        trigram_prob: Mapping from a 3-tuple of words to a probability.
        unseen_prob: Value used for any n-gram that is missing from its table
            (a constant floor, so one unseen n-gram does not zero the product).
            Defaults to the previously hard-coded ``1e-10``.

    Returns:
        ``(bigram_total, trigram_total)``. A total is ``1`` when the sentence
        is too short to contain an n-gram of that order.
    """
    test_words = nltk.word_tokenize(test_sentence.lower())

    def product_over(table, order):
        # Slide a window of `order` words over the sentence and multiply the
        # looked-up probabilities together.
        total = 1
        for end in range(order, len(test_words) + 1):
            gram = tuple(test_words[end - order:end])
            total *= table.get(gram, unseen_prob)
        return total

    return product_over(bigram_prob, 2), product_over(trigram_prob, 3)

# Test sentence
test_sentence = "I love programming"

# Compute total probabilities
# (scored against the bigram_prob / trigram_prob tables built from the corpus earlier in this cell)
bigram_total_prob, trigram_total_prob = compute_total_probability(test_sentence, bigram_prob, trigram_prob)

# Display results (fixed-point, 10 decimal places)
print(f"\nTotal Probability (Bigram) for '{test_sentence}': {bigram_total_prob:.10f}")
print(f"Total Probability (Trigram) for '{test_sentence}': {trigram_total_prob:.10f}")



Unigram Probabilities:
('i',): 0.1429
('love',): 0.1429
('programming',): 0.2143
('.',): 0.2143
('is',): 0.0714
('fun',): 0.0714
('coding',): 0.0714
('and',): 0.0714

Bigram Probabilities:
('i', 'love'): 0.1538
('love', 'programming'): 0.0769
('programming', '.'): 0.1538
('.', 'programming'): 0.0769
('programming', 'is'): 0.0769
('is', 'fun'): 0.0769
('fun', '.'): 0.0769
('.', 'i'): 0.0769
('love', 'coding'): 0.0769
('coding', 'and'): 0.0769
('and', 'programming'): 0.0769

Trigram Probabilities:
('i', 'love', 'programming'): 0.0833
('love', 'programming', '.'): 0.0833
('programming', '.', 'programming'): 0.0833
('.', 'programming', 'is'): 0.0833
('programming', 'is', 'fun'): 0.0833
('is', 'fun', '.'): 0.0833
('fun', '.', 'i'): 0.0833
('.', 'i', 'love'): 0.0833
('i', 'love', 'coding'): 0.0833
('love', 'coding', 'and'): 0.0833
('coding', 'and', 'programming'): 0.0833
('and', 'programming', '.'): 0.0833

Total Probability (Bigram) for 'I love programming': 0.0118343195
Total Probability 

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
def min_edit_distance(str1, str2, ins_cost=1, del_cost=1, sub_cost=1):
    """Return the minimum cost of editing ``str1`` into ``str2``.

    Allowed operations are inserting a character, deleting a character and
    substituting one character for another. With the default unit costs this
    is the Levenshtein distance.

    Args:
        str1: Source string.
        str2: Target string.
        ins_cost: Cost of one insertion (default 1).
        del_cost: Cost of one deletion (default 1).
        sub_cost: Cost of one substitution (default 1); pass 2 to make a
            substitution as expensive as a deletion plus an insertion.

    Returns:
        The total cost of the cheapest edit sequence.
    """
    m, n = len(str1), len(str2)

    # Row 0: turning the empty string into str2[:j] takes j insertions.
    prev = [j * ins_cost for j in range(n + 1)]

    # Only the previous row is needed to compute the current one, so two rows
    # are kept instead of the full (m + 1) x (n + 1) matrix.
    for i in range(1, m + 1):
        # Column 0: turning str1[:i] into the empty string takes i deletions.
        curr = [i * del_cost] + [0] * n
        for j in range(1, n + 1):
            same = str1[i - 1] == str2[j - 1]
            curr[j] = min(
                prev[j] + del_cost,                        # Deletion
                curr[j - 1] + ins_cost,                    # Insertion
                prev[j - 1] + (0 if same else sub_cost),   # Match / Substitution
            )
        prev = curr

    return prev[n]

# Test the algorithm with different variations
test_cases = [
    ("kitten", "sitting"),       # 2 substitutions (k->s, e->i) + 1 insertion (g)
    ("flaw", "lawn"),            # 1 deletion (f) + 1 insertion (n)
    ("intention", "execution"),  # Multiple operations
    ("apple", "aple"),           # 1 deletion (p)
    ("recieve", "receive")       # Swapped 'ie'/'ei': costs 2, there is no transposition operation
]

# Evaluate MED for each test case
for str1, str2 in test_cases:
    print(f"MED between '{str1}' and '{str2}': {min_edit_distance(str1, str2)}")


MED between 'kitten' and 'sitting': 3
MED between 'flaw' and 'lawn': 2
MED between 'intention' and 'execution': 5
MED between 'apple' and 'aple': 1
MED between 'recieve' and 'receive': 2


In [10]:
class TopDownParser:
    """Recursive-descent recognizer for a fixed three-rule grammar.

    Grammar (one method per non-terminal):

        S -> 'a' A B
        A -> 'b' | epsilon
        B -> 'c'

    so the accepted strings are exactly "abc" and "ac".
    """

    def __init__(self, input_string):
        self.input = input_string
        self.index = 0  # position of the next unread character

    def match(self, char):
        """Consume ``char`` if it is the next input character; report success."""
        if self.index < len(self.input) and self.input[self.index] == char:
            self.index += 1
            return True
        return False

    def S(self):
        """S -> 'a' A B."""
        return self.match('a') and self.A() and self.B()

    def A(self):
        """A -> 'b' | epsilon: consume an optional 'b'; this rule never fails."""
        self.match('b')
        return True

    def B(self):
        """B -> 'c'."""
        return self.match('c')

    def parse(self):
        """Return True when the whole input is derivable from S.

        The read position is reset first, so calling ``parse()`` again on the
        same instance gives the same answer instead of starting from the
        already-consumed end of the input.
        """
        self.index = 0
        return self.S() and self.index == len(self.input)

# Example Usage
# "abc" takes the A -> 'b' branch of the parser above; the result is printed as a boolean.
input_string = "abc"
td_parser = TopDownParser(input_string)
print("Top-Down Parsing Result:", td_parser.parse())


Top-Down Parsing Result: True


In [13]:
class BottomUpParser:
    """Shift-reduce recognizer for a fixed two-rule grammar.

    Grammar:

        S -> 'a' A
        A -> 'b' 'c'

    Non-terminals are stored on the stack as the strings 'A' and 'S', so an
    input that itself contains those characters is treated as if it contained
    the non-terminal.
    """

    def __init__(self, input_string):
        self.input = list(input_string)  # remaining input, consumed from the front
        self.stack = []

    def shift(self):
        """Move the next input symbol onto the stack (no-op when input is empty)."""
        if self.input:
            self.stack.append(self.input.pop(0))

    def reduce(self):
        """Apply one grammar rule to the top two stack symbols, if one matches.

        Returns True when a reduction was performed, False otherwise.
        """
        if len(self.stack) < 2:
            return False
        handle = ''.join(self.stack[-2:])
        if handle == 'bc':
            self.stack[-2:] = ['A']
            return True
        if handle == 'aA':
            self.stack[-2:] = ['S']
            return True
        return False

    def parse(self):
        """Return True when the input reduces to the single start symbol S.

        The loop runs only while input remains. Once the input is exhausted
        and no reduction applies, nothing further can change the stack, so
        the parse ends there (rejecting) instead of spinning forever on a
        stack that still holds several symbols.
        """
        while self.input:
            self.shift()
            while self.reduce():
                pass
        return self.stack == ['S']

# Example Usage
# The parser consumes its input list while parsing, so a fresh instance is built for the string.
input_string = "abc"
bu_parser = BottomUpParser(input_string)
print("Bottom-Up Parsing Result:", bu_parser.parse())

Bottom-Up Parsing Result: True


In [14]:
from collections import defaultdict
import math

# Training dataset with (text, label)
# Each text is a ", "-separated list of words (repeats count); the label is "Comedy" or "Action".
data = [
    ("fun, couple, love, love, comedy", "Comedy"),
    ("fast, furious, shoot, action", "Action"),
    ("couple, fly, fast, fun, fun, comedy", "Comedy"),
    ("furious, shoot, shoot, fun, action", "Action"),
    ("fly, fast, shoot, love, action", "Action")
]

# Unlabelled document to classify, in the same format as the training texts
test_doc = "fast, couple, shoot, fly"

# Tokenization function
def tokenize(text):
    """Split a comma-separated string into lower-cased word tokens.

    Whitespace around each token is stripped and empty pieces are dropped, so
    "a, b", "a,b" and "a ,  b" all give ``["a", "b"]`` and an empty string
    gives ``[]`` (not ``[""]``).
    """
    pieces = (piece.strip() for piece in text.lower().split(","))
    return [piece for piece in pieces if piece]

# Naïve Bayes Training Function
def train_naive_bayes(data):
    """Collect the counts a multinomial Naive Bayes classifier needs.

    Args:
        data: Iterable of ``(text, label)`` pairs; each text is split with
            ``tokenize``.

    Returns:
        A 3-tuple ``(word_counts, class_counts, vocab_size)`` where
        ``word_counts[label][word]`` is how often ``word`` occurs in documents
        of ``label``, ``class_counts[label]`` is the number of documents with
        that label, and ``vocab_size`` is the number of distinct words seen.
        Both count containers are defaultdicts, so missing keys read as 0.
    """
    per_label_words = defaultdict(lambda: defaultdict(int))
    docs_per_label = defaultdict(int)
    seen_words = set()

    for document, category in data:
        docs_per_label[category] += 1
        for token in tokenize(document):
            seen_words.add(token)
            per_label_words[category][token] += 1

    vocabulary_size = len(seen_words)
    return per_label_words, docs_per_label, vocabulary_size

# Naïve Bayes Classification Function
def classify(text, word_counts, class_counts, vocab_size):
    """Return the most probable class for ``text`` under multinomial Naive Bayes.

    Each class is scored as

        log P(class) + sum over tokens of log((count(token, class) + 1) /
                                              (total words in class + vocab_size))

    i.e. add-one (Laplace) smoothed likelihoods, summed in log space.

    Args:
        text: Document to classify; split with ``tokenize``.
        word_counts: Mapping ``label -> {word: count}``.
        class_counts: Mapping ``label -> number of training documents``.
        vocab_size: Number of distinct words in the training data.

    Returns:
        The label with the highest score.

    The count tables are only read: lookups use ``.get`` so classifying never
    inserts zero-count entries into the caller's defaultdicts, and plain
    dicts work as well.
    """
    tokens = tokenize(text)                     # same for every class
    total_docs = sum(class_counts.values())     # same for every class

    posteriors = {}
    for label, doc_count in class_counts.items():
        counts = word_counts.get(label, {})
        denominator = sum(counts.values()) + vocab_size  # same for every token
        log_prior = math.log(doc_count / total_docs)
        posteriors[label] = log_prior + sum(
            math.log((counts.get(token, 0) + 1) / denominator)
            for token in tokens
        )
    return max(posteriors, key=posteriors.get)

# Train and test
# train_naive_bayes returns (per-class word counts, per-class document counts, vocabulary size);
# the three values are passed straight through to classify.
word_counts, class_counts, vocab_size = train_naive_bayes(data)
predicted_class = classify(test_doc, word_counts, class_counts, vocab_size)

print(f"Test Doc: {test_doc}")
print(f"Predicted Class: {predicted_class}")


Test Doc: fast, couple, shoot, fly
Predicted Class: Action


In [19]:
import nltk
from nltk.corpus import brown, inaugural, reuters, udhr
from nltk import FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize
from nltk.tag import UnigramTagger
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
# Colab-only: google.colab is imported here, so this cell needs a Colab runtime.
# NOTE(review): the recorded output shows a re-upload saved as "a.txt (1).txt" next to the
# first copy, and create_custom_corpus() below then lists both files - confirm this is intended.
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload files


# Download required datasets
# brown / inaugural / reuters / udhr back the corpus readers imported at the top of this cell.
nltk.download('brown')
nltk.download('inaugural')
nltk.download('reuters')
nltk.download('udhr')
# NOTE(review): nothing visible in this cell calls a tokenizer or the perceptron tagger;
# presumably the next two downloads are kept for later experiments - confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# 1. Study various Corpora
def explore_corpora():
    """Print the Brown categories and the first ten words of three other corpora.

    The corpora sampled are Inaugural, Reuters and the 'English-Latin1' file
    of UDHR, in that order.
    """
    print("\nBrown Corpus Categories:", brown.categories())

    first_ten = slice(None, 10)
    # Callables, so each corpus is only read right before its line is printed.
    word_sources = (
        ("Inaugural Corpus", lambda: inaugural.words()),
        ("Reuters Corpus", lambda: reuters.words()),
        ("UDHR Corpus (English)", lambda: udhr.words('English-Latin1')),
    )
    for corpus_label, load_words in word_sources:
        print(f"First 10 words in {corpus_label}:", load_words()[first_ten])

explore_corpora()

# 2. Create and use a custom corpus
def create_custom_corpus(corpus_root="/content/"):
    """Build a plain-text corpus from the ``.txt`` files in a directory and preview it.

    Args:
        corpus_root: Directory handed to ``PlaintextCorpusReader``. Defaults
            to "/content/", the directory this notebook originally read from.

    Prints the file ids found and the first 20 words of the corpus.
    """
    # Raw string: '\.' is an invalid escape sequence in a normal string literal
    # and triggers a warning on recent Python versions. The regex is unchanged.
    wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    print("Files in Corpus:", wordlists.fileids())
    print("First 20 words:", wordlists.words()[:20])

create_custom_corpus()

# 3. Conditional Frequency Distribution
def conditional_frequency():
    """Print the ten most common lower-cased words of the Brown 'news' category.

    A conditional frequency distribution is built over every Brown category
    (condition = category, sample = lower-cased word); only the 'news'
    condition is reported.
    """
    def genre_word_pairs():
        # Yield one (category, lower-cased word) pair per word in the corpus.
        for genre in brown.categories():
            for word in brown.words(categories=genre):
                yield genre, word.lower()

    cfd = ConditionalFreqDist(genre_word_pairs())
    top_news_words = cfd['news'].most_common(10)
    print("\nMost common words in 'news' category:", top_news_words)

conditional_frequency()

# 4. Study Tagged Corpora & Find Most Frequent Noun Tags
def frequent_noun_tags():
    """Print the ten most frequent Brown words whose tag starts with 'NN'.

    Words are counted case-sensitively, exactly as they appear in the tagged
    corpus.
    """
    noun_counts = FreqDist(
        token
        for token, tag in brown.tagged_words()
        if tag.startswith('NN')
    )
    print("\nMost Frequent Nouns:", noun_counts.most_common(10))

frequent_noun_tags()

# 5. Map Words to Properties Using Python Dictionaries
def word_properties():
    """Look up "NLP" in a small abbreviation-to-expansion dictionary and print it."""
    expansions = dict(
        AI="Artificial Intelligence",
        NLP="Natural Language Processing",
        ML="Machine Learning",
    )
    nlp_expansion = expansions["NLP"]
    print("\nWord Properties:", nlp_expansion)

word_properties()

# 6. Rule-based Tagger & Unigram Tagger
def unigram_tagger_demo(train_size=4000):
    """Train a ``UnigramTagger`` on Brown 'news' sentences and print its accuracy.

    Args:
        train_size: Number of leading tagged sentences used for training; the
            remaining sentences are used for evaluation. Defaults to 4000.

    Returns:
        The trained ``UnigramTagger``.
    """
    tagged_sents = brown.tagged_sents(categories='news')
    train_data = tagged_sents[:train_size]
    test_data = tagged_sents[train_size:]

    unigram_tagger = UnigramTagger(train_data)

    # evaluate() is deprecated in favour of accuracy() (see the warning NLTK
    # prints); fall back to evaluate() only on NLTK versions without accuracy().
    score = getattr(unigram_tagger, "accuracy", None) or unigram_tagger.evaluate
    accuracy = score(test_data)
    print("\nUnigram Tagger Accuracy:", round(accuracy, 4))
    return unigram_tagger  # Return the actual tagger if needed

unigram_tagger_demo()

# 7. Word Segmentation from Continuous Text
def segment_text(text, corpus_words):
    """Split unspaced ``text`` into words by greedy longest-prefix matching.

    At each position the longest prefix found in ``corpus_words`` is taken as
    the next word. When no prefix matches, a single character is emitted so
    the scan always advances. The choice is greedy: an early long match is
    never revisited, even if a shorter one would segment the rest better.

    Args:
        text: String without word separators.
        corpus_words: Collection of known words (assumed to be a set or other
            iterable container of strings).

    Returns:
        List of segments whose concatenation equals ``text``.
    """
    # A prefix longer than the longest known word can never match, so the
    # candidate length is capped instead of starting from the whole remainder.
    longest_word = max((len(word) for word in corpus_words), default=0)

    words = []
    start = 0
    text_length = len(text)
    while start < text_length:
        upper = min(text_length, start + longest_word)
        for end in range(upper, start, -1):
            piece = text[start:end]
            if piece in corpus_words:
                break
        else:
            # If no match, take one character to avoid an infinite loop
            end = start + 1
            piece = text[start:end]
        words.append(piece)
        start = end  # advance an index rather than re-slicing the text
    return words

def word_segmentation():
    """Segment a fixed unspaced sample using the lower-cased Brown vocabulary.

    The vocabulary is the set of all Brown corpus words, lower-cased so it
    matches the lower-case sample; the segments are printed.
    """
    vocabulary = {token.lower() for token in brown.words()}
    run_together = "thisisatextwithoutspaces"
    print("\nSegmented Words:", segment_text(run_together, vocabulary))

word_segmentation()

Saving a.txt.txt to a.txt (1).txt

Brown Corpus Categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
First 10 words in Inaugural Corpus: ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House']
First 10 words in Reuters Corpus: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN']
First 10 words in UDHR Corpus (English): ['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the']
Files in Corpus: ['a.txt (1).txt', 'a.txt.txt']
First 20 words: ['i', 'am', 'going', 'to', 'college', 'i', 'am', 'going', 'to', 'college']


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package udhr to /root/nltk_data...
[nltk_data]   Package udhr is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Most common words in 'news' category: [('the', 6386), (',', 5188), ('.', 4030), ('of', 2861), ('and', 2186), ('to', 2144), ('a', 2130), ('in', 2020), ('for', 969), ('that', 829)]

Most Frequent Nouns: [('time', 1555), ('man', 1148), ('Af', 994), ('years', 942), ('way', 883), ('people', 809), ('men', 736), ('world', 684), ('life', 676), ('year', 647)]

Word Properties: Natural Language Processing


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = unigram_tagger.evaluate(test_data)



Unigram Tagger Accuracy: 0.8111

Segmented Words: ['this', 'is', 'ate', 'x', 't', 'without', 'spaces']


In [20]:
import nltk
from nltk.corpus import wordnet

# Fetch the WordNet data read through nltk.corpus.wordnet in the function below.
nltk.download('wordnet')

def get_synonyms_antonyms(word):
    """Collect WordNet synonyms and antonyms of ``word``.

    Every lemma name of every synset of ``word`` is a synonym; every antonym
    of each of those lemmas is an antonym (all of them, not just the first
    one listed for a lemma).

    Returns:
        ``(synonyms, antonyms)`` as two sets of lemma-name strings. Both are
        empty when WordNet has no synset for ``word``.
    """
    synonyms = set()
    antonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
            # antonyms() may return several lemmas (or none); keep them all.
            antonyms.update(opposite.name() for opposite in lemma.antonyms())
    return synonyms, antonyms

# Get synonyms and antonyms of the word "active"
word = "active"
synonyms, antonyms = get_synonyms_antonyms(word)

# Both results are sets, so the order in which their elements are printed is not fixed.
print(f"Word: {word}")
print("Synonyms:", synonyms)
print("Antonyms:", antonyms)


[nltk_data] Downloading package wordnet to /root/nltk_data...


Word: active
Synonyms: {'active_agent', 'dynamic', 'active', 'alive', 'active_voice', 'combat-ready', 'participating', 'fighting'}
Antonyms: {'passive', 'stative', 'inactive', 'dormant', 'quiet', 'passive_voice', 'extinct'}


In [23]:
import torch
from transformers import MarianMTModel, MarianTokenizer

# Check if CUDA is available and use it (falls back to the CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
# The checkpoint name is the only place the translation direction is chosen;
# the recorded output shows its files being downloaded when the cell ran.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)  # weights moved to the chosen device

# Translation function
def translate_text(text, src_lang="en", tgt_lang="fr"):
    """Translate ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: A string, or a list of strings translated as one padded batch.
        src_lang: Language code the caller expects to translate from.
        tgt_lang: Language code the caller expects to translate into.

    Returns:
        The translated string for a string input; for a list input, the list
        of translations (one per element) rather than only the first one.

    The translation direction is fixed by the checkpoint that was loaded, not
    by ``src_lang`` / ``tgt_lang``. A ``UserWarning`` is issued when
    ``model_name`` does not end in "-<src_lang>-<tgt_lang>", so a mismatching
    request is no longer ignored silently. The translation is still run.
    """
    import warnings  # local import: only needed for the mismatch warning

    requested_pair = f"-{src_lang}-{tgt_lang}"
    if not model_name.endswith(requested_pair):
        warnings.warn(
            f"Requested {src_lang}->{tgt_lang}, but the loaded model is "
            f"'{model_name}'; src_lang/tgt_lang do not change the model."
        )

    # Inputs must live on the same device as the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    if isinstance(text, str):
        return translated_text[0]
    return translated_text

# Example usage
# src_lang/tgt_lang are passed explicitly here and match the "en-fr" checkpoint loaded earlier in this cell.
text = "Hello, how are you?"
translated = translate_text(text, src_lang="en", tgt_lang="fr")

print("Original Text:", text)
print("Translated Text:", translated)

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Original Text: Hello, how are you?
Translated Text: Bonjour, comment allez-vous ?
