<a href="https://colab.research.google.com/github/2403a52012-lgtm/AIAC/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk import word_tokenize, pos_tag
from collections import defaultdict, Counter

# Fetch the tokenizer and tagger resources that word_tokenize / pos_tag look up
# at runtime. The *_tab and *_eng variants are requested in addition to the
# classic names (the original notebook added them to resolve LookupErrors).
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sample abstracts (shortened for lab demo); each entry is one sentence.
abstracts = [
    "We propose a novel neural network architecture for image classification.",
    "This paper presents a probabilistic model for sequence labeling tasks.",
    "Experimental results demonstrate significant improvements over baseline methods.",
    "The algorithm achieves state of the art performance on benchmark datasets.",
    "We analyze the convergence properties of stochastic gradient descent.",
]

# Tokenize and POS-tag every abstract: one list of (word, tag) pairs per sentence.
tagged_sentences = [pos_tag(word_tokenize(text)) for text in abstracts]

tagged_sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[[('We', 'PRP'),
  ('propose', 'VBP'),
  ('a', 'DT'),
  ('novel', 'JJ'),
  ('neural', 'JJ'),
  ('network', 'NN'),
  ('architecture', 'NN'),
  ('for', 'IN'),
  ('image', 'NN'),
  ('classification', 'NN'),
  ('.', '.')],
 [('This', 'DT'),
  ('paper', 'NN'),
  ('presents', 'VBZ'),
  ('a', 'DT'),
  ('probabilistic', 'JJ'),
  ('model', 'NN'),
  ('for', 'IN'),
  ('sequence', 'NN'),
  ('labeling', 'NN'),
  ('tasks', 'NNS'),
  ('.', '.')],
 [('Experimental', 'JJ'),
  ('results', 'NNS'),
  ('demonstrate', 'JJ'),
  ('significant', 'JJ'),
  ('improvements', 'NNS'),
  ('over', 'IN'),
  ('baseline', 'JJ'),
  ('methods', 'NNS'),
  ('.', '.')],
 [('The', 'DT'),
  ('algorithm', 'NN'),
  ('achieves', 'VBZ'),
  ('state', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('art', 'NN'),
  ('performance', 'NN'),
  ('on', 'IN'),
  ('benchmark', 'NN'),
  ('datasets', 'NNS'),
  ('.', '.')],
 [('We', 'PRP'),
  ('analyze', 'VBP'),
  ('the', 'DT'),
  ('convergence', 'NN'),
  ('properties', 'NNS'),
  ('of', 'IN'),
  ('st

In [6]:
# Bigram tag-transition model estimated from `tagged_sentences`.
# transition_counts[a][b] = number of times tag b directly follows tag a;
# tag_denominators[a]     = number of times tag a is followed by anything.
transition_counts = defaultdict(lambda: defaultdict(int))
tag_denominators = defaultdict(int)

for tagged in tagged_sentences:
    # Keep only the tags and pad with the sentence-boundary markers.
    padded = ['<START>', *(pos for _token, pos in tagged), '<END>']

    # Walk over every adjacent (left, right) pair of the padded sequence.
    for left, right in zip(padded, padded[1:]):
        tag_denominators[left] += 1
        transition_counts[left][right] += 1

# Relative-frequency estimate: P(right | left) = count(left, right) / count(left).
transition_probabilities = defaultdict(lambda: defaultdict(float))

for left, followers in transition_counts.items():
    denominator = tag_denominators[left]
    for right, pair_count in followers.items():
        transition_probabilities[left][right] = pair_count / denominator

print("Transition Probabilities:")
for prev_tag, probs in transition_probabilities.items():
    print(f"  {prev_tag}: {probs}")

Transition Probabilities:
  <START>: defaultdict(<class 'float'>, {'PRP': 0.4, 'DT': 0.4, 'JJ': 0.2})
  PRP: defaultdict(<class 'float'>, {'VBP': 1.0})
  VBP: defaultdict(<class 'float'>, {'DT': 1.0})
  DT: defaultdict(<class 'float'>, {'JJ': 0.3333333333333333, 'NN': 0.6666666666666666})
  JJ: defaultdict(<class 'float'>, {'JJ': 0.25, 'NN': 0.375, 'NNS': 0.375})
  NN: defaultdict(<class 'float'>, {'NN': 0.3125, 'IN': 0.25, '.': 0.125, 'VBZ': 0.125, 'NNS': 0.1875})
  IN: defaultdict(<class 'float'>, {'NN': 0.5, 'JJ': 0.3333333333333333, 'DT': 0.16666666666666666})
  .: defaultdict(<class 'float'>, {'<END>': 1.0})
  VBZ: defaultdict(<class 'float'>, {'DT': 0.5, 'NN': 0.5})
  NNS: defaultdict(<class 'float'>, {'.': 0.5, 'JJ': 0.16666666666666666, 'IN': 0.3333333333333333})


In [7]:
# Emission model estimated from `tagged_sentences` (words are kept case-sensitive here).
# emission_counts[tag][word] = number of times `word` was tagged `tag`;
# tag_counts[tag]            = total tokens carrying `tag` (the emission denominator).
emission_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)

for tagged in tagged_sentences:
    for token, pos in tagged:
        emission_counts[pos][token] += 1
        tag_counts[pos] += 1

# Relative-frequency estimate: P(word | tag) = count(tag, word) / count(tag).
emission_probabilities = defaultdict(lambda: defaultdict(float))

for pos, per_word in emission_counts.items():
    denominator = tag_counts[pos]
    for token, pair_count in per_word.items():
        emission_probabilities[pos][token] = pair_count / denominator

print("\nEmission Probabilities:")
for tag, probs in emission_probabilities.items():
    print(f"  {tag}: {probs}")


Emission Probabilities:
  PRP: defaultdict(<class 'float'>, {'We': 1.0})
  VBP: defaultdict(<class 'float'>, {'propose': 0.5, 'analyze': 0.5})
  DT: defaultdict(<class 'float'>, {'a': 0.3333333333333333, 'This': 0.16666666666666666, 'The': 0.16666666666666666, 'the': 0.3333333333333333})
  JJ: defaultdict(<class 'float'>, {'novel': 0.125, 'neural': 0.125, 'probabilistic': 0.125, 'Experimental': 0.125, 'demonstrate': 0.125, 'significant': 0.125, 'baseline': 0.125, 'stochastic': 0.125})
  NN: defaultdict(<class 'float'>, {'network': 0.0625, 'architecture': 0.0625, 'image': 0.0625, 'classification': 0.0625, 'paper': 0.0625, 'model': 0.0625, 'sequence': 0.0625, 'labeling': 0.0625, 'algorithm': 0.0625, 'state': 0.0625, 'art': 0.0625, 'performance': 0.0625, 'benchmark': 0.0625, 'convergence': 0.0625, 'gradient': 0.0625, 'descent': 0.0625})
  IN: defaultdict(<class 'float'>, {'for': 0.3333333333333333, 'over': 0.16666666666666666, 'of': 0.3333333333333333, 'on': 0.16666666666666666})
  .: de

In [10]:
# Counter-based recomputation of the tag-transition model.
# transition_counts[a][b] counts "b follows a"; tag_counts[a] counts how often
# a appears as the left-hand side of a transition (including "<START>").
transition_counts = defaultdict(Counter)
tag_counts = Counter()

for tagged in tagged_sentences:
    # Tag sequence padded with the sentence-boundary markers.
    padded = ["<START>"] + [pos for _token, pos in tagged] + ["<END>"]
    for left, right in zip(padded, padded[1:]):
        transition_counts[left][right] += 1
        tag_counts[left] += 1

# Transition probabilities: plain nested dicts, P(right | left).
transition_probs = {}
for left, followers in transition_counts.items():
    denominator = tag_counts[left]
    transition_probs[left] = {
        right: pair_count / denominator
        for right, pair_count in followers.items()
    }

print("Transition Probabilities:")
for prev_tag, probs in transition_probs.items():
    print(f"  {prev_tag}: {probs}")

Transition Probabilities:
  <START>: {'PRP': 0.4, 'DT': 0.4, 'JJ': 0.2}
  PRP: {'VBP': 1.0}
  VBP: {'DT': 1.0}
  DT: {'JJ': 0.3333333333333333, 'NN': 0.6666666666666666}
  JJ: {'JJ': 0.25, 'NN': 0.375, 'NNS': 0.375}
  NN: {'NN': 0.3125, 'IN': 0.25, '.': 0.125, 'VBZ': 0.125, 'NNS': 0.1875}
  IN: {'NN': 0.5, 'JJ': 0.3333333333333333, 'DT': 0.16666666666666666}
  .: {'<END>': 1.0}
  VBZ: {'DT': 0.5, 'NN': 0.5}
  NNS: {'.': 0.5, 'JJ': 0.16666666666666666, 'IN': 0.3333333333333333}


In [12]:
# Counter-based recomputation of the emission model; words are lower-cased
# before counting, so "The" and "the" share one entry.
emission_counts = defaultdict(Counter)
tag_word_counts = Counter()

for tagged in tagged_sentences:
    for token, pos in tagged:
        lowered = token.lower()
        emission_counts[pos][lowered] += 1
        tag_word_counts[pos] += 1

# Emission probabilities: plain nested dicts, P(word | tag).
emission_probs = {}
for pos, per_word in emission_counts.items():
    denominator = tag_word_counts[pos]
    emission_probs[pos] = {
        lowered: pair_count / denominator
        for lowered, pair_count in per_word.items()
    }

print("\nEmission Probabilities:")
for tag, probs in emission_probs.items():
    print(f"  {tag}: {probs}")


Emission Probabilities:
  PRP: {'we': 1.0}
  VBP: {'propose': 0.5, 'analyze': 0.5}
  DT: {'a': 0.3333333333333333, 'this': 0.16666666666666666, 'the': 0.5}
  JJ: {'novel': 0.125, 'neural': 0.125, 'probabilistic': 0.125, 'experimental': 0.125, 'demonstrate': 0.125, 'significant': 0.125, 'baseline': 0.125, 'stochastic': 0.125}
  NN: {'network': 0.0625, 'architecture': 0.0625, 'image': 0.0625, 'classification': 0.0625, 'paper': 0.0625, 'model': 0.0625, 'sequence': 0.0625, 'labeling': 0.0625, 'algorithm': 0.0625, 'state': 0.0625, 'art': 0.0625, 'performance': 0.0625, 'benchmark': 0.0625, 'convergence': 0.0625, 'gradient': 0.0625, 'descent': 0.0625}
  IN: {'for': 0.3333333333333333, 'over': 0.16666666666666666, 'of': 0.3333333333333333, 'on': 0.16666666666666666}
  .: {'.': 1.0}
  VBZ: {'presents': 0.5, 'achieves': 0.5}
  NNS: {'tasks': 0.16666666666666666, 'results': 0.16666666666666666, 'improvements': 0.16666666666666666, 'methods': 0.16666666666666666, 'datasets': 0.16666666666666666, 

In [13]:
# Flatten the nested transition table into (previous_tag, next_tag, probability)
# triples, then order them from most to least probable. The sort is stable, so
# equally probable transitions keep the order in which they appear in the table.
sorted_transitions = [
    (source_tag, target_tag, probability)
    for source_tag, row in transition_probs.items()
    for target_tag, probability in row.items()
]
sorted_transitions.sort(key=lambda triple: triple[2], reverse=True)

# Show the five most probable transitions.
display(sorted_transitions[:5])

[('PRP', 'VBP', 1.0),
 ('VBP', 'DT', 1.0),
 ('.', '<END>', 1.0),
 ('DT', 'NN', 0.6666666666666666),
 ('IN', 'NN', 0.5)]

In [15]:
def viterbi(words, possible_tags, transition_probs, emission_probs,
            unknown_emission_prob=1e-10, unseen_transition_prob=1e-12):
    """Return the most probable tag sequence for a sentence under a bigram HMM.

    Scores are accumulated as sums of log-probabilities, so long sentences do
    not underflow to zero. Probabilities missing from the tables are replaced
    by small positive floors (this is a fixed floor, not Laplace smoothing), so
    decoding never dead-ends and every token receives a tag from
    ``possible_tags``.

    Args:
        words: The sentence to tag. A string is tokenized with
            ``word_tokenize``; any other iterable is taken as an already
            tokenized sequence of strings. Tokens are lower-cased before the
            emission lookup.
        possible_tags: Iterable of candidate tags. Duplicates are ignored;
            when several tags score equally, the one listed first wins.
        transition_probs: Nested mapping ``{prev_tag: {tag: probability}}``.
            The keys ``'<START>'`` (as a previous tag) and ``'<END>'`` (as a
            next tag) are used for the sentence boundaries.
        emission_probs: Nested mapping ``{tag: {word: probability}}``.
        unknown_emission_prob: Probability used when a (tag, word) pair is
            absent from ``emission_probs``.
        unseen_transition_prob: Probability used when a (prev_tag, tag) pair
            is absent from ``transition_probs``.

    Returns:
        A list of ``(token, tag)`` tuples, one per token, in sentence order.
        An empty sentence yields ``[]``. If ``possible_tags`` is empty, every
        token is paired with ``''``.
    """
    import math  # local import so the function does not rely on another cell

    raw_tokens = word_tokenize(words) if isinstance(words, str) else list(words)
    tokens = [token.lower() for token in raw_tokens]
    if not tokens:
        return []

    # De-duplicate while preserving the caller's order (it decides ties).
    tags = list(dict.fromkeys(possible_tags))
    if not tags:
        return [(token, '') for token in tokens]

    neg_inf = float('-inf')

    def log_prob(table, outer, inner, floor):
        """Log of table[outer][inner], or of `floor` when the entry is absent."""
        prob = table.get(outer, {}).get(inner, floor)
        # A stored (or caller-supplied) zero means "impossible".
        return math.log(prob) if prob > 0 else neg_inf

    # Transition scores do not depend on the position, so compute them once.
    log_trans = {
        prev_tag: {
            current_tag: log_prob(transition_probs, prev_tag, current_tag,
                                  unseen_transition_prob)
            for current_tag in tags
        }
        for prev_tag in tags
    }

    # scores[i][tag]: best log-probability of any tag path ending in `tag` at
    # token i. backpointers[i][tag]: the previous tag on that best path.
    scores = [{
        tag: log_prob(transition_probs, '<START>', tag, unseen_transition_prob)
        + log_prob(emission_probs, tag, tokens[0], unknown_emission_prob)
        for tag in tags
    }]
    backpointers = [dict.fromkeys(tags, '<START>')]

    for token in tokens[1:]:
        prev_scores = scores[-1]
        row, back = {}, {}
        for current_tag in tags:
            # The emission term is the same for every predecessor, so it does
            # not influence which predecessor is best.
            best_prev_tag = max(
                tags,
                key=lambda prev_tag: prev_scores[prev_tag] + log_trans[prev_tag][current_tag],
            )
            row[current_tag] = (
                prev_scores[best_prev_tag]
                + log_trans[best_prev_tag][current_tag]
                + log_prob(emission_probs, current_tag, token, unknown_emission_prob)
            )
            back[current_tag] = best_prev_tag
        scores.append(row)
        backpointers.append(back)

    # Close the path with the transition into <END>.
    final_scores = scores[-1]
    best_last_tag = max(
        tags,
        key=lambda tag: final_scores[tag]
        + log_prob(transition_probs, tag, '<END>', unseen_transition_prob),
    )

    # Follow the backpointers from the last token to the first, then reverse.
    path = [best_last_tag]
    for i in range(len(tokens) - 1, 0, -1):
        path.append(backpointers[i][path[-1]])
    path.reverse()

    return list(zip(tokens, path))

# Sentence to tag. It is assigned here because this cell reads it, and the
# notebook otherwise only assigns it in a cell placed after this one, which
# raises NameError on Restart & Run All.
input_sentence = "The proposed method improves classification accuracy."

# Every tag observed in the training data is a candidate state for decoding.
# A comprehension is used so no loop variable (in particular `_`, which IPython
# uses for the last output) is rebound at notebook scope.
possible_tags = {tag for sentence in tagged_sentences for _, tag in sentence}

# Run Viterbi algorithm. The tags are passed in sorted order so that ties are
# broken the same way on every run; iterating a set of strings directly can
# change order between kernel sessions.
predicted_tags = viterbi(input_sentence, sorted(possible_tags), transition_probs, emission_probs)

# Format the output as word/TAG pairs separated by spaces.
output_string = " ".join(f"{word}/{tag.upper()}" for word, tag in predicted_tags)
print(f"Output (Predicted POS Tags):\n{output_string}")

Output (Predicted POS Tags):
the/DT proposed/NN method/VBZ improves/DT classification/NN accuracy/NNS ./.


In [14]:
# Demo sentence that the Viterbi tagger is asked to label; echoed so the
# reader can compare it with the predicted tags.
input_sentence = "The proposed method improves classification accuracy."
input_banner = f"Input Sentence: {input_sentence}"
print(input_banner)

Input Sentence: The proposed method improves classification accuracy.
