<a href="https://colab.research.google.com/github/A-varshith/NLP_LAB/blob/main/NLP_LAB6_2403A52024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# NLP Assignment 6 – HMM-based POS Tagging on Tweets (Dataset-based)

**Tasks Covered**
- Load tweet dataset and preprocess text  
- POS tagging using NLTK  
- Build HMM parameters  
- Analyze transition irregularities  
- Analyze rare/unknown tokens  
- Manual Viterbi decoding for one tweet  


In [1]:

import pandas as pd
import nltk
import re
from collections import defaultdict, Counter

# Fetch every NLTK resource the notebook needs in one place, so a fresh
# kernel is fully set up before any tokenizing or tagging happens.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the names the
# POS-tagging section of this notebook downloads before calling the
# tokenizer and tagger; the two legacy names are kept for older NLTK releases.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Load Dataset

In [2]:

# Read the uploaded tweet CSV (path is relative to the working directory)
# into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Twitter_Data.csv")
df.head(n=5)


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0



## Select Tweet Text Column

The code automatically selects the first string-typed column of the DataFrame (here `clean_text`) as the tweet text.


In [3]:

# Automatically detect a text column: take the first string-typed column.
# Both 'object' and 'string' are requested so the lookup still works when
# text columns use pandas' dedicated string dtype instead of Python objects.
text_candidates = df.select_dtypes(include=['object', 'string']).columns
if len(text_candidates) == 0:
    # Fail with an explicit message instead of a bare IndexError on columns[0].
    raise ValueError(
        "No text column found in the dataset; columns are: "
        + ", ".join(map(str, df.columns))
    )
text_col = text_candidates[0]
print("Using text column:", text_col)

# Drop missing tweets and coerce the rest to plain Python strings.
tweets = df[text_col].dropna().astype(str).tolist()
tweets[:5]


Using text column: clean_text


['when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples',
 'talk all the nonsense and continue all the drama will vote for modi ',
 'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax',
 'asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars',
 'answer who among these the most powerful world leader today trump putin modi may ']

## Preprocess Tweets

In [4]:

def preprocess(tweet):
    """Return `tweet` with URLs and @mentions removed and outer whitespace trimmed.

    Each match is replaced by the empty string, so any spaces that surrounded
    a removed URL or mention are left in place inside the text.
    """
    cleaned = tweet
    # Order matters only for readability: URLs first, then @mentions.
    for pattern in (r"http\S+", r"@\w+"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Clean only the first 100 tweets to keep the later tagging step fast,
# then preview the first five results.
clean_tweets = list(map(preprocess, tweets[:100]))
clean_tweets[:5]


['when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples',
 'talk all the nonsense and continue all the drama will vote for modi',
 'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax',
 'asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars',
 'answer who among these the most powerful world leader today trump putin modi may']

## POS Tagging using NLTK

In [5]:
# Resources required by the tokenizer and tagger calls below.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Split each cleaned tweet into word tokens.
tokenized = [nltk.word_tokenize(t) for t in clean_tweets]

# Tag all tweets with one batch call instead of one pos_tag call per tweet,
# so the tagger is set up once for the whole list. The result has the same
# shape as before: one list of (token, tag) pairs per tweet.
tagged_tweets = nltk.pos_tag_sents(tokenized)
tagged_tweets[:3]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[[('when', 'WRB'),
  ('modi', 'NN'),
  ('promised', 'VBD'),
  ('“', 'NNP'),
  ('minimum', 'JJ'),
  ('government', 'NN'),
  ('maximum', 'JJ'),
  ('governance', 'NN'),
  ('”', 'NNP'),
  ('expected', 'VBD'),
  ('him', 'PRP'),
  ('begin', 'VB'),
  ('the', 'DT'),
  ('difficult', 'JJ'),
  ('job', 'NN'),
  ('reforming', 'VBG'),
  ('the', 'DT'),
  ('state', 'NN'),
  ('why', 'WRB'),
  ('does', 'VBZ'),
  ('take', 'VB'),
  ('years', 'NNS'),
  ('get', 'VB'),
  ('justice', 'NN'),
  ('state', 'NN'),
  ('should', 'MD'),
  ('and', 'CC'),
  ('not', 'RB'),
  ('business', 'NN'),
  ('and', 'CC'),
  ('should', 'MD'),
  ('exit', 'VB'),
  ('psus', 'NN'),
  ('and', 'CC'),
  ('temples', 'NNS')],
 [('talk', 'NN'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('nonsense', 'NN'),
  ('and', 'CC'),
  ('continue', 'VB'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('drama', 'NN'),
  ('will', 'MD'),
  ('vote', 'VB'),
  ('for', 'IN'),
  ('modi', 'NN')],
 [('what', 'WP'),
  ('did', 'VBD'),
  ('just', 'RB'),
  ('say', 'VB'),
  ('vote'

## Build HMM Parameters

In [6]:

# Raw counts from which the HMM parameters are estimated:
#   transition_counts[prev_tag][tag] -> how often `tag` follows `prev_tag`
#   emission_counts[tag][word]       -> how often `tag` is assigned to lower-cased `word`
#   tag_counts[tag]                  -> total occurrences of `tag`
transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
tag_counts = Counter()

for tagged in tagged_tweets:
    tag_seq = [pos for _, pos in tagged]
    # Pad with the synthetic boundary states so sentence starts and ends are
    # counted as transitions too.
    padded = ['<START>'] + tag_seq + ['<END>']
    for src, dst in zip(padded, padded[1:]):
        transition_counts[src][dst] += 1
    for token, pos in tagged:
        emission_counts[pos][token.lower()] += 1
    tag_counts.update(tag_seq)

transition_counts


defaultdict(collections.Counter,
            {'<START>': Counter({'WRB': 3,
                      'NN': 36,
                      'WP': 2,
                      'VBG': 4,
                      'JJ': 9,
                      'DT': 6,
                      'IN': 9,
                      'NNS': 6,
                      'CD': 3,
                      'VBD': 3,
                      'VBZ': 1,
                      'RB': 7,
                      'VB': 3,
                      'PRP$': 1,
                      'JJR': 1,
                      'MD': 2,
                      'PRP': 1,
                      'VBN': 3}),
             'WRB': Counter({'NN': 4,
                      'VBZ': 2,
                      'JJS': 1,
                      'JJ': 3,
                      'VBN': 1,
                      'VBP': 2,
                      'PRP': 2,
                      'RB': 2,
                      'VB': 1,
                      'DT': 1,
                      'NNS': 1}),
             'NN': Counter({'

## Transition Probability Analysis

In [7]:

# Normalise each row of transition counts into a probability distribution:
# P(next_tag | src_tag) = count(src_tag -> next_tag) / count(src_tag -> anything).
transition_probs = {}

for src_tag, successors in transition_counts.items():
    outgoing = sum(successors.values())
    transition_probs[src_tag] = {
        next_tag: n / outgoing for next_tag, n in successors.items()
    }

transition_probs


{'<START>': {'WRB': 0.03,
  'NN': 0.36,
  'WP': 0.02,
  'VBG': 0.04,
  'JJ': 0.09,
  'DT': 0.06,
  'IN': 0.09,
  'NNS': 0.06,
  'CD': 0.03,
  'VBD': 0.03,
  'VBZ': 0.01,
  'RB': 0.07,
  'VB': 0.03,
  'PRP$': 0.01,
  'JJR': 0.01,
  'MD': 0.02,
  'PRP': 0.01,
  'VBN': 0.03},
 'WRB': {'NN': 0.2,
  'VBZ': 0.1,
  'JJS': 0.05,
  'JJ': 0.15,
  'VBN': 0.05,
  'VBP': 0.1,
  'PRP': 0.1,
  'RB': 0.1,
  'VB': 0.05,
  'DT': 0.05,
  'NNS': 0.05},
 'NN': {'VBD': 0.03488372093023256,
  'JJ': 0.053156146179401995,
  'NNP': 0.0049833887043189366,
  'VBG': 0.036544850498338874,
  'WRB': 0.009966777408637873,
  'NN': 0.313953488372093,
  'MD': 0.029900332225913623,
  'CC': 0.05813953488372093,
  'PDT': 0.0033222591362126247,
  '<END>': 0.09966777408637874,
  'IN': 0.08305647840531562,
  'VBP': 0.03156146179401993,
  'RB': 0.03820598006644518,
  'WP': 0.009966777408637873,
  'WDT': 0.0033222591362126247,
  'VBZ': 0.03488372093023256,
  'DT': 0.03488372093023256,
  'NNS': 0.053156146179401995,
  'CD': 0.008

## Rare and Unknown Token Analysis

In [8]:

# Frequency of every lower-cased token across all tokenized tweets.
word_freq = Counter(
    token.lower() for tweet_tokens in tokenized for token in tweet_tokens
)

# Tokens seen exactly once, kept in order of first appearance.
rare_tokens = [token for token, n_seen in word_freq.items() if n_seen == 1]
rare_tokens[:20]


['“',
 'minimum',
 'maximum',
 '”',
 'expected',
 'begin',
 'difficult',
 'reforming',
 'business',
 'temples',
 'nonsense',
 'continue',
 'drama',
 'welcome',
 'told',
 'main',
 'campaigner',
 'relax',
 'asking',
 'prefix']

## Manual Viterbi Decoding (One Tweet)

In [9]:

# Decode the first cleaned tweet with a hand-written Viterbi pass over the
# HMM estimated above, then follow the backpointers to recover the full
# tag sequence (not just the final tag).
test_tweet = nltk.word_tokenize(clean_tweets[0])
if not test_tweet:
    raise ValueError("Cannot run Viterbi decoding on an empty tweet.")
tags = list(tag_counts.keys())

# Floor substituted for unseen transitions and unseen (tag, word) pairs so
# that no path is ever assigned probability zero.
UNSEEN_FLOOR = 0.0001


def _transition_prob(prev_tag, curr_tag):
    """Return P(curr_tag | prev_tag), or UNSEEN_FLOOR if the pair was never counted."""
    return transition_probs.get(prev_tag, {}).get(curr_tag, UNSEEN_FLOOR)


def _emission_prob(tag, word):
    """Return count(tag, word) / count(tag) for the lower-cased word.

    For an unseen pair the floor stands in for the *count*, so it is still
    divided by count(tag).
    NOTE(review): this penalises frequent tags more for unknown words, unlike
    the transition floor which is used directly as a probability - confirm
    that this asymmetry is intended.
    """
    return emission_counts[tag].get(word.lower(), UNSEEN_FLOOR) / tag_counts[tag]


# viterbi[t][tag]     -> probability of the best path ending in `tag` at position t
# backpointer[t][tag] -> previous tag on that best path (None at t == 0)
# Plain probabilities are multiplied here; for much longer inputs the
# products could underflow and log-probabilities would be needed instead.
viterbi = [{}]
backpointer = [{}]

# Initialization: every path leaves the synthetic <START> state.
for tag in tags:
    viterbi[0][tag] = _transition_prob('<START>', tag) * _emission_prob(tag, test_tweet[0])
    backpointer[0][tag] = None

# Recursion: extend the best path into each tag, one token at a time.
for t in range(1, len(test_tweet)):
    viterbi.append({})
    backpointer.append({})
    for curr_tag in tags:
        # The emission term does not depend on prev_tag, so compute it once.
        emit_prob = _emission_prob(curr_tag, test_tweet[t])
        max_prob, best_prev = max(
            (
                viterbi[t - 1][prev_tag] * _transition_prob(prev_tag, curr_tag) * emit_prob,
                prev_tag,
            )
            for prev_tag in tags
        )
        viterbi[t][curr_tag] = max_prob
        backpointer[t][curr_tag] = best_prev

# Termination: pick the most probable tag for the last token.
# NOTE(review): the '<END>' transitions counted during training are not used
# here; multiplying by P('<END>' | tag) would make termination consistent
# with the estimated model.
best_final_tag = max(viterbi[-1], key=viterbi[-1].get)

# Backtrace: walk the backpointers from the last token to the first.
best_path = [best_final_tag]
for t in range(len(test_tweet) - 1, 0, -1):
    best_path.append(backpointer[t][best_path[-1]])
best_path.reverse()

decoded_tweet = list(zip(test_tweet, best_path))
print(" ".join(f"{word}/{tag}" for word, tag in decoded_tweet))

best_final_tag


'NNS'


## Result

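This notebook demonstrates HMM-based POS tagging on a real tweet dataset,
including preprocessing, parameter estimation, irregularity analysis,
and manual Viterbi decoding. The HMM parameters are estimated from the tags
that NLTK's pre-trained tagger assigns to the first 100 tweets, so they
reflect that tagger's output rather than hand-annotated labels.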
