<a href="https://colab.research.google.com/github/Darshan235/NLP/blob/main/HMM_Tweet_POS_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# HMM POS Tagging on Tweets (NLTK)

This notebook:
- Loads tweet data
- Preprocesses text (removes URLs, mentions)
- POS-tags tweets using NLTK
- Builds HMM parameters
- Analyzes transition probabilities and rare tokens
- Demonstrates Viterbi decoding manually
- Discusses limitations of HMMs on social media text


In [17]:
import re
import nltk
from collections import defaultdict, Counter
import pandas as pd

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:

# Load tweets
df = pd.read_csv('/content/Twitter_Data.csv')
tweets = df.iloc[:,0].dropna().tolist()
tweets[:5]


['when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples',
 'talk all the nonsense and continue all the drama will vote for modi ',
 'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax',
 'asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars',
 'answer who among these the most powerful world leader today trump putin modi may ']

In [11]:
# Preprocessing: remove URLs and mentions
def preprocess(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    return tweet.strip()

clean_tweets = [preprocess(t) for t in tweets]
clean_tweets[:5]


['when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples',
 'talk all the nonsense and continue all the drama will vote for modi',
 'what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax',
 'asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars',
 'answer who among these the most powerful world leader today trump putin modi may']

In [19]:

# POS tagging
tagged_tweets = [nltk.pos_tag(nltk.word_tokenize(t)) for t in clean_tweets]
tagged_tweets[:2]


[[('when', 'WRB'),
  ('modi', 'NN'),
  ('promised', 'VBD'),
  ('“', 'NNP'),
  ('minimum', 'JJ'),
  ('government', 'NN'),
  ('maximum', 'JJ'),
  ('governance', 'NN'),
  ('”', 'NNP'),
  ('expected', 'VBD'),
  ('him', 'PRP'),
  ('begin', 'VB'),
  ('the', 'DT'),
  ('difficult', 'JJ'),
  ('job', 'NN'),
  ('reforming', 'VBG'),
  ('the', 'DT'),
  ('state', 'NN'),
  ('why', 'WRB'),
  ('does', 'VBZ'),
  ('take', 'VB'),
  ('years', 'NNS'),
  ('get', 'VB'),
  ('justice', 'NN'),
  ('state', 'NN'),
  ('should', 'MD'),
  ('and', 'CC'),
  ('not', 'RB'),
  ('business', 'NN'),
  ('and', 'CC'),
  ('should', 'MD'),
  ('exit', 'VB'),
  ('psus', 'NN'),
  ('and', 'CC'),
  ('temples', 'NNS')],
 [('talk', 'NN'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('nonsense', 'NN'),
  ('and', 'CC'),
  ('continue', 'VB'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('drama', 'NN'),
  ('will', 'MD'),
  ('vote', 'VB'),
  ('for', 'IN'),
  ('modi', 'NN')]]

In [20]:

# Build HMM parameters
transition = defaultdict(Counter)
emission = defaultdict(Counter)
tag_counts = Counter()

for sent in tagged_tweets:
    prev_tag = '<START>'
    for word, tag in sent:
        transition[prev_tag][tag] += 1
        emission[tag][word.lower()] += 1
        tag_counts[tag] += 1
        prev_tag = tag
    transition[prev_tag]['<END>'] += 1

transition_prob = {
    t1: {t2: c/sum(cnt.values()) for t2, c in cnt.items()}
    for t1, cnt in transition.items()
}

emission_prob = {
    tag: {w: c/tag_counts[tag] for w, c in cnt.items()}
    for tag, cnt in emission.items()
}


## HMM Parameter Snapshot

In [21]:

list(transition_prob.items())[:3]


[('<START>',
  {'WRB': 0.03084503239740821,
   'NN': 0.3353929412919694,
   'WP': 0.017517916748478305,
   'VBG': 0.01973910268996662,
   'JJ': 0.13079226389161594,
   'DT': 0.07340344590614568,
   'IN': 0.03366139799725113,
   'NNS': 0.06966669939132142,
   'CD': 0.01413705085411349,
   'VBD': 0.01265216964461025,
   'VBZ': 0.008387738071863342,
   'RB': 0.09840712742980562,
   'VB': 0.04271181032790104,
   'PRP$': 0.015591252699784017,
   'JJR': 0.0007424406047516199,
   'MD': 0.018573286864323583,
   'PRP': 0.02864225407421952,
   'VBN': 0.01220425093265266,
   'VBP': 0.006651286078931867,
   'CC': 0.01645027488709994,
   'EX': 0.0025341154525819753,
   'RBR': 0.001607598664834086,
   'JJS': 0.0034422246220302375,
   'UH': 0.0005767720400549774,
   'WDT': 0.002877724327508345,
   'RBS': 0.0006749460043196544,
   'PDT': 0.001902120557628117,
   'FW': 9.20380914981347e-05,
   'WP$': 8.590221873159239e-05,
   'NNP': 3.067936383271157e-05,
   '<END>': 6.135872766542313e-06}),
 ('WRB',
 

In [25]:

list(emission_prob.items())[:3]


[('WRB',
  {'when': 0.262271086100901,
   'why': 0.3527957583840224,
   'how': 0.25147187977751034,
   'where': 0.1224669030348372,
   'whenever': 0.005269492242136421,
   'walo': 6.505545977946199e-05,
   'mover': 3.2527729889730993e-05,
   'wont': 0.0008131932472432749,
   'whereever': 6.505545977946199e-05,
   'wow': 0.00013011091955892397,
   'waiver': 0.00019516637933838596,
   'wasn': 3.2527729889730993e-05,
   'wld': 3.2527729889730993e-05,
   'wiselythe': 3.2527729889730993e-05,
   'wherever': 0.00035780502878704094,
   'write': 9.758318966919298e-05,
   'whatsapp': 3.2527729889730993e-05,
   'wasnt': 0.000292749569007579,
   'wan': 0.00026022183911784795,
   'won': 6.505545977946199e-05,
   'withot': 3.2527729889730993e-05,
   'wil': 0.00016263864944865498,
   'wait': 6.505545977946199e-05,
   'wali': 3.2527729889730993e-05,
   'whatsoever': 6.505545977946199e-05,
   'wud': 0.00013011091955892397,
   'wicket': 3.2527729889730993e-05,
   'wada': 3.2527729889730993e-05,
   'whos

## Viterbi Decoding (Manual Example)

In [23]:

sentence = "I love NLP"
tokens = sentence.split()
states = list(tag_counts.keys())

V = [{}]
path = {}

for state in states:
    V[0][state] = transition_prob.get('<START>',{}).get(state,0) *                   emission_prob.get(state,{}).get(tokens[0].lower(),1e-6)
    path[state] = [state]

for t in range(1, len(tokens)):
    V.append({})
    new_path = {}
    for curr in states:
        prob, prev = max(
            (V[t-1][p] * transition_prob.get(p,{}).get(curr,0) *
             emission_prob.get(curr,{}).get(tokens[t].lower(),1e-6), p)
            for p in states
        )
        V[t][curr] = prob
        new_path[curr] = path[prev] + [curr]
    path = new_path

final_state = max(V[-1], key=V[-1].get)
list(zip(tokens, path[final_state]))


[('I', 'NN'), ('love', 'NN'), ('NLP', 'NN')]


## Discussion Notes

- Transition irregularities due to informal grammar
- Rare/unknown tokens: emojis, slang, hashtags
- HMM limitations:
  - Assumes Markov property
  - No global context
  - Poor handling of noisy text
