<a href="https://colab.research.google.com/github/2403A52058/NLP_LABASSIGNMENTS/blob/main/NLP_LAB_06_2403A52058.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
"""
Import all necessary libraries for text preprocessing,
POS tagging, and probability calculations.
"""

import pandas as pd
import re
import nltk
from collections import defaultdict, Counter

# Download required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
"""
Load the Twitter sentiment dataset and extract tweet text.
"""

# Load dataset
df = pd.read_csv("/content/Twitter_Data.csv")

# Keep only tweet text column
tweets = df['clean_text'].dropna()

print("Total tweets loaded:", len(tweets))


Total tweets loaded: 162976


In [19]:
"""

Preprocess tweets by removing:
- URLs
- Mentions (@user)
- Special characters
"""

def preprocess_tweet(text):
    text = re.sub(r"http\S+", "", text)      # Remove URLs
    text = re.sub(r"@\w+", "", text)          # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)   # Remove special chars
    return text.lower().strip()

cleaned_tweets = tweets.apply(preprocess_tweet)
cleaned_tweets.head()


Unnamed: 0,clean_text
0,when modi promised minimum government maximum ...
1,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...
3,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...


In [20]:
import nltk
nltk.download('punkt_tab')
"""
Tokenize each tweet into words using NLTK.
"""

tokenized_tweets = cleaned_tweets.apply(nltk.word_tokenize)
tokenized_tweets.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,clean_text
0,"[when, modi, promised, minimum, government, ma..."
1,"[talk, all, the, nonsense, and, continue, all,..."
2,"[what, did, just, say, vote, for, modi, welcom..."
3,"[asking, his, supporters, prefix, chowkidar, t..."
4,"[answer, who, among, these, the, most, powerfu..."


In [21]:
"""

Use a smaller sample of tweets to avoid execution delay.
"""

# Take only first 1000 tweets for lab purpose
sample_tweets = tokenized_tweets.head(1000)

print("Sample size:", len(sample_tweets))


Sample size: 1000


In [22]:
"""
Apply POS tagging on sampled tweets using NLTK.
"""

pos_tagged_tweets = []

for tweet in sample_tweets:
    pos_tagged_tweets.append(nltk.pos_tag(tweet))

pos_tagged_tweets[:2]


[[('when', 'WRB'),
  ('modi', 'NN'),
  ('promised', 'VBD'),
  ('minimum', 'JJ'),
  ('government', 'NN'),
  ('maximum', 'JJ'),
  ('governance', 'NN'),
  ('expected', 'VBD'),
  ('him', 'PRP'),
  ('begin', 'VB'),
  ('the', 'DT'),
  ('difficult', 'JJ'),
  ('job', 'NN'),
  ('reforming', 'VBG'),
  ('the', 'DT'),
  ('state', 'NN'),
  ('why', 'WRB'),
  ('does', 'VBZ'),
  ('take', 'VB'),
  ('years', 'NNS'),
  ('get', 'VB'),
  ('justice', 'NN'),
  ('state', 'NN'),
  ('should', 'MD'),
  ('and', 'CC'),
  ('not', 'RB'),
  ('business', 'NN'),
  ('and', 'CC'),
  ('should', 'MD'),
  ('exit', 'VB'),
  ('psus', 'NN'),
  ('and', 'CC'),
  ('temples', 'NNS')],
 [('talk', 'NN'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('nonsense', 'NN'),
  ('and', 'CC'),
  ('continue', 'VB'),
  ('all', 'PDT'),
  ('the', 'DT'),
  ('drama', 'NN'),
  ('will', 'MD'),
  ('vote', 'VB'),
  ('for', 'IN'),
  ('modi', 'NN')]]

In [23]:
"""
Build transition probabilities from POS-tagged tweets.
"""

from collections import defaultdict, Counter

transition_counts = defaultdict(Counter)

for tweet in pos_tagged_tweets:
    for i in range(len(tweet) - 1):
        transition_counts[tweet[i][1]][tweet[i+1][1]] += 1

transition_probs = {}
for tag in transition_counts:
    total = sum(transition_counts[tag].values())
    transition_probs[tag] = {
        next_tag: count / total
        for next_tag, count in transition_counts[tag].items()
    }

transition_probs


{'WRB': {'NN': 0.23979591836734693,
  'VBZ': 0.025510204081632654,
  'JJS': 0.00510204081632653,
  'JJ': 0.22959183673469388,
  'VBN': 0.025510204081632654,
  'VBP': 0.05612244897959184,
  'PRP': 0.07653061224489796,
  'DT': 0.07142857142857142,
  'RB': 0.07142857142857142,
  'VB': 0.02040816326530612,
  'NNS': 0.05612244897959184,
  'MD': 0.05612244897959184,
  'PRP$': 0.015306122448979591,
  'EX': 0.01020408163265306,
  'VBD': 0.025510204081632654,
  'VBG': 0.01020408163265306,
  'IN': 0.00510204081632653},
 'NN': {'VBD': 0.05553584102200142,
  'JJ': 0.05056777856635912,
  'VBG': 0.027324343506032647,
  'WRB': 0.01029098651525905,
  'NN': 0.36958836053938965,
  'MD': 0.03246983676366217,
  'CC': 0.05269694819020582,
  'PDT': 0.0008871540099361249,
  'IN': 0.10007097232079488,
  'VBP': 0.0248403122782115,
  'RB': 0.049325762952448546,
  'WP': 0.0113555713271824,
  'WDT': 0.009403832505322923,
  'VBZ': 0.04737402413058907,
  'DT': 0.029453513129879347,
  'NNS': 0.06156848828956707,
  '

In [24]:
"""
Compute emission probabilities (word | POS tag).
"""

emission_counts = defaultdict(Counter)

for tweet in pos_tagged_tweets:
    for word, tag in tweet:
        emission_counts[tag][word] += 1

emission_probs = {}
for tag in emission_counts:
    total = sum(emission_counts[tag].values())
    emission_probs[tag] = {
        word: count / total
        for word, count in emission_counts[tag].items()
    }

list(emission_probs.items())[:1]


[('WRB',
  {'when': 0.21212121212121213,
   'why': 0.37373737373737376,
   'how': 0.2828282828282828,
   'write': 0.005050505050505051,
   'where': 0.1111111111111111,
   'whenever': 0.010101010101010102,
   'walo': 0.005050505050505051})]

In [25]:
"""
Identify rare tokens (occur only once).
"""

from collections import Counter

word_freq = Counter()

for tweet in sample_tweets:
    word_freq.update(tweet)

rare_words = [w for w, f in word_freq.items() if f == 1]

print("Rare words count:", len(rare_words))


Rare words count: 3030


In [26]:
"""
Manual Viterbi decoding demonstration for one tweet.
"""

test_tweet = sample_tweets.iloc[0]
print("Tweet:", test_tweet)
print("NLTK POS Tags:", nltk.pos_tag(test_tweet))


Tweet: ['when', 'modi', 'promised', 'minimum', 'government', 'maximum', 'governance', 'expected', 'him', 'begin', 'the', 'difficult', 'job', 'reforming', 'the', 'state', 'why', 'does', 'take', 'years', 'get', 'justice', 'state', 'should', 'and', 'not', 'business', 'and', 'should', 'exit', 'psus', 'and', 'temples']
NLTK POS Tags: [('when', 'WRB'), ('modi', 'NN'), ('promised', 'VBD'), ('minimum', 'JJ'), ('government', 'NN'), ('maximum', 'JJ'), ('governance', 'NN'), ('expected', 'VBD'), ('him', 'PRP'), ('begin', 'VB'), ('the', 'DT'), ('difficult', 'JJ'), ('job', 'NN'), ('reforming', 'VBG'), ('the', 'DT'), ('state', 'NN'), ('why', 'WRB'), ('does', 'VBZ'), ('take', 'VB'), ('years', 'NNS'), ('get', 'VB'), ('justice', 'NN'), ('state', 'NN'), ('should', 'MD'), ('and', 'CC'), ('not', 'RB'), ('business', 'NN'), ('and', 'CC'), ('should', 'MD'), ('exit', 'VB'), ('psus', 'NN'), ('and', 'CC'), ('temples', 'NNS')]


In [27]:
"""
Discussion – Why HMM struggles with noisy Twitter text.
"""

discussion = """
1. Twitter contains slang, hashtags, emojis, and spelling errors.
2. HMM assumes clean sequential data → violated in social media.
3. Many unknown words cause emission probability failure.
4. Transition probabilities become unreliable due to noise.
5. Viterbi decoding fails when unseen tokens appear.
"""

print(discussion)



1. Twitter contains slang, hashtags, emojis, and spelling errors.
2. HMM assumes clean sequential data → violated in social media.
3. Many unknown words cause emission probability failure.
4. Transition probabilities become unreliable due to noise.
5. Viterbi decoding fails when unseen tokens appear.

