In [44]:
#STEP 1: Install and Import Everything (Run first cell)
!pip install -U nltk pandas




In [45]:
import pandas as pd
import re
import sys

# Clear any previously loaded NLTK modules from sys.modules to force a fresh import
for module_name in list(sys.modules.keys()):
    if module_name.startswith('nltk'):
        try:
            del sys.modules[module_name]
        except KeyError:
            pass

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab') # Added to download the missing resource
nltk.download('averaged_perceptron_tagger_eng') # Added to download the specific English tagger

print("Setup complete")

Setup complete


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [46]:
#STEP 2: Load Dataset Safely (works even if column name differs)
data = pd.read_csv("/content/Twitter_Data.csv")

print("Columns in dataset:", data.columns)

# Take first column automatically (safe method)
tweets = data.iloc[:, 0].dropna().head(200)

print("Sample tweets:")
print(tweets.head())


Columns in dataset: Index(['clean_text', 'category'], dtype='object')
Sample tweets:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: clean_text, dtype: object


In [47]:
#STEP 3: Preprocess Tweets (no errors)
def clean_tweet(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    return text.strip()

cleaned = tweets.apply(clean_tweet)

print("\nCleaned tweets:")
print(cleaned.head())


Cleaned tweets:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: clean_text, dtype: object


In [48]:
#STEP 4: POS Tagging using NLTK (guaranteed working)
tagged_sentences = []

for tweet in cleaned:
    tokens = nltk.word_tokenize(tweet)
    tagged = nltk.pos_tag(tokens)
    tagged_sentences.append(tagged)

print("\nFirst POS-tagged tweet:")
print(tagged_sentences[0])


First POS-tagged tweet:
[('when', 'WRB'), ('modi', 'NN'), ('promised', 'VBD'), ('“', 'NNP'), ('minimum', 'JJ'), ('government', 'NN'), ('maximum', 'JJ'), ('governance', 'NN'), ('”', 'NNP'), ('expected', 'VBD'), ('him', 'PRP'), ('begin', 'VB'), ('the', 'DT'), ('difficult', 'JJ'), ('job', 'NN'), ('reforming', 'VBG'), ('the', 'DT'), ('state', 'NN'), ('why', 'WRB'), ('does', 'VBZ'), ('take', 'VB'), ('years', 'NNS'), ('get', 'VB'), ('justice', 'NN'), ('state', 'NN'), ('should', 'MD'), ('and', 'CC'), ('not', 'RB'), ('business', 'NN'), ('and', 'CC'), ('should', 'MD'), ('exit', 'VB'), ('psus', 'NN'), ('and', 'CC'), ('temples', 'NNS')]


In [49]:
#STEP 5: Build HMM Transition & Emission Counts
from collections import defaultdict, Counter

transition = defaultdict(Counter)
emission = defaultdict(Counter)

for sent in tagged_sentences:
    prev_tag = "<s>"
    for word, tag in sent:
        transition[prev_tag][tag] += 1
        emission[tag][word.lower()] += 1
        prev_tag = tag

print("\nSample Transition Probabilities:")
print(dict(list(transition.items())[:3]))

print("\nSample Emission Probabilities:")
print(dict(list(emission.items())[:3]))


Sample Transition Probabilities:
{'<s>': Counter({'NN': 76, 'JJ': 22, 'RB': 20, 'DT': 12, 'IN': 12, 'NNS': 11, 'WRB': 9, 'VBG': 7, 'VB': 4, 'VBN': 4, 'WP': 3, 'CD': 3, 'VBD': 3, 'PRP$': 3, 'MD': 3, 'VBZ': 2, 'PRP': 2, 'VBP': 2, 'JJR': 1, 'CC': 1}), 'WRB': Counter({'NN': 6, 'JJ': 5, 'VBP': 3, 'RB': 3, 'VBZ': 2, 'VBN': 2, 'PRP': 2, 'VB': 2, 'DT': 2, 'JJS': 1, 'NNS': 1, 'MD': 1, 'PRP$': 1}), 'NN': Counter({'NN': 390, 'IN': 113, 'JJ': 67, 'NNS': 66, 'CC': 59, 'VBD': 55, 'RB': 49, 'VBZ': 38, 'VBP': 36, 'DT': 36, 'VBG': 32, 'MD': 32, 'PRP': 22, 'WP': 13, 'VB': 13, 'NNP': 10, 'WRB': 9, 'VBN': 9, 'CD': 7, 'FW': 6, 'PRP$': 6, 'WDT': 5, 'PDT': 3, 'RP': 2, 'EX': 2, 'RBR': 1, 'WP$': 1, 'JJR': 1})}

Sample Emission Probabilities:
{'WRB': Counter({'why': 13, 'when': 8, 'how': 5, 'where': 5}), 'NN': Counter({'modi': 124, 'india': 28, 'vote': 27, 'bjp': 14, 'time': 11, 'election': 11, 'country': 11, 'congress': 9, 'government': 8, 'chowkidar': 8, 'everyone': 8, 'minister': 7, 'nation': 7, 'corruption

## Analyze Transition Probability Irregularities

### Subtask:
Examine the 'transition' dictionary to identify any unusual or unexpected sequences of POS tags that might indicate irregularities in the learned grammar from the social media text.


In [50]:
transition_probabilities = defaultdict(dict)

for prev_tag, next_tags_counter in transition.items():
    total_count = sum(next_tags_counter.values())
    for next_tag, count in next_tags_counter.items():
        transition_probabilities[prev_tag][next_tag] = count / total_count if total_count > 0 else 0

print("\nCalculated Transition Probabilities (first 5 previous tags):")
# Sort the inner dictionaries by probability in descending order for better analysis
for prev_tag, next_probs in list(transition_probabilities.items())[:5]:
    sorted_next_probs = dict(sorted(next_probs.items(), key=lambda item: item[1], reverse=True))
    print(f"  {prev_tag}: {dict(list(sorted_next_probs.items())[:5])}") # Print top 5 for each prev_tag



Calculated Transition Probabilities (first 5 previous tags):
  <s>: {'NN': 0.38, 'JJ': 0.11, 'RB': 0.1, 'DT': 0.06, 'IN': 0.06}
  WRB: {'NN': 0.1935483870967742, 'JJ': 0.16129032258064516, 'VBP': 0.0967741935483871, 'RB': 0.0967741935483871, 'VBZ': 0.06451612903225806}
  NN: {'NN': 0.3601108033240997, 'IN': 0.10433979686057249, 'JJ': 0.061865189289012, 'NNS': 0.060941828254847646, 'CC': 0.05447830101569714}
  VBD: {'JJ': 0.19469026548672566, 'NN': 0.19469026548672566, 'DT': 0.07964601769911504, 'RB': 0.061946902654867256, 'IN': 0.05309734513274336}
  NNP: {'NN': 0.5625, 'JJ': 0.125, 'RB': 0.125, 'VBD': 0.0625, 'NNS': 0.0625}


### Analysis of Transition Probability Irregularities

Upon examining the calculated transition probabilities, several patterns emerge that reflect the nature of social media text, often deviating from standard English grammar. Here are some observations:

1.  **`<s>` (Start of Sentence) Transitioning Directly to Nouns (`NN`) or Adjectives (`JJ`):**
    *   `<s>`: `{'NN': 0.38, 'JJ': 0.11, 'RB': 0.1, 'DT': 0.06, 'IN': 0.06}`
    *   A high probability of a sentence starting directly with a Noun (NN, 38%) or Adjective (JJ, 11%) is common in headlines, short phrases, or sentences where articles/pronouns are omitted for brevity, characteristic of social media. For instance, "Modi promised..." starts with a noun, which is grammatically correct but often in more formal writing, one might see a determiner or adverb preceding it.

2.  **`WRB` (Wh-adverb) Transitioning to Nouns (`NN`) or Adjectives (`JJ`):**
    *   `WRB`: `{'NN': 0.1935, 'JJ': 0.1613, 'VBP': 0.0968, 'RB': 0.0968, 'VBZ': 0.0645}`
    *   It's somewhat unusual to see a `WRB` (like 'why', 'when', 'how') directly followed by a Noun or Adjective with such high probability. In standard English, `WRB` is typically followed by a verb or auxiliary verb to form a question or subordinate clause. However, in informal social media, incomplete sentences or direct exclamations like "Why Modi?" or "How good!" could contribute to this pattern.

3.  **`NN` (Noun) Transitioning to another `NN`:**
    *   `NN`: `{'NN': 0.3601, 'IN': 0.1043, 'JJ': 0.0619, 'NNS': 0.0609, 'CC': 0.0545}`
    *   The highest probability for a Noun to be followed by another Noun (36%) is notable. This could indicate frequent use of noun phrases, compound nouns (e.g., "government maximum governance"), or appositives, which are often condensed in social media. Examples might include

### Analysis of Transition Probability Irregularities

Upon examining the calculated transition probabilities, several patterns emerge that reflect the nature of social media text, often deviating from standard English grammar. Here are some observations:

1.  **`<s>` (Start of Sentence) Transitioning Directly to Nouns (`NN`) or Adjectives (`JJ`):**
    *   `<s>`: `{'NN': 0.38, 'JJ': 0.11, 'RB': 0.1, 'DT': 0.06, 'IN': 0.06}`
    *   A high probability of a sentence starting directly with a Noun (NN, 38%) or Adjective (JJ, 11%) is common in headlines, short phrases, or sentences where articles/pronouns are omitted for brevity, characteristic of social media. For instance, "Modi promised..." starts with a noun, which is grammatically correct but often in more formal writing, one might see a determiner or adverb preceding it.

2.  **`WRB` (Wh-adverb) Transitioning to Nouns (`NN`) or Adjectives (`JJ`):**
    *   `WRB`: `{'NN': 0.1935, 'JJ': 0.1613, 'VBP': 0.0968, 'RB': 0.0968, 'VBZ': 0.0645}`
    *   It's somewhat unusual to see a `WRB` (like 'why', 'when', 'how') directly followed by a Noun or Adjective with such high probability. In standard English, `WRB` is typically followed by a verb or auxiliary verb to form a question or subordinate clause. However, in informal social media, incomplete sentences or direct exclamations like "Why Modi?" or "How good!" could contribute to this pattern.

3.  **`NN` (Noun) Transitioning to another `NN`:**
    *   `NN`: `{'NN': 0.3601, 'IN': 0.1043, 'JJ': 0.0619, 'NNS': 0.0609, 'CC': 0.0545}`
    *   The highest probability for a Noun to be followed by another Noun (36%) is notable. This often indicates the frequent use of compound nouns or noun phrases without intervening prepositions or determiners (e.g., "government maximum governance", "election campaign"), which is a common feature in concise social media communication.

4.  **`NNP` (Proper Noun, Singular) Transitioning to `NN` or `JJ`:**
    *   `NNP`: `{'NN': 0.5625, 'JJ': 0.125, 'RB': 0.125, 'VBD': 0.0625, 'NNS': 0.0625}`
    *   A very high probability of an `NNP` being followed by an `NN` (56.25%) is expected for proper noun phrases (e.g., "New York City"). However, the POS tagger sometimes misclassifies non-proper nouns or even punctuation as `NNP` (e.g., '“' and '”' were tagged `NNP` in the sample output). If such misclassifications occur, this could lead to unusual but statistically frequent transitions, like `NNP` (punctuation) followed by `JJ` or `NN`.

In [51]:
emission_probabilities = defaultdict(dict)

for tag, words_counter in emission.items():
    total_count = sum(words_counter.values())
    for word, count in words_counter.items():
        emission_probabilities[tag][word] = count / total_count if total_count > 0 else 0

print("\nCalculated Emission Probabilities (first 5 POS tags):")
# Sort the inner dictionaries by probability in descending order for better analysis
for tag, word_probs in list(emission_probabilities.items())[:5]:
    sorted_word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))
    print(f"  {tag}: {dict(list(sorted_word_probs.items())[:5])}") # Print top 5 for each tag



Calculated Emission Probabilities (first 5 POS tags):
  WRB: {'why': 0.41935483870967744, 'when': 0.25806451612903225, 'how': 0.16129032258064516, 'where': 0.16129032258064516}
  NN: {'modi': 0.10299003322259136, 'india': 0.023255813953488372, 'vote': 0.022425249169435217, 'bjp': 0.011627906976744186, 'time': 0.009136212624584718}
  VBD: {'was': 0.14912280701754385, 'did': 0.07017543859649122, 'were': 0.06140350877192982, 'kar': 0.05263157894736842, 'said': 0.02631578947368421}
  NNP: {'’': 0.5, '”': 0.125, '‘': 0.125, '“': 0.0625, '₹': 0.0625}
  JJ: {'modi': 0.051685393258426963, 'narendra': 0.033707865168539325, 'indian': 0.02247191011235955, 'other': 0.020224719101123594, 'great': 0.01348314606741573}


### Analysis of Rare Tokens and Out-Of-Vocabulary (OOV) Terms

Having calculated the emission probabilities, we can now identify rare tokens and discuss the challenges posed by Out-Of-Vocabulary (OOV) terms, especially in the context of social media:

#### Identifying Rare Tokens:
To identify rare tokens, we look for words with very low emission probabilities for a given POS tag. These are often words that appear only once or twice (hapax legomena or dis legomena) within the training data for that specific tag.

Let's consider a few examples from the computed `emission_probabilities`:

*   **For `NN` (Noun) tag:**
    While words like 'modi' have a high probability (0.102), many words at the tail end of the `NN` emission list would have extremely low probabilities. For instance, words like 'nonsense', 'drama', 'campaigner', 'confusion', 'putin', 'grace', 'smriti', 'hema', 'introspect', 'saga', 'pair', 'brexit' (as seen in the original `emission` dictionary) likely have very low individual probabilities for the 'NN' tag because they appear infrequently. A word appearing only once for a given tag would have an emission probability of `1 / total_count_for_tag`.

*   **For `JJ` (Adjective) tag:**
    Similarly, while 'modi', 'narendra', 'indian' have higher probabilities, words like 'minimum', 'difficult', 'refresh', 'maarkefir', 'crustal', 'filthy', 'nonsensical', 'powerful' (from the full `emission` data for `JJ`) would represent rare tokens, each with a very low probability.

*   **Punctuation as `NNP`:**
    A peculiar observation is the high probability of punctuation marks like `’`, `”`, `‘`, `“` (apostrophe, double quotes) and `₹` (Rupee symbol) for the `NNP` (Proper Noun, Singular) tag. This is a clear misclassification by the POS tagger, where these symbols are treated as proper nouns due to their unique occurrence patterns or perhaps an anomaly in the training data it was built on. While not 'rare' in the sense of appearing infrequently as words, their classification as `NNP` is an irregularity that leads to unexpected high emission probabilities for these non-word tokens.

#### Out-Of-Vocabulary (OOV) Terms and HMM Challenges:

An HMM model assigns zero probability to any word it encounters during prediction that was not present in its training data for a specific POS tag. In the context of social media text, this poses significant challenges:

1.  **Dynamic Vocabulary:** Social media is highly dynamic, with new slang, hashtags, acronyms, and trending terms emerging constantly. An HMM trained on a static corpus will inevitably encounter many OOV words.
2.  **Informal Language:** Misspellings, abbreviations, and creative word usage are common, leading to words that might not be in a standard dictionary or a carefully curated training corpus.
3.  **Proper Nouns:** New names of people, places, brands, or events appear frequently. If these are not in the training data, they become OOV, making it difficult for the HMM to tag them correctly (e.g., as NNP).
4.  **Impact on Tagging:** When an HMM encounters an OOV word, it typically cannot assign it a POS tag with any confidence (or assigns a default, often incorrect, tag like 'NN' or 'UNK' if smoothing is applied). This can propagate errors, as the tag of the current word influences the probability of the next word's tag (via transition probabilities).

**Mitigation Strategies (beyond basic HMM):**
*   **Smoothing:** Techniques like Laplace smoothing (add-one smoothing) or more advanced methods can assign a small, non-zero probability to OOV words. However, this is a heuristic and doesn't genuinely 'understand' the word.
*   **Lexical Guessing:** Using morphological analysis (prefixes, suffixes) or word shape (e.g., capitalization, presence of numbers) to guess the POS of OOV words.
*   **Hybrid Models:** Combining HMMs with neural networks or other models that can learn word embeddings, which can handle words not explicitly seen in training by leveraging semantic similarity.

In summary, the presence of numerous low-probability emission words and the inherent problem of OOV terms in dynamic social media text highlight the limitations of a purely HMM-based approach for robust POS tagging in such domains.

In [52]:
#Manually Apply Viterbi Decoding
sample_tweet_text = cleaned.iloc[2] # Choosing the third cleaned tweet as an example
sample_tweet_tokens = nltk.word_tokenize(sample_tweet_text)

print(f"Selected Sample Tweet: {sample_tweet_text}")
print(f"Tokenized Sample Tweet: {sample_tweet_tokens}")

# Initialize dictionaries to store Viterbi probabilities and backpointers
viterbi_probabilities = defaultdict(lambda: defaultdict(float))
viterbi_backpointers = defaultdict(lambda: defaultdict(str))

# Get all unique tags from our transition probabilities for reference
all_tags = list(transition_probabilities.keys())

Selected Sample Tweet: what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax
Tokenized Sample Tweet: ['what', 'did', 'just', 'say', 'vote', 'for', 'modi', 'welcome', 'bjp', 'told', 'you', 'rahul', 'the', 'main', 'campaigner', 'for', 'modi', 'think', 'modi', 'should', 'just', 'relax']


In [53]:
first_word = sample_tweet_tokens[0].lower()
default_prob = 1e-6 # Small probability for unknown transitions/emissions

# Step 3: Calculate probabilities for the first word
for tag in all_tags:
    if tag == '<s>':
        continue # '<s>' is a start symbol, not a tag for a word

    # Get transition probability from start state '<s>' to current tag
    trans_prob = transition_probabilities['<s>'].get(tag, default_prob)

    # Get emission probability of the first word given the current tag
    # Handle OOV words: if word not in emission_probabilities[tag], assign default_prob
    emit_prob = emission_probabilities[tag].get(first_word, default_prob)

    viterbi_probabilities[0][tag] = trans_prob * emit_prob
    viterbi_backpointers[0][tag] = '<s>' # The previous state was always '<s>'

print(f"\nViterbi Probabilities for first word ('{first_word}'):")
sorted_first_word_probs = dict(sorted(viterbi_probabilities[0].items(), key=lambda item: item[1], reverse=True))
print(dict(list(sorted_first_word_probs.items())[:5])) # Print top 5 for brevity


Viterbi Probabilities for first word ('what'):
{'WP': 0.006923076923076923, 'NN': 3.7999999999999996e-07, 'WDT': 1.111111111111111e-07, 'JJ': 1.0999999999999999e-07, 'RB': 1e-07}


In [54]:
for i in range(1, len(sample_tweet_tokens)): # Start from the second word (index 1)
    current_word = sample_tweet_tokens[i].lower()

    for current_tag in all_tags:
        if current_tag == '<s>': # '<s>' is a start symbol, not a tag for a word
            continue

        max_prob_path = 0.0
        best_prev_tag = ''

        for prev_tag in all_tags:
            if prev_tag == '<s>': # '<s>' is a start symbol, not a tag for a word
                continue

            # Get previous Viterbi probability
            prev_viterbi_prob = viterbi_probabilities[i-1].get(prev_tag, 0.0)

            if prev_viterbi_prob == 0.0: # If previous state had zero probability, skip
                continue

            # Get transition probability from previous tag to current tag
            trans_prob = transition_probabilities[prev_tag].get(current_tag, default_prob)

            # Get emission probability of current word given current tag
            emit_prob = emission_probabilities[current_tag].get(current_word, default_prob)

            # Calculate current path probability
            current_path_prob = prev_viterbi_prob * trans_prob * emit_prob

            if current_path_prob > max_prob_path:
                max_prob_path = current_path_prob
                best_prev_tag = prev_tag

        viterbi_probabilities[i][current_tag] = max_prob_path
        viterbi_backpointers[i][current_tag] = best_prev_tag

    print(f"\nViterbi Probabilities for word {i+1} ('{current_word}'):")
    sorted_word_probs = dict(sorted(viterbi_probabilities[i].items(), key=lambda item: item[1], reverse=True))
    print(dict(list(sorted_word_probs.items())[:5])) # Print top 5 for brevity


Viterbi Probabilities for word 2 ('did'):
{'VBD': 0.00011211460604173155, 'VBP': 1.242603550295858e-09, 'VBZ': 7.10059171597633e-10, 'NN': 5.325443786982249e-10, 'JJ': 5.325443786982249e-10}

Viterbi Probabilities for word 3 ('just'):
{'RB': 5.168485645883487e-07, 'NN': 2.182762241520437e-11, 'JJ': 2.182762241520437e-11, 'DT': 8.92948189712906e-12, 'NNS': 5.952987931419374e-12}

Viterbi Probabilities for word 4 ('say'):
{'VBP': 1.0028538033147504e-09, 'VB': 5.14106662388941e-10, 'JJ': 1.0187880359674181e-13, 'RB': 6.212122170533037e-14, 'IN': 3.7272733023198224e-14}

Viterbi Probabilities for word 5 ('vote'):
{'NN': 3.6272978096530734e-12, 'VB': 9.56161261701653e-13, 'VBP': 3.1085960301959574e-13, 'JJ': 1.9410073612543554e-16, 'VBN': 1.155361524556164e-16}

Viterbi Probabilities for word 6 ('for'):
{'IN': 1.0834108299961867e-13, 'NN': 1.3062291281299155e-18, 'JJ': 2.24403465601806e-19, 'NNS': 2.2105416014506265e-19, 'CC': 1.9760902194785903e-19}

Viterbi Probabilities for word 7 ('mod

In [55]:
final_tags = []

# Find the tag with the maximum probability for the last word
last_word_index = len(sample_tweet_tokens) - 1

if last_word_index >= 0 and viterbi_probabilities[last_word_index]:
    max_final_prob = 0.0
    best_final_tag = ''

    for tag, prob in viterbi_probabilities[last_word_index].items():
        if prob > max_final_prob:
            max_final_prob = prob
            best_final_tag = tag

    # Backtrack to find the full sequence of tags
    current_tag = best_final_tag
    for i in range(last_word_index, -1, -1):
        final_tags.insert(0, current_tag)
        current_tag = viterbi_backpointers[i][current_tag]
else:
    print("Could not perform backtracking: no probabilities calculated or empty tweet.")

print("\n--- Viterbi Decoding Result ---")
print(f"Sample Tweet: {sample_tweet_text}")
print("Tokenized Tweet and Predicted POS Tags:")
for i in range(len(sample_tweet_tokens)): # Ensure both lists have the same length
    if i < len(final_tags):
        print(f"  {sample_tweet_tokens[i]}: {final_tags[i]}")
    else:
        print(f"  {sample_tweet_tokens[i]}: [TAG_MISSING]")



--- Viterbi Decoding Result ---
Sample Tweet: what did just say vote for modi  welcome bjp told you rahul the main campaigner for modi think modi should just relax
Tokenized Tweet and Predicted POS Tags:
  what: WP
  did: VBD
  just: RB
  say: VBP
  vote: NN
  for: IN
  modi: NN
  welcome: NN
  bjp: NN
  told: VBD
  you: PRP
  rahul: VB
  the: DT
  main: JJ
  campaigner: NN
  for: IN
  modi: NN
  think: NN
  modi: NN
  should: MD
  just: RB
  relax: VB


## Discuss HMM Challenges with Social Media Text

### Subtask:
Based on the analysis, discuss the inherent difficulties and limitations of Hidden Markov Models (HMMs) when applied to the informal, dynamic, and often grammatically unconventional nature of social media text.

#### Discussion of HMM Challenges with Social Media Text

Based on the analysis of transition probabilities and emission probabilities, several inherent difficulties and limitations of Hidden Markov Models (HMMs) become apparent when applied to the informal, dynamic, and often grammatically unconventional nature of social media text:

1.  **Irregular Transition Patterns and Grammatical Deviations:**
    *   **Start of Sentence (`<s>`) Transitions:** We observed high probabilities of sentences starting directly with Nouns (`NN`) or Adjectives (`JJ`), and `WRB` (Wh-adverb) followed by Nouns or Adjectives. While some are legitimate (e.g., compound nouns), others represent abbreviated, exclamatory, or headline-style constructs common in social media. HMMs rely on learned grammatical patterns. When these patterns are inconsistent or frequently deviate from standard English (as seen in social media), the transition probabilities can become skewed, leading to incorrect tag sequences for more conventional sentence structures.
    *   **High Noun-Noun (`NN` to `NN`) Transition:** The high probability of `NN` followed by `NN` reflects the prevalence of compound nouns or condensed noun phrases. While an HMM can learn this, it might struggle to differentiate between legitimate compounds and instances where other parts of speech (like determiners or prepositions) are omitted for brevity, leading to misinterpretations of the sentence's grammatical structure.

2.  **Out-Of-Vocabulary (OOV) Terms and Rare Tokens:**
    *   **Dynamic Vocabulary:** Social media language is characterized by rapid evolution, with new slang, abbreviations, hashtags, proper nouns, and trending terms emerging constantly. An HMM's emission probabilities are directly learned from its training corpus. If a word encountered during tagging was not present in the training data (an OOV term) or appeared very rarely, the model assigns it a very low (or zero) emission probability.
    *   **Impact on Tagging:** When an HMM encounters an OOV word, it cannot assign a tag with confidence. In our Viterbi implementation, a `default_prob` (1e-6) was used for unknown transitions or emissions. This is a heuristic that prevents zero probabilities from halting the algorithm but doesn't genuinely 'understand' the word. This uniform low probability can lead to arbitrary tag assignments for OOV words, which then propagate errors through the sequence due to the reliance on previous tags (via transition probabilities).
    *   **Examples from Analysis:** Words like 'modi' or 'bjp' might appear frequently, but many domain-specific terms, misspellings, or unique user-generated content will be rare or OOV. For instance, the misclassification of punctuation (`’`, `”`, `‘`, `“`, `₹`) as `NNP` (Proper Noun) is an artifact of the training data or tagger's rules, highlighting how unusual tokens can challenge standard POS tagging, even if they aren't strictly OOV for the tagger.

3.  **Limitations of `default_prob`:**
    *   The `default_prob` is a form of smoothing, a critical technique to handle unseen events. However, it's a blunt instrument. It treats all unseen words (given a tag) or unseen tag transitions equally. A more robust model would need to employ more sophisticated smoothing techniques (e.g., Kneser-Ney smoothing) or incorporate external knowledge (e.g., word embeddings, character-level features) to make more informed guesses about OOV words.
    *   For social media, where OOV is frequent, a simple `default_prob` means the model is essentially guessing, undermining the statistical power of the HMM.

**Conclusion:**

While HMMs provide a foundational understanding of sequential data, their reliance on discrete states and observable emissions from a fixed vocabulary makes them less suitable for the complexities of social media text. The informality, dynamic nature, and grammatical unconventionality lead to:
*   **Inaccurate Transition Probabilities:** Learned patterns don't always reflect the intended grammar.
*   **High OOV Rates:** Frequent encounters with unknown words result in arbitrary tag assignments.
*   **Propagation of Errors:** Mistakes with one word can cascade through the entire sequence.

Without significant modifications, such as more advanced smoothing, integrating neural word embeddings (as seen in hybrid models), or domain-specific training data, HMMs will struggle to achieve high accuracy and reliability for POS tagging in such dynamic and unconventional environments.