In [10]:
import pandas as pd
import re

# Load the dataset
try:
    df = pd.read_csv('/content/Twitter_Data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Twitter_Data.csv not found. Please ensure the file is in the /content/ directory.")
    df = pd.DataFrame() # Create an empty DataFrame to avoid further errors

if not df.empty:
    # Assuming the tweet text column is named 'text' or 'tweet'. Adjust if necessary.
    # Let's inspect the columns to find the most likely text column
    print("\nAvailable columns:", df.columns.tolist())

    # Common column names for tweet text are 'text', 'tweet', 'Tweet', 'content', 'message'
    # Adding 'clean_text' as a candidate since it was found in the dataframe
    text_column_candidates = ['clean_text', 'text', 'tweet', 'Tweet', 'content', 'message']
    tweet_text_column = None

    for col in text_column_candidates:
        if col in df.columns:
            tweet_text_column = col
            break

    if tweet_text_column is None:
        print("\nCould not find a common tweet text column. Please identify the correct column from the list above and update the 'tweet_text_column' variable.")
        # If no column is found, we can't proceed with preprocessing
    else:
        print(f"\nUsing '{tweet_text_column}' as the tweet text column.")
        # Display the first 5 rows to confirm data loading
        display(df.head())
else:
    print("DataFrame is empty, unable to display.")

Dataset loaded successfully.

Available columns: ['clean_text', 'category']

Using 'clean_text' as the tweet text column.


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [11]:
import pandas as pd
import re

# Load the dataset
try:
    df = pd.read_csv('/content/Twitter_Data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Twitter_Data.csv not found. Please ensure the file is in the /content/ directory.")
    df = pd.DataFrame() # Create an empty DataFrame to avoid further errors

if not df.empty:
    # Assuming the tweet text column is named 'text' or 'tweet'. Adjust if necessary.
    # Let's inspect the columns to find the most likely text column
    print("\nAvailable columns:", df.columns.tolist())

    # Common column names for tweet text are 'text', 'tweet', 'Tweet', 'content', 'message'
    # Adding 'clean_text' as a candidate since it was found in the dataframe
    text_column_candidates = ['clean_text', 'text', 'tweet', 'Tweet', 'content', 'message']
    tweet_text_column = None

    for col in text_column_candidates:
        if col in df.columns:
            tweet_text_column = col
            break

    if tweet_text_column is None:
        print("\nCould not find a common tweet text column. Please identify the correct column from the list above and update the 'tweet_text_column' variable.")
        # If no column is found, we can't proceed with preprocessing
    else:
        print(f"\nUsing '{tweet_text_column}' as the tweet text column.")

        # Function to remove URLs
        def remove_urls(text):
            url_pattern = re.compile(r'https?://\S+|www\.\S+')
            return url_pattern.sub(r'', text)

        # Function to remove mentions
        def remove_mentions(text):
            mention_pattern = re.compile(r'@\w+')
            return mention_pattern.sub(r'', text)

        # Apply preprocessing
        print("\nPreprocessing tweet text...")
        df['cleaned_tweet_text'] = df[tweet_text_column].astype(str).apply(remove_urls)
        df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_mentions)

        print("\nOriginal tweet examples:")
        print(df[tweet_text_column].head())
        print("\nCleaned tweet examples:")
        print(df['cleaned_tweet_text'].head())
        print("\nPreprocessing complete. A new column 'cleaned_tweet_text' has been added to the DataFrame.")
else:
    print("DataFrame is empty, unable to perform preprocessing.")

Dataset loaded successfully.

Available columns: ['clean_text', 'category']

Using 'clean_text' as the tweet text column.

Preprocessing tweet text...

Original tweet examples:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: clean_text, dtype: object

Cleaned tweet examples:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: cleaned_tweet_text, dtype: object

Preprocessing complete. A new column 'cleaned_tweet_text' has been added to the DataFrame.


In [12]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Download necessary NLTK data for POS tagging if not already present
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading 'punkt' for NLTK...")
    nltk.download('punkt')
    print("'punkt' downloaded.")

try:
    # Corrected resource name to 'averaged_perceptron_tagger'
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    print("Downloading 'averaged_perceptron_tagger' for NLTK...")
    nltk.download('averaged_perceptron_tagger')
    print("'averaged_perceptron_tagger' downloaded.")

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("Downloading 'wordnet' for NLTK...")
    nltk.download('wordnet')
    print("'wordnet' downloaded.")

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading 'stopwords' for NLTK...")
    nltk.download('stopwords')
    print("'stopwords' downloaded.")

print("NLTK and its tagger data are ready.")

# Function to perform POS tagging
def pos_tag_text(text):
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default to noun if not found

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to lemmatize and remove stopwords
def lemmatize_and_remove_stopwords(pos_tags):
    lemmatized_tokens = []
    for word, tag in pos_tags:
        # Convert to lowercase and remove non-alphabetic characters
        cleaned_word = ''.join(filter(str.isalpha, word)).lower()
        if cleaned_word and cleaned_word not in stop_words:
            wntag = get_wordnet_pos(tag)
            lemmatized_tokens.append(lemmatizer.lemmatize(cleaned_word, wntag))
    return ' '.join(lemmatized_tokens)


if not df.empty and 'cleaned_tweet_text' in df.columns:
    print("\nPerforming POS tagging on 'cleaned_tweet_text'...")
    df['pos_tags'] = df['cleaned_tweet_text'].astype(str).apply(pos_tag_text)
    print("POS tagging complete. A new column 'pos_tags' has been added to the DataFrame.")
    print("\nExamples of POS tags:")
    display(df[['cleaned_tweet_text', 'pos_tags']].head())

    print("\nPerforming lemmatization and stopword removal...")
    df['lemmatized_text'] = df['pos_tags'].apply(lemmatize_and_remove_stopwords)
    print("Lemmatization and stopword removal complete. A new column 'lemmatized_text' has been added.")
    print("\nExamples of lemmatized text:")
    display(df[['cleaned_tweet_text', 'pos_tags', 'lemmatized_text']].head())
else:
    print("DataFrame is empty or 'cleaned_tweet_text' column not found. Cannot perform POS tagging, lemmatization or stopword removal.")

Downloading 'wordnet' for NLTK...
'wordnet' downloaded.
NLTK and its tagger data are ready.

Performing POS tagging on 'cleaned_tweet_text'...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


POS tagging complete. A new column 'pos_tags' has been added to the DataFrame.

Examples of POS tags:


Unnamed: 0,cleaned_tweet_text,pos_tags
0,when modi promised “minimum government maximum...,"[(when, WRB), (modi, NN), (promised, VBD), (“,..."
1,talk all the nonsense and continue all the dra...,"[(talk, NN), (all, PDT), (the, DT), (nonsense,..."
2,what did just say vote for modi welcome bjp t...,"[(what, WP), (did, VBD), (just, RB), (say, VB)..."
3,asking his supporters prefix chowkidar their n...,"[(asking, VBG), (his, PRP$), (supporters, NNS)..."
4,answer who among these the most powerful world...,"[(answer, NN), (who, WP), (among, IN), (these,..."



Performing lemmatization and stopword removal...
Lemmatization and stopword removal complete. A new column 'lemmatized_text' has been added.

Examples of lemmatized text:


Unnamed: 0,cleaned_tweet_text,pos_tags,lemmatized_text
0,when modi promised “minimum government maximum...,"[(when, WRB), (modi, NN), (promised, VBD), (“,...",modi promise minimum government maximum govern...
1,talk all the nonsense and continue all the dra...,"[(talk, NN), (all, PDT), (the, DT), (nonsense,...",talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,"[(what, WP), (did, VBD), (just, RB), (say, VB)...",say vote modi welcome bjp tell rahul main camp...
3,asking his supporters prefix chowkidar their n...,"[(asking, VBG), (his, PRP$), (supporters, NNS)...",ask supporter prefix chowkidar name modi great...
4,answer who among these the most powerful world...,"[(answer, NN), (who, WP), (among, IN), (these,...",answer among powerful world leader today trump...


In [13]:
from collections import defaultdict

if not df.empty and 'pos_tags' in df.columns:
    print("\nBuilding HMM parameters...")

    # Initialize dictionaries for HMM parameters with Laplace smoothing (add 1)
    initial_probabilities = defaultdict(lambda: 1)
    transition_probabilities = defaultdict(lambda: defaultdict(lambda: 1))
    emission_probabilities = defaultdict(lambda: defaultdict(lambda: 1))
    tag_counts = defaultdict(lambda: 0) # To normalize probabilities

    all_tags_set = set() # To collect all unique tags

    # Process each tweet's POS tags
    for tags_sequence in df['pos_tags']:
        if tags_sequence:
            # Initial probabilities
            first_tag = tags_sequence[0][1] # (word, tag) -> tag
            initial_probabilities[first_tag] += 1
            all_tags_set.add(first_tag)

            # Transition and Emission probabilities
            for i, (word, tag) in enumerate(tags_sequence):
                tag_counts[tag] += 1
                all_tags_set.add(tag)
                emission_probabilities[tag][word.lower()] += 1 # Use lowercase for words

                if i > 0:
                    prev_tag = tags_sequence[i-1][1]
                    transition_probabilities[prev_tag][tag] += 1

    # Convert tag_counts to a regular dict for easier access/iteration
    tag_counts_dict = dict(tag_counts)

    # Normalize probabilities
    # Initial probabilities
    total_initial_count = sum(initial_probabilities.values())
    initial_probabilities = {tag: count / total_initial_count for tag, count in initial_probabilities.items()}

    # Transition probabilities
    for prev_tag, next_tag_counts in transition_probabilities.items():
        total_transitions_from_prev_tag = sum(next_tag_counts.values())
        transition_probabilities[prev_tag] = {next_tag: count / total_transitions_from_prev_tag for next_tag, count in next_tag_counts.items()}

    # Emission probabilities
    for tag, word_counts in emission_probabilities.items():
        total_words_for_tag = sum(word_counts.values())
        emission_probabilities[tag] = {word: count / total_words_for_tag for word, count in word_counts.items()}

    print("HMM parameters built successfully.")
    print("\n--- HMM Parameters Examples ---")
    print("\nInitial Probabilities (top 5):")
    print(dict(list(initial_probabilities.items())[:5]))
    print("\nTransition Probabilities (first 3 tags, top 3 transitions each):")
    for i, (prev_tag, next_tag_probs) in enumerate(transition_probabilities.items()):
        if i >= 3: break
        print(f"  {prev_tag}: {dict(list(next_tag_probs.items())[:3])}")
    print("\nEmission Probabilities (first 3 tags, top 3 emissions each):")
    for i, (tag, word_probs) in enumerate(emission_probabilities.items()):
        if i >= 3: break
        print(f"  {tag}: {dict(list(word_probs.items())[:3])}")

else:
    print("DataFrame is empty or 'pos_tags' column not found. Cannot build HMM parameters.")


Building HMM parameters...
HMM parameters built successfully.

--- HMM Parameters Examples ---

Initial Probabilities (top 5):
{'WRB': 0.030844922672981247, 'NN': 0.3353557165555276, 'WP': 0.017520505002791256, 'VBG': 0.019741241281156254, 'JJ': 0.13076578593819974}

Transition Probabilities (first 3 tags, top 3 transitions each):
  WRB: {'NN': 0.26288186705126526, 'VBZ': 0.028648223416808706, 'JJS': 0.0008522354792185657}
  NN: {'VBD': 0.06080158149560789, 'JJ': 0.054530325960094043, 'NNP': 0.008668070206789487}
  VBD: {'NNP': 0.0049813780260707635, 'PRP': 0.03550279329608939, 'RB': 0.09728119180633148}

Emission Probabilities (first 3 tags, top 3 emissions each):
  WRB: {'when': 0.2616143264988321, 'why': 0.351901116013496, 'how': 0.2508434985725409}
  NN: {'modi': 0.09749772590027594, 'government': 0.005617408520038831, 'governance': 0.0004959066204967092}
  VBD: {'promised': 0.006117570450592479, 'expected': 0.000676874041452109, 'did': 0.06012183732746138}


In [14]:
import math

if not df.empty and 'pos_tags' in df.columns and transition_probabilities:
    print("\n--- Analyzing Transition Probability Irregularities ---")

    # Helper function to print top N transitions for a given previous tag
    def print_top_transitions(prev_tag, num=5):
        if prev_tag in transition_probabilities:
            sorted_transitions = sorted(transition_probabilities[prev_tag].items(), key=lambda item: item[1], reverse=True)
            print(f"  Top {num} transitions from '{prev_tag}':")
            for next_tag, prob in sorted_transitions[:num]:
                print(f"    -> {next_tag}: {prob:.4f}")
        else:
            print(f"  No transitions found for '{prev_tag}'.")

    print("\nExamples of Transition Probabilities for common tags:")
    print_top_transitions('NN') # Noun
    print_top_transitions('VB') # Verb, base form
    print_top_transitions('JJ') # Adjective
    print_top_transitions('DT') # Determiner
    print_top_transitions('IN') # Preposition/subordinating conjunction

    # Calculate entropy for transition probabilities
    # Entropy = - sum(p * log(p)) for all possible next tags
    transition_entropies = {}
    for prev_tag, next_tag_probs in transition_probabilities.items():
        entropy = 0
        for prob in next_tag_probs.values():
            if prob > 0: # Avoid log(0)
                entropy -= prob * math.log(prob, 2) # Using log base 2 for bits
        transition_entropies[prev_tag] = entropy

    # Sort tags by entropy to find most/least predictable transitions
    sorted_entropies = sorted(transition_entropies.items(), key=lambda item: item[1])

    print("\nTags with the lowest transition entropy (most predictable next tags):")
    for tag, entropy in sorted_entropies[:5]:
        print(f"  '{tag}': {entropy:.4f} bits")
        print_top_transitions(tag, num=3)

    print("\nTags with the highest transition entropy (least predictable next tags):")
    for tag, entropy in sorted_entropies[-5:]: # Last 5 for highest entropy
        print(f"  '{tag}': {entropy:.4f} bits")
        print_top_transitions(tag, num=3)

else:
    print("HMM parameters (transition_probabilities) not available for analysis.")


--- Analyzing Transition Probability Irregularities ---

Examples of Transition Probabilities for common tags:
  Top 5 transitions from 'NN':
    -> NN: 0.3595
    -> IN: 0.0932
    -> VBD: 0.0608
    -> NNS: 0.0592
    -> JJ: 0.0545
  Top 5 transitions from 'VB':
    -> NN: 0.1905
    -> DT: 0.1486
    -> JJ: 0.1187
    -> PRP$: 0.0864
    -> IN: 0.0790
  Top 5 transitions from 'JJ':
    -> NN: 0.5665
    -> NNS: 0.1530
    -> JJ: 0.0995
    -> IN: 0.0329
    -> RB: 0.0208
  Top 5 transitions from 'DT':
    -> NN: 0.5083
    -> JJ: 0.1983
    -> NNS: 0.1178
    -> JJS: 0.0183
    -> RB: 0.0168
  Top 5 transitions from 'IN':
    -> NN: 0.3217
    -> JJ: 0.1733
    -> DT: 0.1480
    -> NNS: 0.0720
    -> PRP: 0.0532

Tags with the lowest transition entropy (most predictable next tags):
  'SYM': 0.0000 bits
  Top 3 transitions from 'SYM':
    -> NN: 1.0000
  '``': 0.0000 bits
  Top 3 transitions from '``':
    -> RB: 1.0000
  '$': 0.0725 bits
  Top 3 transitions from '$':
    -> CD: 0.9

In [15]:
print("\n--- Analyzing Rare and Low-Probability Tokens ---")

# Identify words with the lowest emission probabilities for each tag
print("\nTop 5 words with the lowest emission probabilities for each of the first 5 tags:")
for i, (tag, word_probs) in enumerate(emission_probabilities.items()):
    if i >= 5: break
    if word_probs:
        sorted_words = sorted(word_probs.items(), key=lambda item: item[1])
        print(f"  Tag '{tag}':")
        for word, prob in sorted_words[:5]:
            print(f"    '{word}': {prob:.8f}")
    else:
        print(f"  Tag '{tag}': No emission probabilities found.")

# Calculate overall word frequencies to find truly rare words in the dataset
word_frequencies = defaultdict(int)
for tags_sequence in df['pos_tags']:
    if tags_sequence:
        for word, _ in tags_sequence:
            word_frequencies[word.lower()] += 1

# Filter for words that appeared only once (singletons)
singletons = {word: count for word, count in word_frequencies.items() if count == 1}

print("\nExamples of words that appeared only once in the entire dataset (singletons):")
if singletons:
    # Display first 10 singletons
    for i, (word, count) in enumerate(list(singletons.items())[:10]):
        print(f"  '{word}' (count: {count})")
        # Check their emission probability if they have one
        # Find the tag for this singleton by searching the original pos_tags
        found_tags_for_singleton = []
        for tags_sequence in df['pos_tags']:
            if tags_sequence:
                for w, t in tags_sequence:
                    if w.lower() == word:
                        found_tags_for_singleton.append(t)
                        break # Only need one tag for example
                if found_tags_for_singleton: break

        if found_tags_for_singleton and found_tags_for_singleton[0] in emission_probabilities and word in emission_probabilities[found_tags_for_singleton[0]]:
             print(f"    Emission probability for tag '{found_tags_for_singleton[0]}': {emission_probabilities[found_tags_for_singleton[0]][word]:.8f}")
        else:
            print(f"    Emission probability for tag not found or word not in emission for its tag (due to smoothing, this might be a small default value).")
else:
    print("  No singletons found.")



--- Analyzing Rare and Low-Probability Tokens ---

Top 5 words with the lowest emission probabilities for each of the first 5 tags:
  Tag 'WRB':
    'mover': 0.00006488
    'wasn': 0.00006488
    'wld': 0.00006488
    'wiselythe': 0.00006488
    'whatsapp': 0.00006488
  Tag 'NN':
    'constituency2': 0.00000191
    'tuthukudi': 0.00000191
    'thuthukudi': 0.00000191
    'leadershipwho': 0.00000191
    'modiganga': 0.00000191
  Tag 'VBD':
    'disabilityif': 0.00001714
    'bursted': 0.00001714
    'mki': 0.00001714
    'stemmed': 0.00001714
    'piggybacked': 0.00001714
  Tag 'NNP':
    '➡': 0.00010847
    'िु': 0.00010847
    '⚡jai': 0.00010847
    'zayed': 0.00010847
    'mistakemodi': 0.00010847
  Tag 'JJ':
    'crustal': 0.00000494
    'maarkefir': 0.00000494
    'behaved': 0.00000494
    'likly': 0.00000494
    'insensitivearrogant': 0.00000494

Examples of words that appeared only once in the entire dataset (singletons):
  'crustal' (count: 1)
    Emission probability for tag '

In [22]:
import math

# Function to manually apply Viterbi decoding for a given sequence of words
def viterbi_decode(words, initial_prob, transition_prob, emission_prob, all_tags):
    """
    Applies the Viterbi algorithm to find the most likely sequence of hidden states (POS tags)
    for a given sequence of observations (words).

    Args:
        words (list): A list of tokenized words (observations).
        initial_prob (dict): Dictionary of initial state probabilities.
        transition_prob (dict): Dictionary of transition probabilities (tag_prev -> tag_curr).
        emission_prob (dict): Dictionary of emission probabilities (tag -> word).
        all_tags (list): A list of all possible POS tags.

    Returns:
        list: The most likely sequence of POS tags (hidden states) for the input words.
    """
    # Initialize Viterbi path and probabilities table (T)
    # T[tag] will store a list of dictionaries, one for each word in the sequence.
    # Each dictionary contains the log probability of the best path ending in that tag
    # at that position, and the previous tag in that path.
    T = {}
    for tag in all_tags:
        # Calculate log probability for the first word: P(tag) * P(word|tag)
        # Use 1e-10 for smoothing to avoid log(0) for unseen events
        T[tag] = [{"prob": math.log(initial_prob.get(tag, 1e-10)) + math.log(emission_prob.get(tag, {}).get(words[0].lower(), 1e-10)), "prev": None}]

    # Forward pass: fill the Viterbi table
    for i in range(1, len(words)): # Iterate through words from the second word onwards
        for current_tag in all_tags: # For each possible current tag
            max_log_prob = -float('inf') # Initialize with negative infinity
            best_prev_tag = None
            # Find the previous tag that leads to the maximum probability path to the current tag
            for prev_tag in all_tags:
                # Calculate log probability: P(path_to_prev_tag) * P(curr_tag|prev_tag) * P(word_curr|curr_tag)
                log_prob = T[prev_tag][-1]["prob"] + \
                           math.log(transition_prob.get(prev_tag, {}).get(current_tag, 1e-10)) + \
                           math.log(emission_prob.get(current_tag, {}).get(words[i].lower(), 1e-10))

                # Update if a better path is found
                if log_prob > max_log_prob:
                    max_log_prob = log_prob
                    best_prev_tag = prev_tag
            # Store the maximum log probability and the best previous tag for the current tag at the current position
            T[current_tag].append({"prob": max_log_prob, "prev": best_prev_tag})

    # Find the path with the highest probability at the very end of the sequence
    max_log_prob_path = -float('inf')
    last_tag_path = None
    for tag in all_tags:
        if T[tag][-1]["prob"] > max_log_prob_path:
            max_log_prob_path = T[tag][-1]["prob"]
            last_tag_path = tag

    # Backward pass: reconstruct the best path by backtracking
    best_path = [last_tag_path]
    # Iterate backwards from the second to last word
    for i in range(len(words) - 1, 0, -1):
        last_tag_path = T[last_tag_path][i]["prev"] # Get the best previous tag
        best_path.insert(0, last_tag_path) # Insert at the beginning to build the path in correct order

    return best_path

# Get a sample tweet for Viterbi decoding
sample_tweet_index = 0  # You can change this index to try different tweets
sample_tweet_text = df['cleaned_tweet_text'].iloc[sample_tweet_index]
sample_tweet_tokens = nltk.word_tokenize(sample_tweet_text)

# Get all unique tags from the training data to use in Viterbi
# Convert set to list for consistent iteration order within the Viterbi function
all_tags = list(all_tags_set)

print(f"\nApplying Viterbi decoding to sample tweet (index {sample_tweet_index}):")
print(f"Original Tweet: {sample_tweet_text}")
print(f"Tokenized: {sample_tweet_tokens}")

# Perform Viterbi decoding using the calculated HMM parameters
viterbi_path = viterbi_decode(sample_tweet_tokens, initial_probabilities, transition_probabilities, emission_probabilities, all_tags)

print(f"Viterbi Path (POS Tags): {viterbi_path}")

# Compare with actual POS tags (if available in the DataFrame)
if 'pos_tags' in df.columns:
    actual_pos_tags = [tag for word, tag in df['pos_tags'].iloc[sample_tweet_index]]
    print(f"Actual POS Tags:   {actual_pos_tags}")

    # Calculate accuracy for this specific tweet
    correct_tags = sum(1 for p, a in zip(viterbi_path, actual_pos_tags) if p == a)
    accuracy = correct_tags / len(actual_pos_tags) if len(actual_pos_tags) > 0 else 0
    print(f"Viterbi Accuracy for this tweet: {accuracy:.2f}")
else:
    print("Actual POS tags not available for comparison.")


Applying Viterbi decoding to sample tweet (index 0):
Original Tweet: when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples
Tokenized: ['when', 'modi', 'promised', '“', 'minimum', 'government', 'maximum', 'governance', '”', 'expected', 'him', 'begin', 'the', 'difficult', 'job', 'reforming', 'the', 'state', 'why', 'does', 'take', 'years', 'get', 'justice', 'state', 'should', 'and', 'not', 'business', 'and', 'should', 'exit', 'psus', 'and', 'temples']
Viterbi Path (POS Tags): ['NN', 'VBZ', 'VBN', 'NNP', 'NNP', 'NN', 'NN', 'NN', 'VBZ', 'VBN', 'PRP', 'IN', 'DT', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'VBZ', 'VBN', 'NNS', 'VBP', 'RB', 'RB', 'RB', 'RB', 'RB', 'RB', 'CC', 'MD', 'NN', 'NNS', 'CC', 'NN']
Actual POS Tags:   ['WRB', 'NN', 'VBD', 'NNP', 'JJ', 'NN', 'JJ', 'NN', 'NNP', 'VBD', 'PRP', 'VB', 'DT', 'JJ', 'NN', 'VBG', 'DT', 'NN', 'WRB',

Hidden Markov Models (HMMs) can struggle significantly with social media text due to several inherent characteristics of this type of language:

Informal and Unstructured Language: Social media posts are often highly informal, featuring slang, abbreviations (e.g., 'lol', 'brb'), misspellings, grammatical errors, and a general lack of adherence to standard linguistic rules. HMMs rely heavily on the statistical regularities found in more formal, structured language for their transition and emission probabilities. When these regularities are absent or significantly altered, the model's predictions become unreliable.

Vocabulary Mismatch and Neologisms: Social media is a dynamic environment where new words, hashtags, and unique expressions (neologisms) emerge constantly. An HMM trained on traditional corpora will encounter many 'unknown words' in social media text. While techniques like Laplace smoothing can assign non-zero probabilities, these words will still have extremely low emission probabilities, making their POS tagging highly uncertain or often defaulting to a generic tag like a noun ('NN') regardless of their actual function (as we observed with some singletons in our analysis).

Noise and Special Characters: Social media is replete with emojis, emoticons, multiple punctuation marks, URLs, mentions (@user), and hashtags (#topic). While we performed some cleaning (removing URLs and mentions), the remaining noise can still confuse the HMM's statistical patterns, as these elements don't fit neatly into standard POS categories.

Ambiguity and Context Dependence: Many words in informal language can be highly ambiguous, and their meaning (and thus their POS) depends heavily on context. Social media posts are often short, providing limited context. HMMs, which primarily capture local dependencies (previous tag influencing current tag), may not be sophisticated enough to resolve these ambiguities accurately without richer contextual features.

Domain Mismatch: Typically, HMMs used for POS tagging are trained on formal text datasets (like the Penn Treebank). Applying such a model directly to a vastly different domain like social media without retraining or adaptation leads to poor performance. The statistical distributions of tags and words within tags are simply different.

Lack of Punctuation and Capitalization: The inconsistent use of punctuation and capitalization in social media can strip away cues that POS taggers, especially those relying on HMMs, often use to disambiguate words and identify sentence structure.

In essence, the 'rules' (statistical patterns) an HMM learns from formal text don't transfer well to the 'wild west' of social media language, leading to frequent misclassifications and lower accuracy, as demonstrated by the 40% accuracy on our sample tweet.

In [17]:
import pandas as pd
import re

# Load the dataset from the specified CSV file
try:
    df = pd.read_csv('/content/Twitter_Data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Twitter_Data.csv not found. Please ensure the file is in the /content/ directory.")
    df = pd.DataFrame() # Create an empty DataFrame to avoid further errors if file is not found

if not df.empty:
    # Inspect available columns to identify the tweet text column
    print("\nAvailable columns:", df.columns.tolist())

    # Define common column names that might contain tweet text
    text_column_candidates = ['clean_text', 'text', 'tweet', 'Tweet', 'content', 'message']
    tweet_text_column = None

    # Iterate through candidates to find the actual tweet text column
    for col in text_column_candidates:
        if col in df.columns:
            tweet_text_column = col
            break

    if tweet_text_column is None:
        print("\nCould not find a common tweet text column. Please identify the correct column from the list above and update the 'tweet_text_column' variable.")
    else:
        print(f"\nUsing '{tweet_text_column}' as the tweet text column.")

        # Define a function to remove URLs from text
        def remove_urls(text):
            """
            Removes URLs from a given string.

            Args:
                text (str): The input string.

            Returns:
                str: The string with URLs removed.
            """
            url_pattern = re.compile(r'https?://\S+|www\.\S+')
            return url_pattern.sub(r'', text)

        # Define a function to remove mentions (@username) from text
        def remove_mentions(text):
            """
            Removes Twitter mentions (e.g., @username) from a given string.

            Args:
                text (str): The input string.

            Returns:
                str: The string with mentions removed.
            """
            mention_pattern = re.compile(r'@\w+')
            return mention_pattern.sub(r'', text)

        # Apply preprocessing steps to create a new 'cleaned_tweet_text' column
        print("\nPreprocessing tweet text...")
        # Convert the column to string type to avoid errors with non-string entries
        df['cleaned_tweet_text'] = df[tweet_text_column].astype(str).apply(remove_urls)
        df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_mentions)

        print("\nOriginal tweet examples:")
        print(df[tweet_text_column].head())
        print("\nCleaned tweet examples:")
        print(df['cleaned_tweet_text'].head())
        print("\nPreprocessing complete. A new column 'cleaned_tweet_text' has been added to the DataFrame.")
else:
    print("DataFrame is empty, unable to perform preprocessing.")

Dataset loaded successfully.

Available columns: ['clean_text', 'category']

Using 'clean_text' as the tweet text column.

Preprocessing tweet text...

Original tweet examples:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: clean_text, dtype: object

Cleaned tweet examples:
0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: cleaned_tweet_text, dtype: object

Preprocessing complete. A new column 'cleaned_tweet_text' has been added to the DataFrame.


In [18]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Download necessary NLTK data for POS tagging and lemmatization if not already present
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading 'punkt' for NLTK...")
    nltk.download('punkt')
    print("'punkt' downloaded.")

try:
    # Corrected resource name to 'averaged_perceptron_tagger' for POS tagging
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    print("Downloading 'averaged_perceptron_tagger' for NLTK...")
    nltk.download('averaged_perceptron_tagger')
    print("'averaged_perceptron_tagger' downloaded.")

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("Downloading 'wordnet' for NLTK...")
    nltk.download('wordnet')
    print("'wordnet' downloaded.")

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading 'stopwords' for NLTK...")
    nltk.download('stopwords')
    print("'stopwords' downloaded.")

print("NLTK and its tagger data are ready.")

# Function to perform POS tagging on a given text
def pos_tag_text(text):
    """
    Tokenizes the input text and performs Part-of-Speech tagging.

    Args:
        text (str): The input string (e.g., a cleaned tweet).

    Returns:
        list: A list of (word, tag) tuples.
    """
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

# Function to convert NLTK POS tags to WordNet POS tags, required for accurate lemmatization
def get_wordnet_pos(tag):
    """
    Converts an NLTK POS tag to a WordNet POS tag.
    This is necessary for the WordNetLemmatizer to work correctly.

    Args:
        tag (str): The NLTK POS tag.

    Returns:
        str: The corresponding WordNet POS tag (e.g., ADJ, VERB, NOUN, ADV).
             Defaults to NOUN if no specific mapping is found.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default to noun if not found

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# Get a set of English stopwords for efficient lookup
stop_words = set(stopwords.words('english'))

# Function to lemmatize tokens and remove stopwords
def lemmatize_and_remove_stopwords(pos_tags):
    """
    Lemmatizes words based on their POS tags and removes common English stopwords.

    Args:
        pos_tags (list): A list of (word, tag) tuples from NLTK's pos_tag function.

    Returns:
        str: A space-separated string of lemmatized, non-stopwords.
    """
    lemmatized_tokens = []
    for word, tag in pos_tags:
        # Convert to lowercase and remove non-alphabetic characters
        cleaned_word = ''.join(filter(str.isalpha, word)).lower()
        # Process only if the word is not empty and not a stopword
        if cleaned_word and cleaned_word not in stop_words:
            wntag = get_wordnet_pos(tag)
            lemmatized_tokens.append(lemmatizer.lemmatize(cleaned_word, wntag))
    return ' '.join(lemmatized_tokens)


if not df.empty and 'cleaned_tweet_text' in df.columns:
    print("\nPerforming POS tagging on 'cleaned_tweet_text'...")
    # Apply POS tagging to the cleaned tweet text and store in a new 'pos_tags' column
    df['pos_tags'] = df['cleaned_tweet_text'].astype(str).apply(pos_tag_text)
    print("POS tagging complete. A new column 'pos_tags' has been added to the DataFrame.")
    print("\nExamples of POS tags:")
    display(df[['cleaned_tweet_text', 'pos_tags']].head())

    print("\nPerforming lemmatization and stopword removal...")
    # Apply lemmatization and stopword removal using the 'pos_tags' column
    df['lemmatized_text'] = df['pos_tags'].apply(lemmatize_and_remove_stopwords)
    print("Lemmatization and stopword removal complete. A new column 'lemmatized_text' has been added.")
    print("\nExamples of lemmatized text:")
    display(df[['cleaned_tweet_text', 'pos_tags', 'lemmatized_text']].head())
else:
    print("DataFrame is empty or 'cleaned_tweet_text' column not found. Cannot perform POS tagging, lemmatization or stopword removal.")

Downloading 'wordnet' for NLTK...
'wordnet' downloaded.
NLTK and its tagger data are ready.

Performing POS tagging on 'cleaned_tweet_text'...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


POS tagging complete. A new column 'pos_tags' has been added to the DataFrame.

Examples of POS tags:


Unnamed: 0,cleaned_tweet_text,pos_tags
0,when modi promised “minimum government maximum...,"[(when, WRB), (modi, NN), (promised, VBD), (“,..."
1,talk all the nonsense and continue all the dra...,"[(talk, NN), (all, PDT), (the, DT), (nonsense,..."
2,what did just say vote for modi welcome bjp t...,"[(what, WP), (did, VBD), (just, RB), (say, VB)..."
3,asking his supporters prefix chowkidar their n...,"[(asking, VBG), (his, PRP$), (supporters, NNS)..."
4,answer who among these the most powerful world...,"[(answer, NN), (who, WP), (among, IN), (these,..."



Performing lemmatization and stopword removal...
Lemmatization and stopword removal complete. A new column 'lemmatized_text' has been added.

Examples of lemmatized text:


Unnamed: 0,cleaned_tweet_text,pos_tags,lemmatized_text
0,when modi promised “minimum government maximum...,"[(when, WRB), (modi, NN), (promised, VBD), (“,...",modi promise minimum government maximum govern...
1,talk all the nonsense and continue all the dra...,"[(talk, NN), (all, PDT), (the, DT), (nonsense,...",talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,"[(what, WP), (did, VBD), (just, RB), (say, VB)...",say vote modi welcome bjp tell rahul main camp...
3,asking his supporters prefix chowkidar their n...,"[(asking, VBG), (his, PRP$), (supporters, NNS)...",ask supporter prefix chowkidar name modi great...
4,answer who among these the most powerful world...,"[(answer, NN), (who, WP), (among, IN), (these,...",answer among powerful world leader today trump...


In [19]:
from collections import defaultdict

if not df.empty and 'pos_tags' in df.columns:
    print("\nBuilding HMM parameters...")

    # Initialize dictionaries for HMM parameters with Laplace smoothing (add 1)
    # Laplace smoothing ensures no zero probabilities, which can cause issues with logarithms.
    initial_probabilities = defaultdict(lambda: 1)
    transition_probabilities = defaultdict(lambda: defaultdict(lambda: 1))
    emission_probabilities = defaultdict(lambda: defaultdict(lambda: 1))
    tag_counts = defaultdict(lambda: 0) # Used to normalize probabilities later

    all_tags_set = set() # To collect all unique tags encountered in the dataset

    # Process each tweet's POS tags to build up counts for HMM parameters
    for tags_sequence in df['pos_tags']:
        if tags_sequence:
            # Initial probabilities: increment count for the first tag in each sequence
            first_tag = tags_sequence[0][1] # (word, tag) -> tag
            initial_probabilities[first_tag] += 1
            all_tags_set.add(first_tag)

            # Transition and Emission probabilities: iterate through each word-tag pair
            for i, (word, tag) in enumerate(tags_sequence):
                tag_counts[tag] += 1 # Count occurrences of each tag
                all_tags_set.add(tag)
                # Emission probabilities: count how many times a word is 'emitted' by a tag
                emission_probabilities[tag][word.lower()] += 1 # Use lowercase for words for consistency

                # Transition probabilities: count transitions from previous tag to current tag
                if i > 0:
                    prev_tag = tags_sequence[i-1][1]
                    transition_probabilities[prev_tag][tag] += 1

    # Convert tag_counts to a regular dict for easier access/iteration (though defaultdict works too)
    tag_counts_dict = dict(tag_counts)

    # Normalize probabilities for Initial, Transition, and Emission parameters
    # Normalize Initial probabilities
    total_initial_count = sum(initial_probabilities.values())
    initial_probabilities = {tag: count / total_initial_count for tag, count in initial_probabilities.items()}

    # Normalize Transition probabilities
    for prev_tag, next_tag_counts in transition_probabilities.items():
        total_transitions_from_prev_tag = sum(next_tag_counts.values())
        # Ensure we don't divide by zero if a tag never transitions to anything else
        if total_transitions_from_prev_tag > 0:
            transition_probabilities[prev_tag] = {next_tag: count / total_transitions_from_prev_tag for next_tag, count in next_tag_counts.items()}
        else:
            # If a tag never transitions, its probabilities remain as the smoothed default (1/N, where N is num of possible next states)
            transition_probabilities[prev_tag] = {next_tag: 1/len(all_tags_set) for next_tag in all_tags_set} # Fallback smoothing

    # Normalize Emission probabilities
    for tag, word_counts in emission_probabilities.items():
        total_words_for_tag = sum(word_counts.values())
        # Ensure we don't divide by zero if a tag never emits a word (unlikely with smoothing but good practice)
        if total_words_for_tag > 0:
            emission_probabilities[tag] = {word: count / total_words_for_tag for word, count in word_counts.items()}
        else:
            # Fallback smoothing for emission if a tag has no observed words (highly unlikely)
            emission_probabilities[tag] = {word: 1/len(word_frequencies) for word in word_frequencies} # Use total unique words

    print("HMM parameters built successfully.")
    print("\n--- HMM Parameters Examples ---")
    print("\nInitial Probabilities (top 5):")
    print(dict(list(initial_probabilities.items())[:5]))
    print("\nTransition Probabilities (first 3 tags, top 3 transitions each):")
    for i, (prev_tag, next_tag_probs) in enumerate(transition_probabilities.items()):
        if i >= 3: break
        print(f"  {prev_tag}: {dict(list(next_tag_probs.items())[:3])}")
    print("\nEmission Probabilities (first 3 tags, top 3 emissions each):")
    for i, (tag, word_probs) in enumerate(emission_probabilities.items()):
        if i >= 3: break
        print(f"  {tag}: {dict(list(word_probs.items())[:3])}")

else:
    print("DataFrame is empty or 'pos_tags' column not found. Cannot build HMM parameters.")


Building HMM parameters...
HMM parameters built successfully.

--- HMM Parameters Examples ---

Initial Probabilities (top 5):
{'WRB': 0.030844922672981247, 'NN': 0.3353557165555276, 'WP': 0.017520505002791256, 'VBG': 0.019741241281156254, 'JJ': 0.13076578593819974}

Transition Probabilities (first 3 tags, top 3 transitions each):
  WRB: {'NN': 0.26288186705126526, 'VBZ': 0.028648223416808706, 'JJS': 0.0008522354792185657}
  NN: {'VBD': 0.06080158149560789, 'JJ': 0.054530325960094043, 'NNP': 0.008668070206789487}
  VBD: {'NNP': 0.0049813780260707635, 'PRP': 0.03550279329608939, 'RB': 0.09728119180633148}

Emission Probabilities (first 3 tags, top 3 emissions each):
  WRB: {'when': 0.2616143264988321, 'why': 0.351901116013496, 'how': 0.2508434985725409}
  NN: {'modi': 0.09749772590027594, 'government': 0.005617408520038831, 'governance': 0.0004959066204967092}
  VBD: {'promised': 0.006117570450592479, 'expected': 0.000676874041452109, 'did': 0.06012183732746138}


In [20]:
import math

if not df.empty and 'pos_tags' in df.columns and transition_probabilities:
    print("\n--- Analyzing Transition Probability Irregularities ---")

    def print_top_transitions(prev_tag, num=5):
        """
        Prints the top N most probable next tags for a given previous tag.

        Args:
            prev_tag (str): The POS tag for which to show transitions.
            num (int): The number of top transitions to display.
        """
        if prev_tag in transition_probabilities:
            # Sort transitions by probability in descending order
            sorted_transitions = sorted(transition_probabilities[prev_tag].items(), key=lambda item: item[1], reverse=True)
            print(f"  Top {num} transitions from '{prev_tag}':")
            for next_tag, prob in sorted_transitions[:num]:
                print(f"    -> {next_tag}: {prob:.4f}")
        else:
            print(f"  No transitions found for '{prev_tag}'.")

    print("\nExamples of Transition Probabilities for common tags:")
    print_top_transitions('NN') # Noun
    print_top_transitions('VB') # Verb, base form
    print_top_transitions('JJ') # Adjective
    print_top_transitions('DT') # Determiner
    print_top_transitions('IN') # Preposition/subordinating conjunction

    # Calculate entropy for transition probabilities for each previous tag
    # Entropy quantifies the predictability; higher entropy means less predictable next tags.
    # Formula: Entropy = - sum(p * log(p))
    transition_entropies = {}
    for prev_tag, next_tag_probs in transition_probabilities.items():
        entropy = 0
        for prob in next_tag_probs.values():
            if prob > 0: # Avoid log(0) which is undefined
                entropy -= prob * math.log(prob, 2) # Using log base 2 for bits
        transition_entropies[prev_tag] = entropy

    # Sort tags by entropy to find those with most/least predictable transitions
    sorted_entropies = sorted(transition_entropies.items(), key=lambda item: item[1])

    print("\nTags with the lowest transition entropy (most predictable next tags):")
    for tag, entropy in sorted_entropies[:5]: # Display top 5 lowest entropy tags
        print(f"  '{tag}': {entropy:.4f} bits")
        print_top_transitions(tag, num=3) # Show top 3 transitions for context

    print("\nTags with the highest transition entropy (least predictable next tags):")
    for tag, entropy in sorted_entropies[-5:]: # Display top 5 highest entropy tags
        print(f"  '{tag}': {entropy:.4f} bits")
        print_top_transitions(tag, num=3) # Show top 3 transitions for context

else:
    print("HMM parameters (transition_probabilities) not available for analysis.")


--- Analyzing Transition Probability Irregularities ---

Examples of Transition Probabilities for common tags:
  Top 5 transitions from 'NN':
    -> NN: 0.3595
    -> IN: 0.0932
    -> VBD: 0.0608
    -> NNS: 0.0592
    -> JJ: 0.0545
  Top 5 transitions from 'VB':
    -> NN: 0.1905
    -> DT: 0.1486
    -> JJ: 0.1187
    -> PRP$: 0.0864
    -> IN: 0.0790
  Top 5 transitions from 'JJ':
    -> NN: 0.5665
    -> NNS: 0.1530
    -> JJ: 0.0995
    -> IN: 0.0329
    -> RB: 0.0208
  Top 5 transitions from 'DT':
    -> NN: 0.5083
    -> JJ: 0.1983
    -> NNS: 0.1178
    -> JJS: 0.0183
    -> RB: 0.0168
  Top 5 transitions from 'IN':
    -> NN: 0.3217
    -> JJ: 0.1733
    -> DT: 0.1480
    -> NNS: 0.0720
    -> PRP: 0.0532

Tags with the lowest transition entropy (most predictable next tags):
  'SYM': 0.0000 bits
  Top 3 transitions from 'SYM':
    -> NN: 1.0000
  '``': 0.0000 bits
  Top 3 transitions from '``':
    -> RB: 1.0000
  '$': 0.0725 bits
  Top 3 transitions from '$':
    -> CD: 0.9

In [21]:
from collections import defaultdict

print("\n--- Analyzing Rare and Low-Probability Tokens ---")

# Identify words with the lowest emission probabilities for each tag
print("\nTop 5 words with the lowest emission probabilities for each of the first 5 tags:")
# Iterate through a few tags to show examples
for i, (tag, word_probs) in enumerate(emission_probabilities.items()):
    if i >= 5: break # Limit to the first 5 tags for brevity
    if word_probs:
        # Sort words by their emission probability in ascending order
        sorted_words = sorted(word_probs.items(), key=lambda item: item[1])
        print(f"  Tag '{tag}':")
        for word, prob in sorted_words[:5]: # Display top 5 lowest probability words
            print(f"    '{word}': {prob:.8f}")
    else:
        print(f"  Tag '{tag}': No emission probabilities found.")

# Calculate overall word frequencies to find truly rare words in the dataset
word_frequencies = defaultdict(int)
for tags_sequence in df['pos_tags']:
    if tags_sequence:
        for word, _ in tags_sequence:
            word_frequencies[word.lower()] += 1 # Count all word occurrences, lowercased

# Filter for words that appeared only once (singletons) in the entire dataset
singletons = {word: count for word, count in word_frequencies.items() if count == 1}

print("\nExamples of words that appeared only once in the entire dataset (singletons):")
if singletons:
    # Display first 10 singletons for example
    for i, (word, count) in enumerate(list(singletons.items())[:10]):
        print(f"  '{word}' (count: {count})")
        # Attempt to find and print their emission probability if available
        found_tags_for_singleton = []
        for tags_sequence_full in df['pos_tags']:
            if tags_sequence_full:
                for w, t in tags_sequence_full:
                    if w.lower() == word:
                        found_tags_for_singleton.append(t)
                        break # Only need one tag for example, assuming consistent tagging
                if found_tags_for_singleton: break # Stop after finding the first tag

        if found_tags_for_singleton and found_tags_for_singleton[0] in emission_probabilities and word in emission_probabilities[found_tags_for_singleton[0]]:
             print(f"    Emission probability for tag '{found_tags_for_singleton[0]}': {emission_probabilities[found_tags_for_singleton[0]][word]:.8f}")
        else:
            # Due to Laplace smoothing, a word will always have a small probability,
            # but if the tag itself wasn't in emission_probabilities or the word wasn't
            # directly in its specific smoothed entry, it might be a general default.
            print(f"    Emission probability for tag not found or word not in emission for its tag (due to smoothing, this might be a small default value).")
else:
    print("  No singletons found.")



--- Analyzing Rare and Low-Probability Tokens ---

Top 5 words with the lowest emission probabilities for each of the first 5 tags:
  Tag 'WRB':
    'mover': 0.00006488
    'wasn': 0.00006488
    'wld': 0.00006488
    'wiselythe': 0.00006488
    'whatsapp': 0.00006488
  Tag 'NN':
    'constituency2': 0.00000191
    'tuthukudi': 0.00000191
    'thuthukudi': 0.00000191
    'leadershipwho': 0.00000191
    'modiganga': 0.00000191
  Tag 'VBD':
    'disabilityif': 0.00001714
    'bursted': 0.00001714
    'mki': 0.00001714
    'stemmed': 0.00001714
    'piggybacked': 0.00001714
  Tag 'NNP':
    '➡': 0.00010847
    'िु': 0.00010847
    '⚡jai': 0.00010847
    'zayed': 0.00010847
    'mistakemodi': 0.00010847
  Tag 'JJ':
    'crustal': 0.00000494
    'maarkefir': 0.00000494
    'behaved': 0.00000494
    'likly': 0.00000494
    'insensitivearrogant': 0.00000494

Examples of words that appeared only once in the entire dataset (singletons):
  'crustal' (count: 1)
    Emission probability for tag '