<a href="https://colab.research.google.com/github/2403a52030-sketch/NLP-LAB/blob/main/NLP_LAB_06_2403A52030.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from collections import defaultdict, Counter


In [None]:
# Tokenizer and POS-tagger models needed by word_tokenize / pos_tag_sents.
# NLTK >= 3.9 looks these resources up under new names ('punkt_tab',
# 'averaged_perceptron_tagger_eng'), while older releases use the original
# names, so fetch both sets to keep the notebook runnable on either.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Location of the tweet dataset (example file name)
DATA_PATH = '/content/Twitter_Data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows to check the columns
df.head()


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
def clean_tweet(text):
    """Normalise a raw tweet for tokenization.

    Removes URLs, @mentions and the '#' symbol (the hashtag word itself is
    kept), lower-cases the text and strips leading/trailing whitespace.
    Whitespace left behind inside the text by removed items is not collapsed.

    Args:
        text: The tweet as a string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Match the scheme case-insensitively: lower-casing only happens at the
    # end, so without the flag links written as 'HTTP://...' would survive.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)  # remove URLs
    text = re.sub(r'@\w+', '', text)                          # remove mentions
    text = re.sub(r'#', '', text)                             # remove hashtag symbol
    text = text.lower()
    return text.strip()

# Clean every tweet. Missing values are skipped (na_action='ignore') so they
# stay NaN and can still be removed with .dropna(); casting the whole column
# with .astype(str) would turn them into the literal text "nan".
df['clean_text'] = df['clean_text'].map(
    lambda text: clean_tweet(str(text)), na_action='ignore'
)
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
# Cap on how many tweets are tokenized and tagged.
MAX_TWEETS = 5000

# Materialise the column as a plain Python list first, so the tokenization
# loop iterates over native strings instead of a pandas Series.
tweets = df['clean_text'].dropna().head(MAX_TWEETS).tolist()

# Tokenize each non-blank tweet
tokenized_tweets = [
    nltk.word_tokenize(tweet)
    for tweet in tweets
    if tweet.strip() != ""
]

# Tag all tweets in a single call
tagged_sentences = nltk.pos_tag_sents(tokenized_tweets)

# Show the (word, tag) pairs of the first tweet
tagged_sentences[0]


[('when', 'WRB'),
 ('modi', 'NN'),
 ('promised', 'VBD'),
 ('“', 'NNP'),
 ('minimum', 'JJ'),
 ('government', 'NN'),
 ('maximum', 'JJ'),
 ('governance', 'NN'),
 ('”', 'NNP'),
 ('expected', 'VBD'),
 ('him', 'PRP'),
 ('begin', 'VB'),
 ('the', 'DT'),
 ('difficult', 'JJ'),
 ('job', 'NN'),
 ('reforming', 'VBG'),
 ('the', 'DT'),
 ('state', 'NN'),
 ('why', 'WRB'),
 ('does', 'VBZ'),
 ('take', 'VB'),
 ('years', 'NNS'),
 ('get', 'VB'),
 ('justice', 'NN'),
 ('state', 'NN'),
 ('should', 'MD'),
 ('and', 'CC'),
 ('not', 'RB'),
 ('business', 'NN'),
 ('and', 'CC'),
 ('should', 'MD'),
 ('exit', 'VB'),
 ('psus', 'NN'),
 ('and', 'CC'),
 ('temples', 'NNS')]

In [None]:
# transition_counts[prev_tag][tag] -> number of times `tag` follows `prev_tag`
transition_counts = defaultdict(Counter)
# tag_counts[tag] -> number of tokens carrying `tag`
tag_counts = Counter()

for tagged in tagged_sentences:
    tag_sequence = [pos for _, pos in tagged]
    # Pair every tag with its predecessor; the first tag follows '<START>'.
    predecessors = ['<START>'] + tag_sequence
    for previous, current in zip(predecessors, tag_sequence):
        transition_counts[previous][current] += 1
    tag_counts.update(tag_sequence)


In [None]:
# transition_probs[prev_tag][tag] -> relative frequency of `tag` after `prev_tag`
transition_probs = defaultdict(dict)

for source_tag, followers in transition_counts.items():
    if not followers:
        continue  # no successors recorded for this tag
    denominator = sum(followers.values())
    transition_probs[source_tag] = {
        next_tag: seen / denominator for next_tag, seen in followers.items()
    }


In [None]:
# emission_counts[tag][word] -> number of times `word` was tagged as `tag`
emission_counts = defaultdict(Counter)

# Flatten the tagged sentences into one stream of (word, tag) pairs
tagged_tokens = (pair for tagged in tagged_sentences for pair in tagged)
for token, pos in tagged_tokens:
    emission_counts[pos][token] += 1


In [None]:
# emission_probs[tag][word] -> relative frequency of `word` among tokens tagged `tag`
emission_probs = defaultdict(dict)

for pos, word_counts in emission_counts.items():
    if not word_counts:
        continue  # no words recorded for this tag
    tag_total = sum(word_counts.values())
    emission_probs[pos] = {
        token: seen / tag_total for token, seen in word_counts.items()
    }


In [None]:
# Show the full distribution of sentence-initial tags: for every tag that
# opened at least one tagged tweet, the fraction of tweets starting with it.
# Very small values point at unusual sentence starts.
transition_probs['<START>']


{'WRB': 0.032674964178349676,
 'NN': 0.32899727474503415,
 'WP': 0.018936307700952434,
 'VBG': 0.018823926052875566,
 'JJ': 0.11918073778551963,
 'DT': 0.07658809316438625,
 'IN': 0.03573736408844436,
 'NNS': 0.06793470626246734,
 'CD': 0.014834377546146714,
 'VBD': 0.012951984940859158,
 'VBZ': 0.008063383249515355,
 'RB': 0.10145253280139353,
 'VB': 0.04618885735959318,
 'PRP$': 0.014862472958165932,
 'JJR': 0.001067625656730256,
 'MD': 0.019357738881240694,
 'PRP': 0.03169162475767707,
 'VBN': 0.01149102351585986,
 'VBP': 0.006040513584131712,
 'CC': 0.019020593937010086,
 'EX': 0.0021071559014412948,
 'RBR': 0.0016857247211530357,
 'JJS': 0.003427640266344506,
 'UH': 0.0004495265923074762,
 'WDT': 0.0033995448543252885,
 'RBS': 0.0005619082403843452,
 'PDT': 0.0021071559014412948,
 'FW': 0.00016857247211530358,
 'WP$': 0.0001404770600960863,
 'NNP': 5.619082403843452e-05}

In [None]:
# Inspect one row of the transition table. Index 5 is not random: it picks the
# sixth (previous tag, distribution) pair in the dict's insertion order.
list(transition_probs.items())[5]


('JJ',
 {'NN': 0.5563502991458199,
  'JJ': 0.09985743356038358,
  'WP': 0.0022177001718108377,
  'VB': 0.009955280441590407,
  'NNS': 0.15597012197350946,
  'VBZ': 0.006080397723811032,
  'VBP': 0.012514165255218297,
  'IN': 0.033557946555862896,
  'CC': 0.018716414636821135,
  'VBN': 0.004727844322322005,
  'RBS': 0.00030462914447951066,
  'RB': 0.02139715110824083,
  'CD': 0.01258727624989338,
  'PRP$': 0.004740029488101186,
  'DT': 0.011259093179962713,
  'VBG': 0.014220088464303556,
  'VBD': 0.007055210986145467,
  'PRP': 0.005385843274397748,
  'MD': 0.004203882193817247,
  'NNP': 0.006470323028744806,
  'JJS': 0.001571886385514275,
  'RBR': 0.00029244397870033023,
  'FW': 0.001888700695772966,
  'WRB': 0.004715659156542824,
  'JJR': 0.0019130710273313268,
  'WDT': 0.0009748132623344341,
  'PDT': 0.00036555497337541275,
  'RP': 0.00021933298402524766,
  'EX': 0.0002315181498044281,
  "''": 1.2185165779180425e-05,
  'WP$': 9.74813262334434e-05,
  '$': 2.437033155836085e-05,
  'UH':

In [None]:
# Corpus-wide frequency of every token, ignoring its tag
word_freq = Counter(
    token
    for tagged in tagged_sentences
    for token, _ in tagged
)


In [None]:
# Collect the words that occur exactly once in the tagged tweets
rare_words = []
for token, occurrences in word_freq.items():
    if occurrences == 1:
        rare_words.append(token)

len(rare_words)


24666

In [None]:
test_tweet = "love this movie sooo much"
# Preprocess the test tweet exactly like the training tweets (URLs, mentions
# and '#' removed, lower-cased) so its tokens are comparable with the words
# the emission table was built from.
tokens = nltk.word_tokenize(clean_tweet(test_tweet))
tokens


['love', 'this', 'movie', 'sooo', 'much']

In [None]:
def viterbi(tokens, transition_probs, emission_probs, tag_counts, unseen_prob=1e-6):
    """Find the most probable tag sequence for ``tokens`` with the Viterbi algorithm.

    Scores are accumulated in log space. Any transition or emission that is
    missing from the tables is given probability ``unseen_prob`` instead of zero.

    Args:
        tokens: Sequence of word strings to tag.
        transition_probs: Mapping ``prev_tag -> {tag: P(tag | prev_tag)}``; the
            row for the sentence start is keyed by ``'<START>'``.
        emission_probs: Mapping ``tag -> {word: P(word | tag)}``.
        tag_counts: Mapping whose keys are the candidate tags.
        unseen_prob: Floor probability used for unseen transitions/emissions.

    Returns:
        A list of ``(token, tag)`` pairs, one per token; ``[]`` for empty input.

    Raises:
        ValueError: If ``tokens`` is non-empty but ``tag_counts`` has no tags.
    """
    if len(tokens) == 0:
        return []

    tags = list(tag_counts.keys())
    if not tags:
        raise ValueError("tag_counts must contain at least one tag")

    def log_prob(table, row, column):
        # Use .get() on the outer mapping too: the lookup then never inserts
        # rows into a defaultdict and also works for plain dicts with gaps.
        return np.log(table.get(row, {}).get(column, unseen_prob))

    # Transition scores do not depend on the position, so compute them once.
    log_trans = {
        pt: {tag: log_prob(transition_probs, pt, tag) for tag in tags}
        for pt in tags
    }

    # Initialization
    V = [{}]
    path = {}
    for tag in tags:
        V[0][tag] = (log_prob(transition_probs, '<START>', tag)
                     + log_prob(emission_probs, tag, tokens[0]))
        path[tag] = [tag]

    # Recursion
    for t in range(1, len(tokens)):
        V.append({})
        new_path = {}

        for tag in tags:
            log_emit = log_prob(emission_probs, tag, tokens[t])
            # Best predecessor for `tag`; equal scores fall back to comparing
            # the tag names because the candidates are (score, tag) tuples.
            prob, prev_tag = max(
                (V[t-1][pt] + log_trans[pt][tag] + log_emit, pt)
                for pt in tags
            )

            V[t][tag] = prob
            new_path[tag] = path[prev_tag] + [tag]

        path = new_path

    # Termination
    final_tag = max(V[-1], key=V[-1].get)
    return list(zip(tokens, path[final_tag]))


In [None]:
# Decode the example tweet's tokens with the transition and emission tables
# estimated from the tagged tweets; the result is a list of (token, tag) pairs.
viterbi(tokens, transition_probs, emission_probs, tag_counts)


[('love', 'VB'),
 ('this', 'DT'),
 ('movie', 'NN'),
 ('sooo', 'NN'),
 ('much', 'RB')]

# Task
Optimize the NLTK tagging loop by converting the `clean_text` column of the DataFrame `df` into a list before iterating over it for tokenization and POS tagging.

## Optimize NLTK tagging loop

### Subtask:
Convert the 'clean_text' column to a list before iterating to potentially speed up the NLTK tokenization and POS tagging process. This reduces Pandas Series overhead during the loop.


## Summary:

### Data Analysis Key Findings
The `clean_text` column is converted to a plain Python list before NLTK tokenization and Part-of-Speech (POS) tagging. Iterating over a list avoids the per-element overhead of iterating directly over a Pandas Series, which is intended to speed up the NLTK tagging loop; the actual gain has not yet been measured.

### Insights or Next Steps
*   Quantify the performance improvement (e.g., execution time comparison) achieved by converting the column to a list versus iterating directly over the Pandas Series.
*   Explore alternative methods for vectorized text processing within Pandas or other libraries to further enhance efficiency for large datasets.
