<a href="https://colab.research.google.com/github/BhanuDanda/NLP/blob/main/27-10-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd, re, nltk, matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.tag import hmm
from nltk.corpus import treebank
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

nltk.download('punkt')
nltk.download('treebank')
nltk.download('punkt_tab')

print("\n=== Task 1: POS Tagging with HMM ===")
data = pd.read_csv("/content/tweets[1].csv")
tweets = data['text'].head(5)

def clean_tweet(t):
    t = t.lower()
    t = re.sub(r'@\w+|#\w+|http\S+', '', t)
    t = re.sub(r'[^a-z\s]', '', t)
    return t.strip()

cleaned = [clean_tweet(t) for t in tweets]
train_data = treebank.tagged_sents()[:3000]
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(train_data)

for t in cleaned:
    tokens = word_tokenize(t)
    tags = hmm_tagger.tag(tokens)
    print(f"\nTweet: {t}\nPOS Tags: {tags}")

print("\n POS tagging helps find action words like 'need', 'help', 'send', etc., which indicate urgent requests.")

print("\n=== Task 2: Word2Vec + Visualization ===")
tokenized = [word_tokenize(t) for t in cleaned]
model = Word2Vec(sentences=tokenized, vector_size=50, window=3, min_count=1, sg=1)

words = ["water", "food", "shelter", "flood", "earthquake", "storm", "help", "need"]
vecs = [model.wv[w] for w in words if w in model.wv]
vecs_array = np.array(vecs)

if vecs_array.shape[0] > 1:
    perplexity_val = min(30, vecs_array.shape[0] - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_val)
    reduced = tsne.fit_transform(vecs_array)

    plt.figure(figsize=(8,6))
    for i, w in enumerate(words):
        if w in model.wv:
            plt.scatter(reduced[i,0], reduced[i,1])
            plt.text(reduced[i,0]+0.02, reduced[i,1]+0.02, w, fontsize=10)
    plt.title("t-SNE Visualization of Word Embeddings")
    plt.xlabel("Dim 1"); plt.ylabel("Dim 2")
    plt.show()
else:
    print("Not enough word vectors to perform t-SNE visualization.")


print("\n Words like ['water','food','shelter'] cluster together (relief), while ['flood','storm'] form disaster clusters.")

print("\n=== Task 3: Naive Bayes Classification ===")
X = data['text'].apply(clean_tweet)
y = data['target']
vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("\n Naive Bayes gives a good baseline, but CNN/LSTM models can better capture context in noisy tweets.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



=== Task 1: POS Tagging with HMM ===

Tweet: communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze
POS Tags: [('communal', 'NNP'), ('violence', 'NNP'), ('in', 'NNP'), ('bhainsa', 'NNP'), ('telangana', 'NNP'), ('stones', 'NNP'), ('were', 'NNP'), ('pelted', 'NNP'), ('on', 'NNP'), ('muslims', 'NNP'), ('houses', 'NNP'), ('and', 'NNP'), ('some', 'NNP'), ('houses', 'NNP'), ('and', 'NNP'), ('vehicles', 'NNP'), ('were', 'NNP'), ('set', 'NNP'), ('ablaze', 'NNP')]

Tweet: telangana section  has been imposed in bhainsa from january  to  after clash erupted between two groups on january  po
POS Tags: [('telangana', 'NNP'), ('section', 'NNP'), ('has', 'NNP'), ('been', 'NNP'), ('imposed', 'NNP'), ('in', 'NNP'), ('bhainsa', 'NNP'), ('from', 'NNP'), ('january', 'NNP'), ('to', 'NNP'), ('after', 'NNP'), ('clash', 'NNP'), ('erupted', 'NNP'), ('between', 'NNP'), ('two', 'NNP'), ('groups', 'NNP'), ('on', 'NNP'), ('january', 'NNP'), ('po', 