<a href="https://colab.research.google.com/github/2303A51786/nlp/blob/main/PROJECT_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import Input

# Fetch the NLTK resources used by the cleaning step below; 'punkt_tab' was
# added because the tokenizer reported it as a missing resource.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Read the CSV with the more tolerant Python parser; malformed rows are skipped.
df = pd.read_csv(
    "english_news_dataset.csv",
    engine='python',
    on_bad_lines='skip',
)
print("Dataset shape:", df.shape)
print(df.head())

_ENGLISH_STOP_WORDS = None  # lazily-built cache shared by every clean_text() call


def _english_stop_words():
    """Return the NLTK English stop-word set, building it on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text):
    """Normalise one piece of text into a space-joined string of tokens.

    Steps: lower-case, strip URLs, replace every character that is not a
    lowercase ASCII letter or whitespace with a space, tokenize with
    ``word_tokenize``, then drop English stop words and one-letter tokens.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned tokens joined by single spaces ("" for missing input or
        when no token survives the filtering).
    """
    # Missing values would otherwise be stringified into the tokens "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-z\s]", " ", text)

    # The stop-word set is loop-invariant: fetch the cached copy instead of
    # rebuilding it from the corpus for every row.
    stop_words = _english_stop_words()
    tokens = [w for w in word_tokenize(text) if len(w) > 1 and w not in stop_words]
    return " ".join(tokens)

# Clean every headline, store the result as a new column and preview it.
cleaned_headlines = df["Headline"].apply(clean_text)
df["clean_text"] = cleaned_headlines
print(df["clean_text"].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Dataset shape: (76018, 4)
                                            Headline  \
0  Congress leader Baljinder Singh shot dead at h...   
1  17-year-old girl preparing for NEET dies by su...   
2  Hampers to welcome MPs in new Parliament tomor...   
3  Only 10% women lawmakers in RS, while only 14%...   
4  Ganesh temple decorated with notes, coins wort...   

                                             Content News Categories  \
0  Congress leader Baljinder Singh was shot dead ...    ['national']   
1  Another NEET aspirant died by suicide in Rajas...    ['national']   
2  In order to mark the first-ever working day of...    ['national']   
3  Congress President Mallikarjun Kharge, while s...    ['national']   
4  The Sri Sathya Ganapathi Temple in Bengaluru a...    ['national']   

         Date  
0  19-09-2023  
1  19-09-2023  
2  19-09-2023  
3  19-09-2023  
4  19-09-2023  
0    congress leader baljinder singh shot dead home...
1    year old girl preparing neet dies suicide raja..

In [None]:
from gensim.models import Word2Vec

# Split each cleaned headline on whitespace to obtain its token list.
sentences = [headline.split() for headline in df["clean_text"]]

# Fit a Word2Vec model on the tokenised headlines.
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
model_w2v = Word2Vec(sentences, **w2v_params)
keyed_vectors = model_w2v.wv
print("Vocabulary size:", len(keyed_vectors.index_to_key))

# Show the nearest neighbours of the first entry in the learned vocabulary.
word = keyed_vectors.index_to_key[0]
print(f"Most similar words to '{word}':")
print(keyed_vectors.most_similar(word))

Vocabulary size: 13204
Most similar words to 'india':
[('trade', 0.4408549964427948), ('immunity', 0.40714529156684875), ('lineup', 0.40662121772766113), ('see', 0.4007681906223297), ('escalation', 0.3967105746269226), ('projected', 0.39421820640563965), ('basis', 0.39131590723991394), ('jaishankar', 0.39049383997917175), ('dogg', 0.36691123247146606), ('uthappa', 0.3608863651752472)]


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pretrained DistilBERT tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Encode the first 5 cleaned headlines as one padded batch.
texts = df["clean_text"].head(5).tolist()
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Build one vector per sentence by averaging its token states.
with torch.no_grad():
    outputs = model(**inputs)
    token_states = outputs.last_hidden_state
    # padding=True appends pad tokens to the shorter sentences; weight every
    # position by the attention mask so those pads do not dilute the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (token_states * mask).sum(dim=1) / token_counts

print("Embeddings shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Embeddings shape: torch.Size([5, 768])


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Hyper-parameters named once so the tokenizer, the padding and the model
# cannot drift apart.
LSTM_VOCAB_SIZE = 10000  # number of word indices kept by the tokenizer
LSTM_MAX_LEN = 40        # padded / truncated sequence length

# Prepare data: map the cleaned headlines to fixed-length integer sequences.
texts = df["clean_text"].values
tokenizer = Tokenizer(num_words=LSTM_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=LSTM_MAX_LEN, padding="post")

# Dummy target (since dataset unlabeled): parity of each headline's word count.
y = np.array([len(t.split()) % 2 for t in texts])

X_train, X_val, y_train, y_val = train_test_split(padded, y, test_size=0.2, random_state=42)

# LSTM model. The input shape is declared with an explicit Input layer
# because Embedding's `input_length` argument is deprecated in Keras 3;
# this also lets summary() report real shapes before training.
model_lstm = Sequential([
    Input(shape=(LSTM_MAX_LEN,)),
    Embedding(LSTM_VOCAB_SIZE, 128),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

# Train
model_lstm.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))




Epoch 1/3
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 42ms/step - accuracy: 0.4983 - loss: 0.6937 - val_accuracy: 0.5075 - val_loss: 0.6930
Epoch 2/3
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 47ms/step - accuracy: 0.5000 - loss: 0.6933 - val_accuracy: 0.5075 - val_loss: 0.6931
Epoch 3/3
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 45ms/step - accuracy: 0.5061 - loss: 0.6931 - val_accuracy: 0.5075 - val_loss: 0.6931


<keras.src.callbacks.history.History at 0x7cc662300170>

In [None]:
from tensorflow.keras.layers import Bidirectional

# BiLSTM model: same stack as the LSTM baseline, with the recurrent layer
# wrapped in Bidirectional. The input shape is declared with an explicit
# Input layer because Embedding's `input_length` argument is deprecated in
# Keras 3. 40 must match the `maxlen` used when the sequences were padded,
# and 10000 the tokenizer's `num_words`.
model_bilstm = Sequential([
    Input(shape=(40,)),
    Embedding(10000, 128),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bilstm.summary()

# Train
model_bilstm.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 177ms/step - accuracy: 0.5025 - loss: 0.6920 - val_accuracy: 0.5574 - val_loss: 0.6822
Epoch 2/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 173ms/step - accuracy: 0.5838 - loss: 0.6526 - val_accuracy: 0.5787 - val_loss: 0.6687
Epoch 3/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 135ms/step - accuracy: 0.8454 - loss: 0.5076 - val_accuracy: 0.5872 - val_loss: 0.7914


<keras.src.callbacks.history.History at 0x7cdba5f52450>

In [None]:
!pip install gensim tensorflow keras numpy pandas scikit-learn

import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




In [None]:
# Recreate the cleaned-headline column when it is absent (for example after
# `df` has been reloaded). Relies on the `clean_text` function defined earlier
# still being in scope.
has_clean_column = 'clean_text' in df.columns
if not has_clean_column:
    df['clean_text'] = df['Headline'].apply(clean_text)

# Inputs are the cleaned headlines; targets are the 'News Categories' values
# taken as plain strings.
texts = df['clean_text'].astype(str).values
raw_labels = df['News Categories'].astype(str).values

# Map each distinct category string to an integer class id.
encoder = LabelEncoder()
labels = encoder.fit_transform(raw_labels)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
max_words = 10000  # number of word indices the tokenizer keeps
max_len = 100      # length every sequence is padded / truncated to

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences, then pad them to a common length.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)


In [None]:
# Dimensionality shared by the Word2Vec vectors and the Embedding layer.
embedding_dim = 100

# Train Word2Vec on the whitespace-tokenised training texts.
w2v_model = Word2Vec(
    [text.split() for text in X_train],
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Create the embedding matrix: row i holds the Word2Vec vector of the word
# whose tokenizer index is i; row 0 and words unknown to Word2Vec stay all-zero.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Build the LSTM classifier on top of the frozen Word2Vec embeddings. The
# input shape is declared with an explicit Input layer because Embedding's
# `input_length` argument is deprecated in Keras 3.
model_w2v = Sequential([
    Input(shape=(max_len,)),
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(labels)), activation='softmax')  # one unit per class
])

# Labels are integer class ids, hence the sparse categorical loss.
model_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_w2v.summary()

# Train
model_w2v.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=64)




Epoch 1/5
[1m2353/2353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 265ms/step - accuracy: 0.3272 - loss: 3.0041 - val_accuracy: 0.5746 - val_loss: 1.5580
Epoch 2/5
[1m2353/2353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m697s[0m 272ms/step - accuracy: 0.5425 - loss: 1.6940 - val_accuracy: 0.6689 - val_loss: 1.1727
Epoch 3/5
[1m2353/2353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m611s[0m 260ms/step - accuracy: 0.6055 - loss: 1.4120 - val_accuracy: 0.7124 - val_loss: 0.9939
Epoch 4/5
[1m2353/2353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m612s[0m 255ms/step - accuracy: 0.6399 - loss: 1.2531 - val_accuracy: 0.7400 - val_loss: 0.8855
Epoch 5/5
[1m2353/2353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 255ms/step - accuracy: 0.6650 - loss: 1.1562 - val_accuracy: 0.7584 - val_loss: 0.8105


<keras.src.callbacks.history.History at 0x7cc64f2df9e0>

In [None]:
# Download GloVe embeddings (use 100D)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

embeddings_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
embedding_matrix_glove = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_glove[i] = embedding_vector

# Build LSTM model using GloVe
model_glove = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix_glove], input_length=max_len, trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(labels)), activation='softmax')
])

model_glove.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_glove.summary()

# Train
model_glove.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=64)


--2025-11-16 08:11:32--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-11-16 08:11:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-11-16 08:11:33--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’

glov

In [None]:
# Score both embedding variants on the held-out split. Each model was
# compiled with a single 'accuracy' metric, so evaluate() gives (loss, accuracy).
w2v_metrics = model_w2v.evaluate(X_test_pad, y_test)
glove_metrics = model_glove.evaluate(X_test_pad, y_test)
w2v_loss, w2v_acc = w2v_metrics
glove_loss, glove_acc = glove_metrics

# Report the two accuracies together.
print(f"Word2Vec LSTM Accuracy: {w2v_acc:.2f}")
print(f"GloVe LSTM Accuracy: {glove_acc:.2f}")
