<a href="https://colab.research.google.com/github/Devulapally-hansika/word2vec/blob/main/longds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ------------------ 1. Install & Import ------------------
!pip install -q nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import re
import io
import pandas as pd
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from google.colab import files

# ------------------ 2. Upload & Preprocess ------------------
# Ask the user for a corpus file. `files.upload()` returns a dict mapping each
# uploaded file name to its raw bytes; only the first file is used.
uploaded = files.upload()
if not uploaded:
    # Cancelling the upload dialog yields an empty dict; fail with a clear
    # message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded. Use .txt or .csv")
filename = next(iter(uploaded))
# Compare extensions case-insensitively so "DATA.TXT" / "data.CSV" are accepted.
lowered_name = filename.lower()

if lowered_name.endswith('.txt'):
    # Decode the uploaded bytes directly (same source the CSV branch uses)
    # rather than relying on the dict key matching a path on disk.
    raw_corpus = uploaded[filename].decode('utf-8').splitlines()
elif lowered_name.endswith('.csv'):
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
    if 'sentence' not in df.columns:
        raise ValueError("CSV input must contain a 'sentence' column")
    # Cast to str so numeric cells do not break the later `.lower()` call.
    raw_corpus = df['sentence'].dropna().astype(str).tolist()
else:
    raise ValueError("Unsupported file format. Use .txt or .csv")

# Shared preprocessing resources used by the cleaning helpers.
stop_words = set(stopwords.words('english'))
tokenizer = TreebankWordTokenizer()

def clean_and_tokenize(corpus):
    """Normalise every sentence of *corpus* and return the cleaned strings.

    Each sentence is lower-cased, stripped of every character that is not a
    lowercase ASCII letter or whitespace, tokenised with the module-level
    ``tokenizer`` and filtered against the module-level ``stop_words`` set.

    Args:
        corpus: iterable of raw sentence strings.

    Returns:
        A list with one space-joined string of kept tokens per input
        sentence (an empty string when nothing survives the filtering).
    """
    # The pattern previously read r'[^a-z\\s]'. Inside a raw string that
    # character class is "not a-z, not a backslash, not the letter s", so all
    # whitespace was deleted and each sentence collapsed into a single token.
    # r'[^a-z\s]' keeps letters and whitespace, which is what was intended.
    non_letter = re.compile(r'[^a-z\s]')

    cleaned = []
    for sentence in corpus:
        text = non_letter.sub('', sentence.lower())
        tokens = [word for word in tokenizer.tokenize(text) if word not in stop_words]
        cleaned.append(' '.join(tokens))
    return cleaned

corpus = clean_and_tokenize(raw_corpus)

# ------------------ 3. Build Vocabulary ------------------
# Sequences are right-padded with id 0 further down in this file, so index 0
# must not belong to a real word; otherwise padding is indistinguishable from
# the alphabetically first vocabulary entry. Reserve it for a pad token.
PAD_TOKEN = '<pad>'

tokenized_corpus = [sent.split() for sent in corpus]
all_words = [w for sent in tokenized_corpus for w in sent]
# Real words start at index 1; the set difference guards against a corpus
# token that happens to equal the pad marker.
vocab = [PAD_TOKEN] + sorted(set(all_words) - {PAD_TOKEN})
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# ------------------ 4. Create Hard NSP Data ------------------
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# One bag-of-words count vector per sentence, indexed by vocabulary id.
sentence_vectors = []
for sent in tokenized_corpus:
    vec = np.zeros(len(vocab))
    # Tally each distinct word once instead of incrementing per occurrence.
    for w, occurrences in Counter(sent).items():
        position = word2idx.get(w)
        if position is not None:
            vec[position] += occurrences
    sentence_vectors.append(vec)

# Pairwise cosine similarity between all sentences.
sentence_vectors = np.array(sentence_vectors)
sim_matrix = cosine_similarity(sentence_vectors)

def create_hard_nsp_dataset(sentences, sim_matrix, top_k=3):
    """Build labelled sentence pairs for next-sentence prediction.

    For every sentence except the last, one positive pair
    ``(sentence, following sentence, 1)`` is emitted, followed by up to
    ``top_k`` negative pairs ``(sentence, other sentence, 0)`` chosen as the
    most similar sentences according to ``sim_matrix``.

    Args:
        sentences: sequence of sentence strings in document order.
        sim_matrix: 2-D NumPy array where ``sim_matrix[i][j]`` is the
            similarity between sentences ``i`` and ``j``. It is not modified.
        top_k: maximum number of negatives per anchor sentence.

    Returns:
        A list of ``(sentence_a, sentence_b, label)`` tuples.
    """
    data = []
    n = len(sentences)
    # Two positions (the anchor and its true successor) are never eligible as
    # negatives, so at most n - 2 negatives exist for any anchor.
    k = min(top_k, n - 2)

    for i in range(n - 1):
        data.append((sentences[i], sentences[i + 1], 1))
        if k <= 0:
            continue

        # astype() copies the row: indexing alone returns a view, and writing
        # into it would permanently alter the caller's similarity matrix.
        scores = sim_matrix[i].astype(float)
        # Exclude the anchor itself (its self-similarity is the row maximum,
        # which would yield an "A is not followed by A" pair) and the true
        # next sentence.
        scores[i] = float('-inf')
        scores[i + 1] = float('-inf')

        # Highest similarity first; the two excluded slots sort last and are
        # never reached because k <= n - 2.
        neg_idxs = scores.argsort()[::-1][:k]
        for j in neg_idxs:
            data.append((sentences[i], sentences[int(j)], 0))
    return data

# Build the labelled (sentence A, sentence B, label) pairs from the cleaned
# corpus, requesting three similarity-ranked negatives per anchor sentence,
# and report how many examples were produced.
nsp_data = create_hard_nsp_dataset(corpus, sim_matrix, top_k=3)
print(f"✅ NSP Dataset: {len(nsp_data)} examples")

# ------------------ 5. Encode NSP Data ------------------
def encode(sentence):
    """Convert a whitespace-separated sentence into a 1-D tensor of word ids.

    Tokens missing from the module-level ``word2idx`` mapping are dropped, so
    the result may be shorter than the token count (or empty).
    """
    ids = []
    for token in sentence.split():
        index = word2idx.get(token)
        if index is not None:
            ids.append(index)
    return torch.tensor(ids, dtype=torch.long)

# Encode both sentences of every pair as id tensors and wrap the integer
# label in a long tensor.
encoded_data = [(encode(a), encode(b), torch.tensor(label, dtype=torch.long)) for a,b,label in nsp_data]

class NSPDataset(Dataset):
    """Map-style dataset of pre-encoded (sentence A, sentence B, label) triples.

    On access both id sequences are brought to exactly ``max_len`` entries:
    longer ones are cut off, shorter ones are right-filled with id 0.
    """

    def __init__(self, data, max_len=20):
        self.max_len = max_len
        self.data = data

    def __len__(self):
        return len(self.data)

    def pad(self, x):
        """Return *x* truncated or zero-padded to ``self.max_len`` as a long tensor."""
        clipped = x[:self.max_len]
        shortfall = self.max_len - len(clipped)
        if shortfall <= 0:
            return clipped.long()
        filler = torch.zeros(shortfall, dtype=torch.long)
        return torch.cat([clipped, filler]).long()

    def __getitem__(self, idx):
        first, second, label = self.data[idx]
        return self.pad(first), self.pad(second), label

# Wrap the encoded pairs and carve out a random 80/20 train/validation split.
dataset = NSPDataset(encoded_data)
n_examples = len(dataset)
train_size = int(n_examples * 0.8)
val_size = n_examples - train_size
train_ds, val_ds = random_split(dataset, lengths=[train_size, val_size])

# Only the training batches are reshuffled each epoch.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=64)

# ------------------ 6. Define Model ------------------
class NSPTransformer(nn.Module):
    """Transformer-encoder classifier over a pair of token-id sequences.

    The two inputs are concatenated along dimension 1, embedded, summed with
    a learned positional table, passed through the encoder stack, averaged
    over dimension 1 and projected to two logits.
    """

    def __init__(self, vocab_size, dim=256, nhead=8, num_layers=2, max_len=20):
        super().__init__()
        # Submodules are created in this fixed order so that seeded weight
        # initialisation stays reproducible.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim)
        # Learned positions for the concatenated pair, hence 2 * max_len slots.
        self.pos_enc = nn.Parameter(torch.randn(1, max_len * 2, dim))
        layer = nn.TransformerEncoderLayer(
            d_model=dim,
            nhead=nhead,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.dropout = nn.Dropout(p=0.2)
        self.cls = nn.Linear(in_features=dim, out_features=2)

    def forward(self, x1, x2):
        """Return the two-class logits for the pair ``(x1, x2)``."""
        tokens = torch.cat((x1, x2), dim=1)
        seq_len = tokens.size(1)
        hidden = self.embedding(tokens) + self.pos_enc[:, :seq_len, :]
        encoded = self.transformer(hidden)
        pooled = encoded.mean(dim=1)
        return self.cls(self.dropout(pooled))

import copy  # stdlib; used to snapshot the best-performing weights

model = NSPTransformer(len(vocab)).to('cpu')
optimizer = optim.AdamW(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# ------------------ 7. Training Loop ------------------
# Train for up to 50 epochs and stop early once validation accuracy has not
# improved for `patience` consecutive epochs.
best_val_acc = 0
# Snapshot of the weights that produced `best_val_acc`. Without it the model
# would finish with last-epoch weights even though an earlier epoch scored
# better, and the reported best accuracy would not describe the model that
# is used for inference afterwards.
best_state = copy.deepcopy(model.state_dict())
patience = 10
patience_counter = 0

for epoch in range(1, 50+1):
    # ---- training pass ----
    model.train()
    correct = total = 0
    for a, b, y in train_loader:
        out = model(a, b)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        preds = out.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    # max(..., 1) avoids a ZeroDivisionError when a loader yields no batches.
    train_acc = correct / max(total, 1)

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for a, b, y in val_loader:
            out = model(a, b)
            preds = out.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    val_acc = correct / max(total, 1)

    print(f"Epoch {epoch:02d} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # deepcopy: state_dict() returns references to the live parameters,
        # which later optimizer steps would otherwise overwrite.
        best_state = copy.deepcopy(model.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("⏹️ Early stopping triggered.")
            break

# Roll back to the best checkpoint so later inference uses the weights that
# achieved the accuracy reported below.
model.load_state_dict(best_state)
model.eval()

print(f"✅ Best Val Accuracy: {best_val_acc:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving longds2.txt to longds2 (1).txt
✅ NSP Dataset: 7772 examples
Epoch 01 | Train Acc: 0.7529 | Val Acc: 0.7949
Epoch 02 | Train Acc: 0.8437 | Val Acc: 0.8836
Epoch 03 | Train Acc: 0.8969 | Val Acc: 0.9042
Epoch 04 | Train Acc: 0.9279 | Val Acc: 0.9235
Epoch 05 | Train Acc: 0.9387 | Val Acc: 0.9222
Epoch 06 | Train Acc: 0.9378 | Val Acc: 0.9158
Epoch 07 | Train Acc: 0.9432 | Val Acc: 0.8900
Epoch 08 | Train Acc: 0.9460 | Val Acc: 0.9061
Epoch 09 | Train Acc: 0.9492 | Val Acc: 0.9196
Epoch 10 | Train Acc: 0.9403 | Val Acc: 0.9125
Epoch 11 | Train Acc: 0.9554 | Val Acc: 0.9080
Epoch 12 | Train Acc: 0.9571 | Val Acc: 0.9080
Epoch 13 | Train Acc: 0.9654 | Val Acc: 0.9080
Epoch 14 | Train Acc: 0.9683 | Val Acc: 0.9138
⏹️ Early stopping triggered.
✅ Best Val Accuracy: 0.9235


In [None]:
def predict_next_sentence(model, sentence_a, sentence_b):
    """Classify whether *sentence_b* follows *sentence_a*.

    Both sentences are encoded with the module-level ``encode`` helper and
    padded/truncated exactly as ``NSPDataset`` does for training examples.

    Args:
        model: callable taking two batched id tensors and returning logits
            with one row per example and two columns.
        sentence_a: first sentence (whitespace-separated tokens).
        sentence_b: candidate following sentence.

    Returns:
        ``"Next"`` when the arg-max class is 1, otherwise ``"Not Next"``.
    """
    model.eval()
    # `pad` is an instance method that reads `self.max_len`; calling it on the
    # class (`NSPDataset.pad(tensor)`) binds the tensor to `self` and raises a
    # TypeError. An empty dataset instance supplies the default `max_len`.
    padder = NSPDataset([])
    a = padder.pad(encode(sentence_a))
    b = padder.pad(encode(sentence_b))
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        out = model(a.unsqueeze(0), b.unsqueeze(0))
    pred = out.argmax(dim=1).item()
    return "Next" if pred == 1 else "Not Next"


In [None]:
# 🧪 Predict Next Sentence — Interactive
def predict_next_sentence(model, sentence_a, sentence_b, word2idx, max_len=20):
    """Report whether the model predicts that *sentence_b* follows *sentence_a*.

    Each sentence is split on whitespace, mapped to ids through *word2idx*
    (unknown tokens are skipped), cut or zero-padded to *max_len* ids and fed
    to *model* as a batch of one.

    Returns:
        ``"✅ Next Sentence"`` when the arg-max class is 1, otherwise
        ``"🚫 Not Next Sentence"``.
    """
    model.eval()

    def to_batch(text):
        # Known-token ids, limited to max_len, then right-filled with zeros.
        ids = [word2idx[token] for token in text.split() if token in word2idx]
        ids = ids[:max_len]
        ids.extend([0] * (max_len - len(ids)))
        return torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    first = to_batch(sentence_a)
    second = to_batch(sentence_b)

    with torch.no_grad():
        logits = model(first, second)

    if logits.argmax(dim=1).item() == 1:
        return "✅ Next Sentence"
    return "🚫 Not Next Sentence"

# 🔷 User Input
# Interactive loop: repeatedly read two sentences from the console and print
# the classifier's verdict. Typing "exit" (any letter case) for Sentence A
# ends the loop.
# NOTE(review): the sentences are passed on exactly as typed (only stripped of
# surrounding whitespace); no lower-casing or cleaning happens in this cell.
while True:
    print("\n🔷 Enter two sentences to check if the second follows the first.")
    sentence_a = input("Enter Sentence A (or type 'exit' to quit): ").strip()
    if sentence_a.lower() == "exit":
        break
    sentence_b = input("Enter Sentence B: ").strip()

    result = predict_next_sentence(model, sentence_a, sentence_b, word2idx)
    print(f"\n🔍 Prediction: {result}")




🔷 Enter two sentences to check if the second follows the first.
Enter Sentence A (or type 'exit' to quit): it absorbs alot of water
Enter Sentence B: might be because of temperature

🔍 Prediction: 🚫 Not Next Sentence

🔷 Enter two sentences to check if the second follows the first.
Enter Sentence A (or type 'exit' to quit): exit


In [None]:
def predict_best_next_sentence(model, sentence_a, corpus, word2idx, max_len=20):
    """Pick the corpus sentence the model rates most likely to follow *sentence_a*.

    Every candidate in *corpus* is scored on its own: the pair is encoded
    (unknown tokens skipped, cut or zero-padded to *max_len* ids), run
    through *model*, and the softmax probability of class 1 is taken as the
    score. The first candidate with the strictly highest score wins.

    Returns:
        ``(best_sentence, best_probability)``; ``(None, -inf)`` when *corpus*
        is empty.
    """
    model.eval()

    def to_batch(text):
        # Known-token ids, limited to max_len, then right-filled with zeros.
        ids = [word2idx[token] for token in text.split() if token in word2idx]
        ids = ids[:max_len]
        ids.extend([0] * (max_len - len(ids)))
        return torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    anchor = to_batch(sentence_a)
    winner = None
    top_prob = float('-inf')

    with torch.no_grad():
        for candidate in corpus:
            logits = model(anchor, to_batch(candidate))
            # Column 1 holds the probability of the "next" class.
            p_next = torch.softmax(logits, dim=1)[0][1].item()
            if p_next > top_prob:
                winner, top_prob = candidate, p_next

    return winner, top_prob


In [None]:
# Read one sentence from the console, score every corpus sentence as its
# possible continuation, and print the highest-scoring one with its
# probability.
# NOTE(review): the input is only stripped of surrounding whitespace here; it
# is not lower-cased or cleaned the way the corpus sentences are.
sentence_a = input("Enter your sentence: ").strip()
best_b, score = predict_best_next_sentence(model, sentence_a, corpus, word2idx)

print(f"\n🔍 Predicted Next Sentence:\n✅ {best_b}\n(Confidence: {score:.4f})")


Enter your sentence: we had everything before us,\

🔍 Predicted Next Sentence:
✅ \holdthentrue\mutteredherhusband\gentlemenmywife\\
(Confidence: 1.0000)


In [None]:
def clean_input(sentence):
    """Normalise one user-supplied sentence exactly like the corpus sentences.

    This used to carry its own copy of the cleaning steps (lower-casing,
    character filtering, tokenising, stop-word removal). It now delegates to
    ``clean_and_tokenize`` so that query-time normalisation can never drift
    away from the normalisation applied to the corpus the vocabulary was
    built from.

    Args:
        sentence: raw sentence string.

    Returns:
        The cleaned, space-joined tokens; an empty string when nothing
        survives cleaning.
    """
    return clean_and_tokenize([sentence])[0]

# Interactive loop: read a sentence, clean it, and print the three highest
# ranked continuations. Typing "exit" (any letter case) ends the loop.
# NOTE(review): `predict_top3_next_sentences` and `clean_corpus` are not
# defined anywhere in the visible part of this notebook; presumably they come
# from a cell that is not shown here. Confirm they exist before running this
# cell in a fresh session.
while True:
    print("\n🔷 Enter a sentence to predict its most likely next sentences.")
    sentence_a = input("Enter Sentence A (or type 'exit' to quit): ").strip()
    if sentence_a.lower() == "exit":
        break

    cleaned_sentence_a = clean_input(sentence_a)

    # Nothing left to encode once punctuation and stop words are removed.
    if not cleaned_sentence_a:
        print("⚠️ Your sentence is empty after cleaning or contains only stopwords.")
        continue

    top3 = predict_top3_next_sentences(model, cleaned_sentence_a, clean_corpus, word2idx)

    # An empty (falsy) result is reported as "no suitable sentence".
    if top3:
        print("\n🔍 Top 3 Predicted Next Sentences:")
        for i, (sent, score) in enumerate(top3, 1):
            print(f"✅ {i}. {sent} (Confidence: {score:.4f})")
    else:
        print("🤷 No suitable next sentence found.")



🔷 Enter a sentence to predict its most likely next sentences.
🤷 No suitable next sentence found.

🔷 Enter a sentence to predict its most likely next sentences.
🤷 No suitable next sentence found.

🔷 Enter a sentence to predict its most likely next sentences.
🤷 No suitable next sentence found.

🔷 Enter a sentence to predict its most likely next sentences.
