In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
from bs4 import BeautifulSoup
import re
from nltk.stem import PorterStemmer

In [33]:
# --- Full Text Preprocessing on Dummy DataFrame (No Emoji, No Crash) ---
import pandas as pd, re, string, nltk, requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# NLTK setup
nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')

# Dummy dataset
df = pd.DataFrame({'text':[
    "I'm sooo happppy!!! <h1>Visit</h1> our website at https://nirmauni.ac.in <br> I can't wait 2 see u again!!! LOL that was gr8 :) <3",
    "OMG!!! This movie was soooo bad :( but soundtrack was gr8 <3 visit www.example.com"
]})

# Setup
stop_words=set(stopwords.words('english'))
ps, sb = PorterStemmer(), SnowballStemmer("english")
lemm = WordNetLemmatizer()
chat_dict={"u":"you","gr8":"great","lol":"laughing out loud","omg":"oh my god","idk":"i don't know"}

def get_pos(w):
    tag=nltk.pos_tag([w])[0][1]
    return wordnet.ADJ if tag.startswith('J') else wordnet.VERB if tag.startswith('V') else wordnet.NOUN if tag.startswith('N') else wordnet.ADV

def clean_text(t):
    t=t.lower()
    t=BeautifulSoup(t,'html.parser').get_text()
    t=re.sub(r'http\S+|www\S+','',t)
    emoticons={':)':'smile',':(':'sad',':d':'laugh','<3':'love',';)':'wink'}
    for e,m in emoticons.items(): t=t.replace(e,m)
    t=t.translate(str.maketrans('','',string.punctuation))
    tokens=[w for w in word_tokenize(t) if w not in stop_words and len(w)>2]
    if not tokens: return ""
    freq=nltk.FreqDist(tokens)
    common=[w for w,_ in freq.most_common(2)]
    rare=[w for w,c in freq.items() if c==1]
    tokens=[w for w in tokens if w not in common+rare]
    tokens=[ps.stem(w) for w in tokens]
    tokens=[sb.stem(w) for w in tokens]
    tokens=[lemm.lemmatize(w,get_pos(w)) for w in tokens]
    tokens=[chat_dict.get(w,w) for w in tokens]
    return ' '.join(tokens)

df['clean_text']=df['text'].apply(clean_text)
print(df[['text','clean_text']])

# Combine cleaned text
combined_text=' '.join(df['clean_text']).strip()
if combined_text:
    plt.figure(figsize=(6,4))
    wc=WordCloud(width=600,height=400,background_color='white').generate(combined_text)
    plt.imshow(wc,interpolation='bilinear'); plt.axis('off'); plt.title('Word Cloud')

    heart=np.array(Image.open(requests.get('https://i.imgur.com/1n6KjzG.png',stream=True).raw))
    wc2=WordCloud(mask=heart,background_color='white').generate(combined_text)
    plt.figure(figsize=(6,4)); plt.imshow(wc2,interpolation='bilinear'); plt.axis('off'); plt.title('Heart-Shaped Word Cloud')
    plt.show()
else:
    print("\n⚠️ No words remaining after cleaning to generate Word Cloud.")


                                                text clean_text
0  I'm sooo happppy!!! <h1>Visit</h1> our website...           
1  OMG!!! This movie was soooo bad :( but soundtr...           

⚠️ No words remaining after cleaning to generate Word Cloud.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
import torch
import torch.nn as nn

from torch.optim import Adam
from torch.distributions.uniform import Uniform
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# CBOW and Skip-Gram from scratch (plain PyTorch)
# Converted from a PyTorch-Lightning notebook to plain PyTorch.
# Usage: place a text file named "data.txt" in the working directory (one big text or many lines).
# This script implements:
# - text loading + tokenization
# - vocabulary building
# - integer encoding
# - dataset & dataloader with collate functions
# - CBOW and Skip-Gram models implemented from scratch
# - training loops (plain PyTorch)
# - helper: get_embedding(word, model)

import re
import math
import random
from collections import Counter, defaultdict
from typing import List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm


# -----------------------------
# 1) Text processing utilities
# -----------------------------

def load_text(path: str) -> str:
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


def simple_tokenize(text: str) -> List[str]:
    # Lowercase, remove non-word characters except spaces, split on whitespace
    text = text.lower()
    # keep apostrophes inside words (e.g. don't -> don't) but remove other punctuation
    text = re.sub(r"[^a-z0-9\s']+", ' ', text)
    tokens = text.split()
    return tokens


class Vocab:
    def __init__(self, tokens: List[str], min_freq: int = 1, max_size: int = None, unk_token: str = '<unk>'):
        counter = Counter(tokens)
        # sort by frequency then lexicographically
        items = [item for item, cnt in counter.most_common() if cnt >= min_freq]
        if max_size is not None:
            items = items[:max_size]
        self.itos = ['<pad>', unk_token] + items  # reserve 0 for pad, 1 for unk
        self.stoi = {w: i for i, w in enumerate(self.itos)}
        self.pad_index = 0
        self.unk_index = 1

    def __len__(self):
        return len(self.itos)

    def encode(self, token: str) -> int:
        return self.stoi.get(token, self.unk_index)

    def decode(self, idx: int) -> str:
        return self.itos[idx]

# -----------------------------
# 2) Dataset generation for CBOW and Skip-Gram
# -----------------------------

def generate_cbow_pairs(tokens: List[str], vocab: Vocab, window_size: int) -> List[Tuple[List[int], int]]:
    """Return list of (context_indices_list, target_index)"""
    encoded = [vocab.encode(t) for t in tokens]
    pairs = []
    n = len(encoded)
    for i in range(n):
        context = []
        for j in range(i - window_size, i + window_size + 1):
            if j == i or j < 0 or j >= n:
                continue
            context.append(encoded[j])
        if len(context) == 0:
            continue
        pairs.append((context, encoded[i]))
    return pairs


def generate_skipgram_pairs(tokens: List[str], vocab: Vocab, window_size: int) -> List[Tuple[int, int]]:
    """Return list of (target_index, context_word_index) pairs"""
    encoded = [vocab.encode(t) for t in tokens]
    pairs = []
    n = len(encoded)
    for i in range(n):
        target = encoded[i]
        for j in range(i - window_size, i + window_size + 1):
            if j == i or j < 0 or j >= n:
                continue
            context = encoded[j]
            pairs.append((target, context))
    return pairs


class CBOWDataset(Dataset):
    def __init__(self, pairs: List[Tuple[List[int], int]]):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        context, target = self.pairs[idx]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)


class SkipGramDataset(Dataset):
    def __init__(self, pairs: List[Tuple[int, int]]):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        target, context = self.pairs[idx]
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long)


def cbow_collate(batch):
    # batch: list of (context_tensor, target_tensor) where contexts have variable length
    contexts, targets = zip(*batch)
    # we can pad contexts to same length
    lengths = [c.size(0) for c in contexts]
    max_len = max(lengths)
    padded = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, c in enumerate(contexts):
        padded[i, :c.size(0)] = c
    targets = torch.stack(targets)
    return padded, torch.tensor(lengths, dtype=torch.long), targets


def skipgram_collate(batch):
    # batch: list of (target_tensor, context_tensor) both scalars
    targets, contexts = zip(*batch)
    return torch.stack(targets), torch.stack(contexts)

# -----------------------------
# 3) Models
# -----------------------------

class CBOWModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Simple linear classifier from averaged context embedding to vocab logits
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, contexts_padded, lengths):
        # contexts_padded: (batch, max_ctx_len)
        emb = self.embeddings(contexts_padded)  # (batch, max_len, emb)
        # compute average over actual lengths
        summed = emb.sum(dim=1)  # (batch, emb)
        lengths = lengths.clamp(min=1).unsqueeze(1).to(summed.dtype)
        averaged = summed / lengths
        logits = self.linear(averaged)
        return logits


class SkipGramModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        # We implement SkipGram with negative sampling via NCE loss or use full softmax for simplicity.
        # For simplicity (and clarity), this implementation uses full softmax (cross entropy) to predict
        # context word given target word.
        self.emb_target = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, targets):
        # targets: (batch,) indices
        emb = self.emb_target(targets)  # (batch, emb)
        logits = self.linear(emb)  # (batch, vocab)
        return logits

# -----------------------------
# 4) Training loops
# -----------------------------

def train_cbow(model, dataloader, optimizer, device, epochs=3, print_every=100):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    model.train()
    loop = range(epochs)
    for epoch in loop:
        total_loss = 0.0
        for i, (contexts_padded, lengths, targets) in enumerate(dataloader):
            contexts_padded = contexts_padded.to(device)
            lengths = lengths.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            logits = model(contexts_padded, lengths)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                avg = total_loss / print_every
                print(f"Epoch {epoch+1} step {i+1}/{len(dataloader)} loss={avg:.4f}")
                total_loss = 0.0
    return model


def train_skipgram(model, dataloader, optimizer, device, epochs=3, print_every=100):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for i, (targets, contexts) in enumerate(dataloader):
            targets = targets.to(device)
            contexts = contexts.to(device)
            optimizer.zero_grad()
            logits = model(targets)
            loss = criterion(logits, contexts)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                avg = total_loss / print_every
                print(f"Epoch {epoch+1} step {i+1}/{len(dataloader)} loss={avg:.4f}")
                total_loss = 0.0
    return model

# -----------------------------
# 5) Utilities: embedding lookup and visualization
# -----------------------------

def get_word_embedding(word: str, vocab: Vocab, model: nn.Module, which='target') -> torch.Tensor:
    idx = vocab.encode(word)
    if isinstance(model, CBOWModel):
        emb = model.embeddings.weight.data[idx].cpu()
    elif isinstance(model, SkipGramModel):
        # For SkipGram we saved target embeddings in emb_target
        emb = model.emb_target.weight.data[idx].cpu()
    else:
        raise ValueError('Unknown model type')
    return emb


# -----------------------------
# 6) Example end-to-end runner
# -----------------------------

def run_training_pipeline(
    data_path='data.txt',
    model_type='cbow',
    window_size=2,
    embedding_dim=50,
    batch_size=256,
    epochs=3,
    min_freq=1,
    lr=0.01,
    device=None,
    max_pairs=None,
):
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)

    raw = load_text(data_path)
    tokens = simple_tokenize(raw)
    print(f'Token count: {len(tokens)} unique: {len(set(tokens))}')

    vocab = Vocab(tokens, min_freq=min_freq)
    print('Vocab size (including pad, unk):', len(vocab))

    if model_type == 'cbow':
        pairs = generate_cbow_pairs(tokens, vocab, window_size)
        dataset = CBOWDataset(pairs)
        collate = cbow_collate
    else:
        pairs = generate_skipgram_pairs(tokens, vocab, window_size)
        dataset = SkipGramDataset(pairs)
        collate = skipgram_collate

    if max_pairs is not None:
        # subsample for quick experiments
        print(f'Subsampling to {max_pairs} training pairs (random)')
        indices = random.sample(range(len(dataset)), min(max_pairs, len(dataset)))
        dataset = torch.utils.data.Subset(dataset, indices)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)

    if model_type == 'cbow':
        model = CBOWModel(len(vocab), embedding_dim)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        model = train_cbow(model, dataloader, optimizer, device, epochs=epochs, print_every=100)
    else:
        model = SkipGramModel(len(vocab), embedding_dim)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        model = train_skipgram(model, dataloader, optimizer, device, epochs=epochs, print_every=100)

    print('Training complete. Example embeddings for words:')
    for w in ['the', 'and', 'to', 'is', 'data']:
        emb = get_word_embedding(w, vocab, model)
        print(w, emb.shape)

    # visualize

    # Save embeddings and vocab
    torch.save({'model_state': model.state_dict(), 'vocab': vocab.itos}, f'{model_type}_embeddings.pt')
    print('Saved:', f'{model_type}_embeddings.pt')

    return model, vocab




In [14]:
# CBOW and Skip-Gram from scratch (plain PyTorch)
# Simplified version: call functions directly, no CLI.

import torch

from torch.utils.data import DataLoader

# Example call (no CLI)
if __name__ == '__main__':
    # Example: train CBOW on data.txt
    model, vocab = run_training_pipeline(
        data_path='/content/cleaned_news.txt',  # your text file
        model_type='cbow',     # or 'skipgram'
        window_size=2,
        embedding_dim=50,
        batch_size=256,
        epochs=3,
        min_freq=1,
        lr=0.01,
        max_pairs=50000,       # optional: limit training pairs
    )

    # Example: get embedding for a word

    emb = get_word_embedding('data', vocab, model)
    print('Embedding for "data":', emb[:10])

Device: cuda
Token count: 4913215 unique: 89034
Vocab size (including pad, unk): 89036
Subsampling to 50000 training pairs (random)
Epoch 1 step 100/196 loss=9.6336
Epoch 2 step 100/196 loss=6.6468
Epoch 3 step 100/196 loss=5.4465
Training complete. Example embeddings for words:
the torch.Size([50])
and torch.Size([50])
to torch.Size([50])
is torch.Size([50])
data torch.Size([50])
Saved: cbow_embeddings.pt
Embedding for "data": tensor([ 0.3030,  0.6431, -0.7224, -0.2541, -1.0879,  3.1228,  2.1484, -0.8621,
        -0.5176,  0.0188])


In [16]:
# IMDb Review Sentiment Classification (Plain PyTorch, from scratch)
# ---------------------------------------------------------------
# Automatically downloads IMDb dataset using kagglehub.

import re
import random
from collections import Counter
from typing import List
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import kagglehub

# ----------------------------
# 1. Download Dataset
# ----------------------------

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)

# Load CSV
csv_path = f"{path}/IMDB Dataset.csv"
df = pd.read_csv(csv_path)
reviews = df['review'].values
labels = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# ----------------------------
# 2. Tokenization & Vocabulary
# ----------------------------

def simple_tokenize(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s']+", ' ', text)
    return text.split()


class Vocab:
    def __init__(self, tokens, min_freq=2, max_size=None, unk_token='<unk>', pad_token='<pad>'):
        counter = Counter(tokens)
        words = [w for w, c in counter.items() if c >= min_freq]
        if max_size:
            words = sorted(words, key=lambda w: (-counter[w], w))[:max_size]
        self.itos = [pad_token, unk_token] + words
        self.stoi = {w: i for i, w in enumerate(self.itos)}
        self.pad_index = 0
        self.unk_index = 1

    def encode(self, token: str):
        return self.stoi.get(token, self.unk_index)

    def encode_sentence(self, tokens: List[str]):
        return [self.encode(t) for t in tokens]

# ----------------------------
# 3. Prepare Data
# ----------------------------

tokenized = [simple_tokenize(r) for r in tqdm(reviews, desc='Tokenizing')]
all_tokens = [t for sent in tokenized for t in sent]
vocab = Vocab(all_tokens, min_freq=2, max_size=20000)

encoded = [vocab.encode_sentence(t) for t in tokenized]

# Pad sequences to fixed length
max_len = 200
def pad_sequence(seq, max_len, pad_value=0):
    seq = seq[:max_len]
    return seq + [pad_value]*(max_len - len(seq))

encoded = [pad_sequence(seq, max_len, vocab.pad_index) for seq in encoded]

data = list(zip(encoded, labels))
random.shuffle(data)

split = int(0.8 * len(data))
train_data, test_data = data[:split], data[split:]

class IMDBDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.float)

train_loader = DataLoader(IMDBDataset(train_data), batch_size=128, shuffle=True)
test_loader = DataLoader(IMDBDataset(test_data), batch_size=128)

# ----------------------------
# 4. Model Definition (LSTM)
# ----------------------------

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_layers=1, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        emb = self.dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(emb)
        out = self.fc(hidden[-1])
        return torch.sigmoid(out).squeeze(1)

# ----------------------------
# 5. Training Loop
# ----------------------------

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentimentLSTM(len(vocab.itos)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(loader):
    model.train()
    total_loss = 0
    for x, y in tqdm(loader, desc='Training', leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(x)
    return total_loss / len(loader.dataset)

def evaluate(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            preds = (model(x) > 0.5).float()
            correct += (preds == y).sum().item()
            total += len(y)
    return correct / total

for epoch in range(3):
    loss = train_epoch(train_loader)
    acc = evaluate(test_loader)
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Test Acc={acc:.4f}")

# ----------------------------
# 6. Inference Helper
# ----------------------------

def predict(text):
    tokens = simple_tokenize(text)
    encoded = vocab.encode_sentence(tokens)
    padded = pad_sequence(encoded, max_len, vocab.pad_index)
    x = torch.tensor(padded).unsqueeze(0).to(device)
    with torch.no_grad():
        pred = model(x).item()
    return 'Positive' if pred > 0.5 else 'Negative'

print("Example:", predict("The movie was absolutely fantastic!"))

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


Tokenizing:   0%|          | 0/50000 [00:00<?, ?it/s]



Training:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 1: Loss=0.6923, Test Acc=0.5270


Training:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 2: Loss=0.6843, Test Acc=0.5270


Training:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 3: Loss=0.6769, Test Acc=0.5307
Example: Positive


In [19]:
# seq2seq_en_fr.py
# Simple Seq2Seq Machine Translation (plain PyTorch)
# - tries kagglehub.dataset_download(...) -> expects CSV/TSV with columns `src` and `tgt`
# - fallback to small toy parallel corpus if download fails
# - encoder LSTM -> decoder LSTM (no attention)
# - teacher forcing training
# ------------------------------------------------------------------------------

import os
import random
import re
from collections import Counter
from typing import List, Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# ----------  CONFIG  ----------
KAGGLE_DATASET_ID = "https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset"  # e.g. "someone/eng-fra-parallel" (change if you have one)
CSV_FILENAME = None  # if dataset archives to a folder with a specific csv, set name here, else autodetect
MAX_VOCAB = 20000
MIN_FREQ = 1
SRC_LANGUAGE = "en"
TGT_LANGUAGE = "fr"
MAX_LEN = 30
BATCH_SIZE = 64
EMBED_DIM = 256
HIDDEN_DIM = 256
NUM_LAYERS = 1
TEACHER_FORCING_RATIO = 0.5
EPOCHS = 8
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# -------------------------------

# ---------- Utilities ----------
def simple_tokenize(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(r"[^a-z0-9À-ÿ'\\s.,!?;:-]+", " ", text)  # keep basic punctuation and accented chars
    return text.strip().split()

class Vocab:
    def __init__(self, tokens: List[str], min_freq: int = 1, max_size: int = None,
                 pad_token="<pad>", sos_token="<sos>", eos_token="<eos>", unk_token="<unk>"):
        counter = Counter(tokens)
        # sort by freq then lexicographic
        words = [w for w, cnt in counter.most_common() if cnt >= min_freq]
        if max_size:
            words = words[:max_size]
        # special tokens: pad=0, sos=1, eos=2, unk=3
        self.pad_token = pad_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token
        self.itos = [pad_token, sos_token, eos_token, unk_token] + words
        self.stoi = {w: i for i, w in enumerate(self.itos)}
        self.pad_idx = self.stoi[pad_token]
        self.sos_idx = self.stoi[sos_token]
        self.eos_idx = self.stoi[eos_token]
        self.unk_idx = self.stoi[unk_token]

    def encode(self, token: str) -> int:
        return self.stoi.get(token, self.unk_idx)

    def decode(self, idx: int) -> str:
        return self.itos[idx]

    def encode_seq(self, tokens: List[str], add_sos_eos: bool = False) -> List[int]:
        ids = [self.encode(t) for t in tokens]
        if add_sos_eos:
            return [self.sos_idx] + ids + [self.eos_idx]
        return ids

    def __len__(self):
        return len(self.itos)

# --------------------------------

# ---------- Data loading (kagglehub fallback toy) ----------
def try_download_kaggle_dataset(kaggle_id: str):
    try:
        import kagglehub
    except Exception:
        print("kagglehub not installed or unavailable. Skipping kaggle download.")
        return None

    if not kaggle_id or kaggle_id.strip() == "" or "your-kaggle" in kaggle_id:
        print("No valid KAGGLE_DATASET_ID provided; skipping kaggle download.")
        return None

    try:
        print("Attempting dataset download via kagglehub:", kaggle_id)
        path = kagglehub.dataset_download(kaggle_id)
        print("Downloaded to:", path)
        return path
    except Exception as e:
        print("kagglehub download failed:", e)
        return None

def load_parallel_from_folder(folder_path: str, csv_filename: str = None) -> List[Tuple[str, str]]:
    # look for CSV/TSV in folder; expect columns 'src','tgt' or 'en','fr' or 'source','target'
    import pandas as pd
    candidates = []
    if csv_filename:
        candidates = [os.path.join(folder_path, csv_filename)]
    else:
        for f in os.listdir(folder_path):
            if f.lower().endswith((".csv", ".tsv", ".txt")):
                candidates.append(os.path.join(folder_path, f))
    for cand in candidates:
        try:
            if cand.lower().endswith(".tsv"):
                df = pd.read_csv(cand, sep="\t", encoding="utf-8")
            else:
                df = pd.read_csv(cand, encoding="utf-8")
            # try to find columns
            cols = [c.lower() for c in df.columns]
            colmap = {}
            for c in df.columns:
                lc = c.lower()
                if lc in ("src", "source", "en", "english"):
                    colmap["src"] = c
                if lc in ("tgt", "target", "fr", "french"):
                    colmap["tgt"] = c
            if "src" in colmap and "tgt" in colmap:
                pairs = list(zip(df[colmap["src"]].astype(str).values, df[colmap["tgt"]].astype(str).values))
                print(f"Loaded {len(pairs)} pairs from {cand}")
                return pairs
            # fallback: try first two columns
            if df.shape[1] >= 2:
                pairs = list(zip(df.iloc[:,0].astype(str).values, df.iloc[:,1].astype(str).values))
                print(f"Loaded {len(pairs)} pairs from {cand} using first two columns")
                return pairs
        except Exception as e:
            print("Failed reading", cand, e)
    return None

def get_parallel_corpus():
    # try kagglehub first
    folder = try_download_kaggle_dataset(KAGGLE_DATASET_ID)
    if folder:
        pairs = load_parallel_from_folder(folder, CSV_FILENAME)
        if pairs:
            return pairs

    # fallback toy corpus (small but runnable)
    print("Using small built-in toy parallel corpus (English -> French).")
    toy_pairs = [
        ("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("i am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is john", "je m'appelle john"),
        ("thank you", "merci"),
        ("good morning", "bonjour"),
        ("good night", "bonne nuit"),
        ("see you later", "à plus tard"),
        ("i love you", "je t'aime"),
        ("where is the bathroom", "où sont les toilettes"),
        ("i need help", "j'ai besoin d'aide"),
        ("this is a book", "c'est un livre"),
        ("do you speak english", "parlez-vous anglais"),
        ("i don't understand", "je ne comprends pas"),
    ]
    return toy_pairs

# --------------------------------

# ---------- Dataset & collate ----------
class ParallelDataset(Dataset):
    def __init__(self, pairs: List[Tuple[str,str]], src_vocab: Vocab, tgt_vocab: Vocab, max_len:int=MAX_LEN):
        self.samples = []
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len
        for s, t in pairs:
            s_tokens = simple_tokenize(s)
            t_tokens = simple_tokenize(t)
            s_ids = src_vocab.encode_seq(s_tokens, add_sos_eos=False)
            t_ids = tgt_vocab.encode_seq(t_tokens, add_sos_eos=True)  # include <sos>/<eos> for targets
            if len(s_ids) == 0 or len(t_ids) == 0:
                continue
            if len(s_ids) > max_len:
                s_ids = s_ids[:max_len]
            if len(t_ids) > max_len:
                t_ids = t_ids[:max_len]
            self.samples.append((s_ids, t_ids))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

def collate_fn(batch):
    # batch: list of (src_ids, tgt_ids(with sos/eos))
    srcs, tgts = zip(*batch)
    src_lens = [len(s) for s in srcs]
    tgt_lens = [len(t) for t in tgts]
    max_src = max(src_lens)
    max_tgt = max(tgt_lens)
    src_padded = torch.full((len(batch), max_src), fill_value=src_vocab.pad_idx, dtype=torch.long)
    tgt_padded = torch.full((len(batch), max_tgt), fill_value=tgt_vocab.pad_idx, dtype=torch.long)
    for i, s in enumerate(srcs):
        src_padded[i, :len(s)] = torch.tensor(s, dtype=torch.long)
    for i, t in enumerate(tgts):
        tgt_padded[i, :len(t)] = torch.tensor(t, dtype=torch.long)
    return src_padded, torch.tensor(src_lens, dtype=torch.long), tgt_padded, torch.tensor(tgt_lens, dtype=torch.long)

# ---------- Model (Encoder / Decoder without attention) ----------
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=src_vocab.pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)

    def forward(self, src, src_lengths):
        # src: (batch, seqlen)
        emb = self.embedding(src)  # (batch, seqlen, emb)
        packed = nn.utils.rnn.pack_padded_sequence(emb, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, (h, c) = self.lstm(packed)
        # outputs (packed) not used here (no attention); return hidden state to init decoder
        return h, c

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tgt_vocab.pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_tokens, hidden, cell):
        # input_tokens: (batch, 1) - one step
        emb = self.embedding(input_tokens)  # (batch,1,emb)
        output, (h, c) = self.lstm(emb, (hidden, cell))  # output: (batch,1,hidden)
        logits = self.out(output.squeeze(1))  # (batch, vocab)
        return logits, h, c

# ---------- Seq2Seq wrapper ----------
class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device="cpu"):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_lengths, tgt=None, teacher_forcing_ratio=0.5, max_len=MAX_LEN):
        # If tgt is provided, compute logits for teacher forcing training.
        batch_size = src.size(0)
        if tgt is not None:
            tgt_len = tgt.size(1)
        else:
            tgt_len = max_len

        # encode
        hidden, cell = self.encoder(src, src_lengths)  # hidden: (n_layers, batch, hidden_dim)
        # first input to decoder: <sos> for each example
        inputs = torch.full((batch_size, 1), fill_value=tgt_vocab.sos_idx, dtype=torch.long, device=self.device)
        outputs = torch.zeros(batch_size, tgt_len, len(tgt_vocab.itos), device=self.device)

        # hidden & cell already in shape (n_layers, batch, hidden_dim) as encoder returned
        for t in range(tgt_len):
            logits, hidden, cell = self.decoder(inputs, hidden, cell)
            outputs[:, t, :] = logits
            # decide next input
            if tgt is not None and random.random() < teacher_forcing_ratio:
                # teacher forcing: next input is actual next token
                inputs = tgt[:, t].unsqueeze(1)
            else:
                # use own predictions
                top1 = logits.argmax(dim=1).unsqueeze(1)
                inputs = top1
        return outputs

# ---------- Training / evaluation ----------
def compute_loss_and_update(model: Seq2Seq, batch, optimizer, criterion, device):
    src, src_lens, tgt, tgt_lens = batch
    src, src_lens, tgt = src.to(device), src_lens.to(device), tgt.to(device)
    optimizer.zero_grad()
    # run forward with teacher forcing
    outputs = model(src, src_lens, tgt=tgt, teacher_forcing_ratio=TEACHER_FORCING_RATIO, max_len=tgt.size(1))
    # outputs: (batch, tgt_len, vocab)
    # shift targets: inputs were <sos> + tokens, so targets are tokens including <eos> position
    # compute loss over all time steps
    # flatten
    out_flat = outputs.view(-1, outputs.size(-1))
    tgt_flat = tgt.view(-1)
    loss = criterion(out_flat, tgt_flat)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    return loss.item()

def evaluate_model(model: Seq2Seq, loader: DataLoader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            src, src_lens, tgt, tgt_lens = batch
            src, src_lens, tgt = src.to(device), src_lens.to(device), tgt.to(device)
            outputs = model(src, src_lens, tgt=tgt, teacher_forcing_ratio=0.0, max_len=tgt.size(1))
            out_flat = outputs.view(-1, outputs.size(-1))
            tgt_flat = tgt.view(-1)
            loss = criterion(out_flat, tgt_flat)
            total_loss += loss.item()
    return total_loss / len(loader)

def translate_sentence(model: Seq2Seq, sentence: str, src_vocab: Vocab, tgt_vocab: Vocab,
                       max_len=MAX_LEN, device="cpu") -> str:
    model.eval()
    tokens = simple_tokenize(sentence)
    ids = src_vocab.encode_seq(tokens, add_sos_eos=False)
    if len(ids) == 0:
        return ""
    ids = ids[:max_len]
    src_tensor = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
    src_len = torch.tensor([len(ids)], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model(src_tensor, src_len, tgt=None, teacher_forcing_ratio=0.0, max_len=max_len)
        # outputs shape: (1, max_len, tgt_vocab_size)
        preds = outputs.argmax(dim=2).squeeze(0).tolist()
    # convert tokens until eos
    words = []
    for idx in preds:
        if idx == tgt_vocab.eos_idx:
            break
        if idx == tgt_vocab.sos_idx or idx == tgt_vocab.pad_idx:
            continue
        words.append(tgt_vocab.decode(idx))
    return " ".join(words)

# ---------- Prepare data, build vocabs, model ----------
pairs = get_parallel_corpus()  # list of (src, tgt)
print(f"Loaded {len(pairs)} sentence pairs.")

# build token lists
src_tokens = []
tgt_tokens = []
for s, t in pairs:
    src_tokens.extend(simple_tokenize(s))
    tgt_tokens.extend(simple_tokenize(t))
# create vocabs
src_vocab = Vocab(src_tokens, min_freq=MIN_FREQ, max_size=MAX_VOCAB)
tgt_vocab = Vocab(tgt_tokens, min_freq=MIN_FREQ, max_size=MAX_VOCAB)
print("SRC vocab size:", len(src_vocab), "TGT vocab size:", len(tgt_vocab))

# split train/test
random.shuffle(pairs)
split = int(0.8 * len(pairs))
train_pairs = pairs[:split]
test_pairs = pairs[split:]

train_ds = ParallelDataset(train_pairs, src_vocab, tgt_vocab, max_len=MAX_LEN)
test_ds = ParallelDataset(test_pairs, src_vocab, tgt_vocab, max_len=MAX_LEN)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# instantiate models (encoder/decoder require vocab globals for padding token; we used src_vocab/tgt_vocab earlier)
encoder = Encoder(len(src_vocab), EMBED_DIM, HIDDEN_DIM, n_layers=NUM_LAYERS).to(DEVICE)
decoder = Decoder(len(tgt_vocab), EMBED_DIM, HIDDEN_DIM, n_layers=NUM_LAYERS).to(DEVICE)
model = Seq2Seq(encoder, decoder, device=DEVICE).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)

# ---------- Training loop ----------
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch} training"):
        loss = compute_loss_and_update(model, batch, optimizer, criterion, DEVICE)
        total_loss += loss
    avg_train_loss = total_loss / len(train_loader)
    val_loss = evaluate_model(model, test_loader, criterion, DEVICE)
    print(f"Epoch {epoch} Train Loss: {avg_train_loss:.4f}  Val Loss: {val_loss:.4f}")

# ---------- Examples ----------
examples = [
    "hello",
    "how are you",
    "this is a test",
    "i need help",
    "where is the bathroom"
]
print("\nTranslations (examples):")
for ex in examples:
    print("EN:", ex, " -> FR:", translate_sentence(model, ex, src_vocab, tgt_vocab, max_len=MAX_LEN, device=DEVICE))

# Save model/vocabs if desired
torch.save({
    "encoder_state": encoder.state_dict(),
    "decoder_state": decoder.state_dict(),
    "src_itos": src_vocab.itos,
    "tgt_itos": tgt_vocab.itos
}, "seq2seq_en_fr.pt")
print("Saved model to seq2seq_en_fr.pt")


Attempting dataset download via kagglehub: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
kagglehub download failed: Invalid dataset handle: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
Using small built-in toy parallel corpus (English -> French).
Loaded 15 sentence pairs.
SRC vocab size: 37 TGT vocab size: 37


Epoch 1 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1 Train Loss: 3.6769  Val Loss: 3.5429


Epoch 2 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2 Train Loss: 3.5204  Val Loss: 3.4205


Epoch 3 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3 Train Loss: 3.3308  Val Loss: 3.3324


Epoch 4 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 4 Train Loss: 3.2236  Val Loss: 3.2548


Epoch 5 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 5 Train Loss: 3.0831  Val Loss: 3.1680


Epoch 6 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 6 Train Loss: 2.8533  Val Loss: 3.1233


Epoch 7 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 7 Train Loss: 2.7753  Val Loss: 3.1595


Epoch 8 training:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 8 Train Loss: 2.6757  Val Loss: 3.1434

Translations (examples):
EN: hello  -> FR: 
EN: how are you  -> FR: 
EN: this is a test  -> FR: 
EN: i need help  -> FR: 
EN: where is the bathroom  -> FR: 
Saved model to seq2seq_en_fr.pt


In [1]:
# 🧠 LangChain: Prompt Creation, Message Chaining, and Chat History Simulation (No LLM Calls)

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

# ----------------------------------------
# 1️⃣ Define a Chat Prompt Template
# ----------------------------------------
messages_recipe = [
    ("system", "You are a helpful assistant specialized in {topic}."),
    ("user", "Hello! Can you tell me {number} interesting facts about {topic}?")
]

prompt_template = ChatPromptTemplate(messages=messages_recipe)

# Fill in placeholders using .invoke()
filled_prompt = prompt_template.invoke({"topic": "Artificial Intelligence", "number": 3})

print("🧾 Prompt Template → Messages:")
for msg in filled_prompt.to_messages():
    print(f"{msg.type.capitalize()}: {msg.content}\n")

# ----------------------------------------
# 2️⃣ Simulate a Multi-turn Chat
# ----------------------------------------
chat_history = [
    SystemMessage(content="You are a helpful assistant specialized in Artificial Intelligence."),
    HumanMessage(content="What is Artificial Intelligence?"),
    AIMessage(content="Artificial Intelligence is the simulation of human intelligence in machines."),
    HumanMessage(content="Give me an example of AI in daily life.")
]

# Simulate an AI's next response (manually generated, not via model)
simulated_ai_reply = "Examples include voice assistants like Siri or Alexa that understand and respond to human speech."
chat_history.append(AIMessage(content=simulated_ai_reply))

print("💬 Simulated Chat History:")
for i, msg in enumerate(chat_history, 1):
    print(f"{i:02d}. {msg.type.capitalize()}: {msg.content}")

# ----------------------------------------
# 3️⃣ Demonstrate Updating the Prompt Dynamically
# ----------------------------------------
new_topic = "Machine Learning"
updated_prompt = prompt_template.invoke({"topic": new_topic, "number": 2})
print("\n🔁 Updated Prompt for New Topic:")
for msg in updated_prompt.to_messages():
    print(f"{msg.type.capitalize()}: {msg.content}\n")


🧾 Prompt Template → Messages:
System: You are a helpful assistant specialized in Artificial Intelligence.

Human: Hello! Can you tell me 3 interesting facts about Artificial Intelligence?

💬 Simulated Chat History:
01. System: You are a helpful assistant specialized in Artificial Intelligence.
02. Human: What is Artificial Intelligence?
03. Ai: Artificial Intelligence is the simulation of human intelligence in machines.
04. Human: Give me an example of AI in daily life.
05. Ai: Examples include voice assistants like Siri or Alexa that understand and respond to human speech.

🔁 Updated Prompt for New Topic:
System: You are a helpful assistant specialized in Machine Learning.

Human: Hello! Can you tell me 2 interesting facts about Machine Learning?

