In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random
import re

# Step 1: Load dataset from Hugging Face
# The split expression "train[:1%]" asks for only a 1% slice of the "ag_news"
# training split, so the rest of the notebook works on a small demo subset.
dataset = load_dataset("ag_news", split="train[:1%]")  # Small subset for demo

# Step 2: Tokenize and clean text
def tokenize(text):
    """Lower-case ``text``, strip everything except letters, and split into words.

    Args:
        text: Raw input string.

    Returns:
        A list of lower-case tokens made only of the letters a-z. An empty or
        letter-free input yields an empty list.
    """
    # Keep *all* whitespace (not just the space character) while stripping
    # punctuation and digits: deleting newlines or tabs would glue the words on
    # either side together ("hello\nworld" -> "helloworld").
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    # str.split() with no argument splits on any whitespace run and drops
    # empty strings, so repeated separators are harmless.
    return letters_only.split()

# One token list per dataset example, taken from its "text" field.
tokenized_corpus = [tokenize(example["text"]) for example in dataset]

# Step 3: Build vocabulary
vocab = set(word for sentence in tokenized_corpus for word in sentence)
# Number the words in sorted order rather than in set-iteration order.
# Iteration order of a set of strings can change from one interpreter run to
# the next (string hashes are randomised per process), which would assign
# different indices each run and make previously saved embedding rows
# impossible to map back to words.
word2idx = {word: i for i, word in enumerate(sorted(vocab))}
idx2word = {i: word for word, i in word2idx.items()}
vocab_size = len(vocab)

# Step 4: Generate skip-gram pairs
def generate_skip_grams(sentences, context_size):
    """Build (center, context) word pairs for skip-gram training.

    Args:
        sentences: Iterable of token sequences (each must support ``len`` and
            integer indexing).
        context_size: Number of positions to look at on each side of the
            center word.

    Returns:
        A list of ``(center, context)`` tuples, ordered by sentence, then by
        center position, then by context position. A word is never paired
        with its own position.
    """
    skip_grams = []
    for tokens in sentences:
        sentence_length = len(tokens)
        for position, target in enumerate(tokens):
            # Clamp the window to the sentence boundaries.
            window_start = max(0, position - context_size)
            window_stop = min(sentence_length, position + context_size + 1)
            skip_grams.extend(
                (target, tokens[neighbor])
                for neighbor in range(window_start, window_stop)
                if neighbor != position
            )
    return skip_grams

# Window radius: how many neighbours on each side of a word count as context.
context_size = 2
pairs = generate_skip_grams(tokenized_corpus, context_size)
# Translate every (center, context) word pair into its vocabulary indices.
pairs_idx = [
    (word2idx[center_word], word2idx[context_word])
    for center_word, context_word in pairs
]

# Step 5: Define simple Word2Vec model
class Word2Vec(nn.Module):
    """Embedding table followed by a linear projection onto the vocabulary.

    Attributes:
        embeddings: ``nn.Embedding`` with ``vocab_size`` rows of width
            ``embed_dim``.
        linear: ``nn.Linear`` mapping an ``embed_dim`` vector to
            ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embed_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of distinct words (rows of the embedding table
                and width of the output layer).
            embed_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and project them.

        Args:
            x: Tensor of word indices accepted by ``nn.Embedding``.

        Returns:
            The output of the linear layer; its last dimension has size
            ``vocab_size``.
        """
        return self.linear(self.embeddings(x))

# Seed torch before the model is built so that weight initialisation and the
# per-epoch shuffling below are repeatable under Restart & Run All.
seed = 42
torch.manual_seed(seed)

embedding_dim = 100
model = Word2Vec(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 6: Training loop
epochs = 5
batch_size = 512

# Build the (num_pairs, 2) index tensor once instead of creating two tiny
# tensors per pair inside the loop. The explicit row count keeps the reshape
# valid even when there are no pairs at all.
pairs_tensor = torch.tensor(pairs_idx, dtype=torch.long).reshape(len(pairs_idx), 2)
num_pairs = pairs_tensor.shape[0]

for epoch in range(epochs):
    total_loss = 0.0
    # Shuffle through an index permutation so the global pairs_idx list is
    # left untouched (re-running this cell does not depend on earlier runs).
    permutation = torch.randperm(num_pairs)
    for start in range(0, num_pairs, batch_size):
        batch = pairs_tensor[permutation[start:start + batch_size]]
        center_tensor = batch[:, 0]
        context_tensor = batch[:, 1]

        output = model(center_tensor)
        loss = criterion(output, context_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion averages over the batch; weight by batch size so the
        # final partial batch does not skew the epoch mean.
        total_loss += loss.item() * batch.shape[0]

    # Report the mean loss per pair so the value is comparable across corpus
    # sizes; max(..., 1) avoids dividing by zero when there are no pairs.
    mean_loss = total_loss / max(num_pairs, 1)
    print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

# Step 7: Save embeddings
# NOTE(review): only the embedding layer's state dict is written; the
# word2idx mapping is needed separately to interpret the rows.
torch.save(model.embeddings.state_dict(), "custom_embeddings_hf.pt")

# Step 8: Retrieve embedding
def get_embedding(word):
    """Return the trained embedding vector for ``word``.

    Args:
        word: Vocabulary key to look up in ``word2idx``.

    Returns:
        A NumPy array holding the word's row of ``model.embeddings``, or
        ``None`` when the word is not in ``word2idx``.
    """
    row = word2idx.get(word)
    if row is None:
        return None
    # The lookup runs without gradient tracking, so the result can be
    # converted to NumPy directly.
    with torch.no_grad():
        vector = model.embeddings(torch.tensor(row))
    return vector.numpy()

print("\nSample embedding for 'news':", get_embedding("news"))


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

# Step 1: Get embeddings matrix
embedding_weights = model.embeddings.weight.detach().numpy()

# Step 2: Select subset of words for clarity in visualization
num_words_to_plot = 200  # limit to avoid clutter
selected_indices = list(range(min(num_words_to_plot, vocab_size)))
selected_embeddings = embedding_weights[selected_indices]
selected_labels = [idx2word[i] for i in selected_indices]

# Step 3: Dimensionality reduction with t-SNE
# scikit-learn requires perplexity to be smaller than the number of samples,
# so cap it for small vocabularies instead of failing with a fixed 30.
# (At least two words are still needed for a meaningful projection.)
perplexity = min(30, len(selected_indices) - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
reduced = tsne.fit_transform(selected_embeddings)

# Step 4: Plot
fig, ax = plt.subplots(figsize=(14, 10))
# Draw all points in one call rather than one scatter call per word.
ax.scatter(reduced[:, 0], reduced[:, 1], marker='o', color='blue')
for (x, y), label in zip(reduced, selected_labels):
    # Small offset keeps the label from sitting on top of its marker.
    ax.text(x + 0.1, y + 0.1, label, fontsize=9)

ax.set_title("Word Embeddings Visualization (t-SNE)")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
ax.grid(True)
plt.show()


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Chat model used for the final answer-generation step of the chain.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Prompt text with two placeholders, {question} and {context}, filled in by
# the chain below.
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: build the prompt inputs, render the prompt, call the model,
# then parse the model output to a string.
# NOTE(review): `retriever` is not defined in any cell visible here; it must
# exist before this cell runs, otherwise this line raises NameError.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()