In [8]:
import os
import importlib
import tiktoken
import torch
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from torch.utils.data import Dataset, DataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from classes import GPTDatasetV1
import fitz  # PyMuPDF
# Load PDF
doc = fitz.open(r"C:\Users\ASISH\Downloads\LLM\wharton_verdict.pdf")

# Extract all text
text = ""
for page in doc:
    text += page.get_text()





In [9]:
# DataLoader for GPT
def create_dataloader_v1(text, batch_size=4, max_length=200, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")
    # Create dataset
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

X=create_dataloader_v1(text=text, batch_size=4, max_length=200, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0)



In [16]:
# embedding
def embedding(dataloader):

    # token embedding
    data_iter = iter(dataloader)
    input_batch, target_batch = next(data_iter)
    # GPT-2 vocab size
    vocab_size = 50257 
    # typical embedding size 
    embedding_dim = 768  
    torch.manual_seed(123)
    token_embedding_layer = torch.nn.Embedding(vocab_size, embedding_dim)
    token_embeddings = token_embedding_layer(input_batch)

    #positional embedding
    max_length = 200
    context_length = max_length
    pos_embedding_layer = torch.nn.Embedding(context_length,embedding_dim)
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))
    
    input_embeddings = token_embeddings + pos_embeddings

    return input_embeddings

Y=embedding(X)



In [17]:
d_in = embedding(X).shape[2] #B 768
d_out = 500 #C whatever i want based on model

In [13]:
import torch.nn as nn
class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length,
                 dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout) # New
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New

    def forward(self, x):
        b, num_tokens, d_in = x.shape # New batch dimension b
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2) # Changed transpose
        attn_scores.masked_fill_(  # New, _ ops are in-place
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)  # `:num_tokens` to account for cases where the number of tokens in the batch is smaller than the supported context_size
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights) # New

        context_vec = attn_weights @ values
        return context_vec

In [35]:
# casual attention 

def self_attention (input_embeddings):
    seq_len = 20 
    input_embeddings = torch.randn(seq_len, d_in)  # Shape: (seq_len, d_in)
    batch = torch.stack((input_embeddings, input_embeddings), dim=0)

    torch.manual_seed(123)
    context_length = batch.shape[1]
    ca = CausalAttention(d_in, d_out, context_length=context_length, dropout=0.1)
    context_vecs = ca(batch)   
    return context_vecs

Z=self_attention(Y)
            


In [36]:
Z


tensor([[[ 2.2321e+00,  2.7056e-01,  1.2016e+00,  ...,  1.1351e-01,
           2.1959e-01, -1.7183e+00],
         [ 1.6240e+00,  3.8437e-02,  6.6129e-01,  ...,  1.5938e-01,
          -3.0147e-02, -1.4106e+00],
         [ 4.3470e-01, -5.4729e-02,  1.0739e-01,  ...,  3.5351e-02,
          -1.9121e-01, -4.4711e-01],
         ...,
         [ 2.1222e-01,  6.1133e-02, -1.5091e-01,  ...,  1.4022e-01,
          -1.1286e-01,  6.0857e-02],
         [ 1.4445e-01,  5.6152e-02, -1.0781e-01,  ...,  1.6612e-01,
          -1.6136e-01, -6.5314e-02],
         [ 1.8809e-01,  1.3874e-01, -8.1277e-04,  ...,  3.4327e-01,
          -2.2253e-02,  1.3157e-01]],

        [[ 2.2321e+00,  2.7056e-01,  1.2016e+00,  ...,  1.1351e-01,
           2.1959e-01, -1.7183e+00],
         [ 1.6240e+00,  3.8437e-02,  6.6129e-01,  ...,  1.5938e-01,
          -3.0147e-02, -1.4106e+00],
         [ 1.0430e+00,  1.8998e-02,  4.3483e-01,  ...,  6.6284e-02,
          -1.3137e-01, -9.1536e-01],
         ...,
         [ 1.3341e-01,  6

In [38]:
# vector db

import faiss
import torch
import numpy as np
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document

def vector_db(context_vecs):
    # Convert torch.Tensor or list of vectors to numpy float32 array
    if isinstance(context_vecs, torch.Tensor):
        context_vecs = context_vecs.detach().cpu().numpy()
    elif isinstance(context_vecs, list):
        context_vecs = np.array(context_vecs)
    
    context_vecs = context_vecs.astype("float32")

    # Validate shape
    if context_vecs.ndim != 2:
        raise ValueError(f"Expected 2D array (n, d), but got shape {context_vecs.shape}")

    # Create dummy documents
    documents = [Document(page_content=f"doc_{i}") for i in range(len(context_vecs))]

    # Create FAISS index and add vectors
    index = faiss.IndexFlatL2(context_vecs.shape[1])
    index.add(context_vecs)

    # Build docstore and mapping
    index_to_docstore_id = {i: str(i) for i in range(len(documents))}
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})

    # Return FAISS vectorstore
    vector_store = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

    return vector_store


In [39]:
print(type(Z))
print(np.array(Z).shape)

<class 'numpy.ndarray'>
(2, 20, 500)


In [40]:
def vector_db(context_vecs):
    if isinstance(context_vecs, torch.Tensor):
        context_vecs = context_vecs.detach().cpu().numpy().astype("float32")
    elif isinstance(context_vecs, list):
        context_vecs = np.array(context_vecs).astype("float32")

    if context_vecs.ndim != 2:
        raise ValueError(f"Expected 2D array (n, d), but got shape {context_vecs.shape}")

    # Create dummy docs
    from langchain.docstore.document import Document
    documents = [Document(page_content=f"doc_{i}") for i in range(len(context_vecs))]

    import faiss
    index = faiss.IndexFlatL2(context_vecs.shape[1])
    index.add(context_vecs)

    from langchain.docstore import InMemoryDocstore
    from langchain.vectorstores import FAISS

    index_to_docstore_id = {i: str(i) for i in range(len(documents))}
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})

    return FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)


In [41]:
vector_db(Z)


ValueError: Expected 2D array (n, d), but got shape (2, 20, 500)

In [42]:
if isinstance(Z, torch.Tensor):
    context_vecs = Z.detach().cpu().numpy()


In [44]:
context_vecs = Z.reshape(-1, Z.shape[-1])  # (2*20, 500) → (40, 500)


In [46]:
context_vecs.shape

(40, 500)

In [48]:
def vector_db(context_vecs):
    if isinstance(context_vecs, torch.Tensor):
        context_vecs = context_vecs.detach().cpu().numpy()

    context_vecs = context_vecs.reshape(-1, context_vecs.shape[-1])

    documents = [Document(page_content="dummy")] * len(context_vecs)  # Replace later
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
    index_to_docstore_id = {i: str(i) for i in range(len(documents))}

    index = faiss.IndexFlatL2(context_vecs.shape[1])
    index.add(context_vecs)

    vector_store = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)
    return vector_store


In [50]:
vector_db(context_vecs,embedding=embedding)

TypeError: vector_db() got an unexpected keyword argument 'embedding'