In [None]:
import os
import torch
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook shows the selected device as the cell output.
device

In [None]:
#1. Prompt
from langchain import PromptTemplate

# Raw prompt text for the assistant. It exposes two placeholders, {context} and
# {question}. The trailing .strip() trims only the outer whitespace, so the
# interior lines keep their four-space indentation in the final string.
personal_prompt_template = """
    I'm your friendly AI assistant, here to provide information about my background, education, work experience, and beliefs. 
    Feel free to ask me any questions about myself, and I'll do my best to provide accurate and helpful answers.
    
    Context: {context}
    Question: {question}
    Answer:
    """.strip()

# Build a LangChain PromptTemplate from the raw template string.
PERSONAL_PROMPT = PromptTemplate.from_template(template=personal_prompt_template)
# Bare expression so the notebook displays the template object as the cell output.
PERSONAL_PROMPT



In [None]:
import fitz  # PyMuPDF
from unstructured.partition.md import partition_md

def load_pdf(file_path):
    """
    Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

# Extract the text of both PDF sources, in order: the resume first, then the
# LinkedIn profile export.
resume_text, linkedin_text = (
    load_pdf(pdf_name) for pdf_name in ("resume.pdf", "linkedin_profile.pdf")
)

def load_markdown(file_path):
    """
    Read a markdown file with unstructured and return its content as text.

    The file is partitioned into elements; each element is converted with
    str() and the results are joined with newline characters.
    """
    parsed_elements = partition_md(filename=file_path)
    element_texts = map(str, parsed_elements)
    return "\n".join(element_texts)

# Load the personal blog post as plain text.
# NOTE(review): the call is unconditional, so "personal_blog.md" presumably has
# to exist for this cell to run — confirm, or skip this source when it is absent.
blog_text = load_markdown("personal_blog.md")

In [None]:
# Pair each loaded text with the name of the file it came from.
documents = [
    {"content": loaded_text, "source": origin}
    for loaded_text, origin in (
        (resume_text, "resume.pdf"),
        (linkedin_text, "linkedin_profile.pdf"),
        (blog_text, "personal_blog.md"),
    )
]

In [None]:
# Show how many source documents were collected.
len(documents)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=700 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)

# Flatten every document into chunk records that remember their source file.
doc_chunks = [
    {"content": piece, "source": entry["source"]}
    for entry in documents
    for piece in text_splitter.split_text(entry["content"])
]

In [None]:
# Peek at the second chunk record (index 1) as a sanity check.
doc_chunks[1]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Set up the Sentence Transformers embedder used for indexing and querying
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
print("Embedding model initialized successfully!")

In [None]:
# Smoke-test the embedding model: embed one sentence and print the result.
text = "This is a test sentence."
embeddings = embedding_model.embed_query(text)
print(embeddings)

In [None]:
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Wrap each chunk record in a LangChain Document, carrying its source as metadata
docs = []
for record in doc_chunks:
    chunk_metadata = {"source": record["source"]}
    docs.append(Document(page_content=record["content"], metadata=chunk_metadata))

# Embed the documents and build the FAISS vector store from them
vector_store = FAISS.from_documents(docs, embedding_model)

# Persist the vector store locally as "personal_vector_store"
vector_store.save_local("personal_vector_store")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load the tokenizer and model.
# NOTE(review): `model_id` has no organisation prefix, so it presumably points
# at a local directory, and the tokenizer is loaded from a different checkpoint
# ("google/flan-t5-large") — confirm both are intended.
model_id = "fastchat-t5-3b-v1.0"
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")  # Use a compatible tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Create a text generation pipeline.
# The generation settings are passed as direct keyword arguments: `model_kwargs`
# is only consulted when the pipeline loads the model itself, so with an already
# instantiated `model` the values placed there never reached generation.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,  # Limit response length
    do_sample=True,  # Temperature only has an effect when sampling is enabled
    temperature=0.7,
    repetition_penalty=1.2,
)

# Wrap the pipeline in LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Create a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 chunks
    # Use the notebook's own prompt (it takes {context} and {question}) rather
    # than the chain's built-in default.
    chain_type_kwargs={"prompt": PERSONAL_PROMPT},
)

# Text clean-up helper
def clean_text(text):
    """Trim surrounding whitespace, then turn each newline or carriage return into a space."""
    newline_to_space = {ord("\n"): " ", ord("\r"): " "}
    return text.strip().translate(newline_to_space)



In [None]:
# Run one sample question through the RetrievalQA chain
question = "How old is Arunya P. Senadeera?"
try:
    response = qa_chain.run(question)
    # A single print call with a newline separator emits the same two lines
    print(f"Question: {question}", f"Answer: {response}", sep="\n")
except Exception as e:
    # Report the failure as a message instead of a traceback
    print(f"An error occurred: {e}")

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Conversation memory: stores the history under "chat_history" and returns it
# as a list of messages
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retriever that fetches the top 3 chunks from the vector store
conversational_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Conversational retrieval chain combining the LLM, the retriever and the memory
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=conversational_retriever,
    memory=memory,
)

# Try the chatbot with one question and print its answer
query = "What is my highest level of education?"
chain_inputs = {"question": query}
response = chain(chain_inputs)
print(response["answer"])