In [1]:
import os
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
#1. Prompt
from langchain import PromptTemplate

personal_prompt_template = """
    I'm your friendly AI assistant, here to provide information about my background, education, work experience, and beliefs. 
    Feel free to ask me any questions about myself, and I'll do my best to provide accurate and helpful answers.
    
    Context: {context}
    Question: {question}
    Answer:
    """.strip()

PERSONAL_PROMPT = PromptTemplate.from_template(template=personal_prompt_template)
PERSONAL_PROMPT



PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="I'm your friendly AI assistant, here to provide information about my background, education, work experience, and beliefs. \n    Feel free to ask me any questions about myself, and I'll do my best to provide accurate and helpful answers.\n    \n    Context: {context}\n    Question: {question}\n    Answer:")

In [3]:
import fitz  # PyMuPDF
from unstructured.partition.md import partition_md

def load_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Load your resume
resume_text = load_pdf("resume.pdf")

# Load LinkedIn profile
linkedin_text = load_pdf("linkedin_profile.pdf")

def load_markdown(file_path):
    """
    Load text from a markdown file using unstructured.
    """
    elements = partition_md(filename=file_path)
    return "\n".join([str(el) for el in elements])

# Load your personal blog (if applicable)
blog_text = load_markdown("personal_blog.md")

In [4]:
documents = [
    {"content": resume_text, "source": "resume.pdf"},
    {"content": linkedin_text, "source": "linkedin_profile.pdf"},
    {"content": blog_text, "source": "personal_blog.md"},  
]

In [5]:
len(documents)

3

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100
)

doc_chunks = []
for doc in documents:
    chunks = text_splitter.split_text(doc["content"])
    for chunk in chunks:
        doc_chunks.append({"content": chunk, "source": doc["source"]})

In [7]:
doc_chunks[1]

{'content': 'Asian Institute of Technology \nBangkok, Thailand \nM. Eng. in Mechatronics \n2015 - 2017 \n• \nThesis: “Sensorless Terrain Estimation  and Longitudinal Acceleration Suppression for a Wheeled Mobile \nRobot”,[PDF] \n• \nSupervisor: Dr. A. M. Harsha S. Abeykoon \n• \nAwards: AIT Fellowship \n \nAsian Institute of Technology \nBangkok, Thailand \nB.Sc. in Engineering - Mechatronics \n2009 - 2013 \n• \nThesis: “Motion Detection and Target tracking using a Pan-Tilt Camera” \n• \nSupervisor: Prof. Manukid Pranichkun \n• \nRank: Second Class-Upper Division \n \nEmployment \nUniversity of Moratuwa, Department of Electrical Engineering \nKatubedda, Sri Lanka \nLecture (Career break from AIT, Thailand)',
 'source': 'resume.pdf'}

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize Sentence Transformers embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model initialized successfully!")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Embedding model initialized successfully!


In [9]:
# Test the embedding model
text = "This is a test sentence."
embeddings = embedding_model.embed_query(text)
print(embeddings)

[0.08429645001888275, 0.05795370042324066, 0.004493384622037411, 0.10582107305526733, 0.00708338338881731, -0.01784462481737137, -0.01688799262046814, -0.015228294767439365, 0.040473103523254395, 0.033422552049160004, 0.10432764142751694, -0.04703591763973236, 0.006884727627038956, 0.04101794213056564, 0.018711984157562256, -0.04149234667420387, 0.023647490888834, -0.056501924991607666, -0.033696211874485016, 0.050990939140319824, 0.06930320709943771, 0.05478423833847046, -0.00978838186711073, 0.02369716763496399, 0.019996603950858116, 0.009717307053506374, -0.058899134397506714, 0.007307387888431549, 0.047026533633470535, -0.004510192666202784, -0.055799663066864014, -0.004159401170909405, 0.06475706398487091, 0.04807629808783531, 0.01702086813747883, -0.0031833983957767487, 0.05740240216255188, 0.035231851041316986, -0.0058838739059865475, 0.014832890592515469, 0.011576258577406406, -0.10748074948787689, 0.019104115664958954, 0.022085731849074364, 0.010864544659852982, 0.003781967097

In [10]:
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Convert chunks to LangChain Document format
docs = [Document(page_content=chunk["content"], metadata={"source": chunk["source"]}) for chunk in doc_chunks]

# Create vector store
vector_store = FAISS.from_documents(docs, embedding_model)

# Save the vector store locally
vector_store.save_local("personal_vector_store")

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load the tokenizer and model
model_id = "fastchat-t5-3b-v1.0"
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")  # Use a compatible tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Create a text generation pipeline
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,  # Limit response length
    model_kwargs={
        "temperature": 0.7,
        "repetition_penalty": 1.2
    }
)

# Wrap the pipeline in LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Create a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 chunks
)

# Clean and ask a question
def clean_text(text):
    return text.strip().replace("\n", " ").replace("\r", " ")



Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [12]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Step 6: Initialize Memory for Conversation History
memory = ConversationBufferMemory(
    memory_key="chat_history",  # Key to store chat history
    return_messages=True  # Return chat history as a list of messages
)

# Step 7: Create the Conversational Retrieval Chain
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,  # Language model (HuggingFacePipeline)
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 chunks
    memory=memory  # Add memory for conversation history
)

# Test the chatbot
query = "How old is Arunya P. Senadeera?"
response = chain({"question": query})
print(response["answer"])

  memory = ConversationBufferMemory(
  response = chain({"question": query})
Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors


Answer: Arunya P. Senadeera is 30 years old.


Save the model, save the model, tokenizer, and pipeline for dash application

In [13]:
%%script False
# Save the tokenizer and model
model.save_pretrained("saved_model_HugingFace")
tokenizer.save_pretrained("saved_tokenizer_HugingFace")

# Save the pipeline
import pickle
with open("saved_pipeline_HugingFace.pkl", "wb") as f:
    pickle.dump(pipe, f) 

Couldn't find program: 'False'


In [17]:
import json

# --------------------- Questions to Ask ---------------------
questions = [
    "How old is Arunya?",
    "What is your highest level of education?",
    "What major or field of study did you pursue during your education?",
    "How many years of work experience do you have?",
    "What type of work or industry have you been involved in?",
    "Can you describe your current role or job responsibilities?",
    "What are your core beliefs regarding the role of technology in shaping society?",
    "How do you think cultural values should influence technological advancements?",
    "As a student, what is the most challenging aspect of your studies so far?",
    "What specific research interests or academic goals do you hope to achieve during your time as a student?"
]

# --------------------- Ask Questions and Store Results ---------------------
results = []

for question in questions:
    response = chain({"question": question})
    answer = response.get("answer", "No answer found.")
    results.append({
        "question": question,
        "answer": answer
    })

# --------------------- Print Results in JSON Format ---------------------
print(json.dumps(results, indent=4))

[
    {
        "question": "How old is Arunya?",
        "answer": "Answer: Arunya P. Senadeera is 30 years old."
    },
    {
        "question": "What is your highest level of education?",
        "answer": "Your highest level of education is Doctoral Student , Data Science & Artificial Intelligence"
    },
    {
        "question": "What major or field of study did you pursue during your education?",
        "answer": "Answer: Electronics and Computer Science"
    },
    {
        "question": "How many years of work experience do you have?",
        "answer": "7 years 8 months"
    },
    {
        "question": "What type of work or industry have you been involved in?",
        "answer": "As an academic researcher"
    },
    {
        "question": "Can you describe your current role or job responsibilities?",
        "answer": "As a Senior Research Associate at the Asian Institute of Technology, my current role is to develop and manage the Advanced Telecommunication Laboratories and