In [None]:
import sys, subprocess

# Install required packages programmatically (works in Colab or local Python)
def pip_install(pkgs):
    """Install packages with pip, using the interpreter that runs this notebook.

    Runs ``<sys.executable> -m pip install -q <pkgs...>`` so the install
    targets the same Python that executes this code.

    Args:
        pkgs: Requirement specifiers to install. May be a list, tuple or any
            other iterable of strings; a single string is treated as one
            requirement. An empty collection is a no-op.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(pkgs, str):
        pkgs = [pkgs]
    pkgs = list(pkgs)

    if not pkgs:
        # `pip install` without any requirement exits with an error.
        print("Nothing to install.")
        return

    print("Installing:", pkgs)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])
    print("Done installing.")

# Third-party packages this notebook relies on: the LangChain stack, the Groq
# chat integration, sentence-transformers embeddings, the FAISS index, PDF
# parsing and .env support.
_NOTEBOOK_REQUIREMENTS = [
    "langchain",
    "langchain-core",
    "langchain-community",
    "langchain-text-splitters",
    "langchain-groq",
    "sentence-transformers",
    "faiss-cpu",
    "pypdf",
    "python-dotenv",
]

pip_install(_NOTEBOOK_REQUIREMENTS)


# **Chatbot Initialization:**

In [None]:
import os, textwrap
from typing import List
from langchain_core.documents import Document
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

# --- Groq API key ---
# Secrets must not live in the notebook source. The key is read from the
# GROQ_API_KEY environment variable, which has to be set before this cell runs
# (shell export, Colab secret, or an earlier cell doing
# `os.environ["GROQ_API_KEY"] = getpass.getpass()`). A key that is already
# configured is never overwritten.
# HARD_GROQ_API_KEY is kept only so existing references to the name still work.
HARD_GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not HARD_GROQ_API_KEY:
    print(
        "WARNING: GROQ_API_KEY is not set. "
        "Set it in the environment before creating the chat model."
    )

# --- Model and retrieval settings ---
CHAT_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"  # Groq chat model id
EMBED_MODEL = "all-MiniLM-L6-v2"  # embedding model name passed to HuggingFaceEmbeddings
CHUNK_SIZE = 800      # chunk_size given to the text splitter
CHUNK_OVERLAP = 120   # chunk_overlap given to the text splitter
TOP_K = 4             # number of chunks the retriever returns per question

# Core profile text used as fallback context (edit to your info).
# It is wrapped in a Document and indexed together with any uploaded files, so
# the retriever has profile content even when no CV file is loaded.
# NOTE: the text is embedded as-is; its exact wording is what gets retrieved.
personal_profile = """
Name: Mehdi Ben Fredj
Role: AI Engineering Student
Location: Tunis, Tunisie 
Email: mehdi.benfredj15@gmail.com
Skills: Python, LangChain, LLMOps, RAG, MLOps, FastAPI, Cloud (AWS/GCP)
Experience:
- Built RAG chatbots for personal/portfolio sites.
- Deployed LLM microservices behind FastAPI with autoscaling.
- Integrated PDF/TXT ingestion for personal knowledge bases.
- Build a Personal Portfolio
Education:
- TEK-UP University 2023-2028
Projects:
- Personal website chatbot answering from CV and portfolio.
- PDF QA system with multi-file ingestion.
"""


# **Load CV (For RAG):**

In [None]:
# Helpers

def build_text_chunks(raw_docs: List[Document]):
    """Split documents into overlapping chunks.

    The splitter is configured from the notebook-level ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` settings; the result is whatever
    ``RecursiveCharacterTextSplitter.split_documents`` returns for ``raw_docs``.
    """
    splitter_settings = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_settings)
    chunks = chunker.split_documents(raw_docs)
    return chunks


def load_profile_docs(profile_text: str) -> List[Document]:
    """Wrap the profile text in a one-element list of documents.

    The single ``Document`` carries ``profile_text`` as its content and is
    tagged with ``{"source": "profile"}`` in its metadata.
    """
    profile_doc = Document(
        page_content=profile_text,
        metadata={"source": "profile"},
    )
    return [profile_doc]


def docs_from_upload(paths: List[str]) -> List[Document]:
    """Load user-supplied ``.txt`` and ``.pdf`` files as documents.

    Each file that loads successfully becomes one ``Document`` whose
    ``metadata["source"]`` is the path exactly as it was given. Files that are
    unsupported, missing, or contain no text are reported with ``print`` and
    skipped, so one bad path does not abort the whole ingestion.

    Args:
        paths: File paths to ingest. A single string is treated as one path;
            ``None`` or an empty list yields ``[]``.

    Returns:
        One ``Document`` per successfully loaded file, in input order.
    """
    from pathlib import Path

    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, str):
        paths = [paths]

    docs = []
    for p in paths or []:
        path = Path(p)
        ext = path.suffix.lower()

        if ext not in {".txt", ".pdf"}:
            print(f"Skipping unsupported file: {p}")
            continue
        if not path.is_file():
            print(f"Skipping missing file: {p}")
            continue

        if ext == ".txt":
            # errors="ignore" drops undecodable bytes instead of failing.
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        else:
            # Imported lazily so text-only ingestion does not require pypdf.
            from pypdf import PdfReader

            reader = PdfReader(p)
            # extract_text() may give nothing for a page; treat that as "".
            text = "\n".join(page.extract_text() or "" for page in reader.pages)

        if not text.strip():
            # Nothing to index (e.g. every page of the PDF yielded no text).
            print(f"Skipping file with no extractable text: {p}")
            continue

        docs.append(Document(page_content=text, metadata={"source": p}))
    return docs


def build_vectorstore(all_docs: List[Document]) -> FAISS:
    """Embed the given documents and index them in a FAISS vector store.

    Embeddings come from ``HuggingFaceEmbeddings`` using the notebook-level
    ``EMBED_MODEL`` name.
    """
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    store = FAISS.from_documents(all_docs, embedding=embedder)
    return store


In [None]:
# Load documents and build vector store
# The helper functions are defined once, in the "Helpers" cell above. Copies
# re-defined here could silently drift from the originals, so instead fail
# fast with a clear message when that cell has not been run yet.
_required_helpers = (
    "load_profile_docs",
    "build_text_chunks",
    "build_vectorstore",
    "docs_from_upload",
)
_missing_helpers = [name for name in _required_helpers if name not in globals()]
if _missing_helpers:
    raise NameError(
        "Run the 'Helpers' cell first; undefined: " + ", ".join(_missing_helpers)
    )

# Files to ingest in addition to the built-in profile text.
# Put your CV path here (e.g., "/content/cv.pdf" in Colab)
user_files = ["sample_cv.pdf"]

# The profile text becomes one Document; uploaded files are loaded separately.
profile_docs = load_profile_docs(personal_profile)
uploaded_docs = docs_from_upload(user_files)

print(f"Loaded profile docs: {len(profile_docs)}")
print(f"Loaded uploaded docs: {len(uploaded_docs)} -> {user_files}")

# Split profile + uploads into chunks. Note: despite its name, `all_docs`
# holds the chunks produced by the splitter, not the original documents.
all_docs = build_text_chunks(profile_docs + uploaded_docs)
print(f"Total chunks: {len(all_docs)}")

# Embed and index the chunks; `vs` is what the chat cell builds its retriever from.
vs = build_vectorstore(all_docs)
print("Vector store ready.")


# **Build LLM + retriever and start chat:**

In [None]:
# Chat model, retriever and system prompt shared by every turn of the chat loop.

# Groq-hosted chat model selected by CHAT_MODEL.
llm = ChatGroq(temperature=0.1, model=CHAT_MODEL)

# Similarity search over the vector store, returning TOP_K chunks per query.
retriever = vs.as_retriever(
    search_kwargs={"k": TOP_K},
    search_type="similarity",
)

# Instructions sent as the system message; dedent() strips the indentation
# that is only there to keep the literal readable in the source.
system_prompt = textwrap.dedent(
    """
    You are Mehdi's secretary. Be concise and answer using the provided context (profile + docs).
    If you do not know, say you do not know. Mention the document source name when you can.
    """
)

def _format_context(docs) -> str:
    """Join retrieved chunks into one context string, labelling each with its source."""
    sections = []
    for d in docs:
        source = (getattr(d, "metadata", None) or {}).get("source", "unknown")
        sections.append(f"[Source: {source}]\n{d.page_content}")
    return "\n\n".join(sections)


def _format_history(turns) -> str:
    """Render (question, answer) pairs as alternating User/Assistant lines."""
    return "\n".join(f"User: {asked}\nAssistant: {replied}" for asked, replied in turns)


def chat(max_history_turns: int = 10):
    """Run an interactive question/answer loop on stdin/stdout.

    For each question the module-level ``retriever`` fetches context chunks,
    which are sent to the module-level ``llm`` together with ``system_prompt``
    and the recent chat history. The loop ends when the user types ``exit`` or
    ``quit``, or when input is closed or interrupted.

    Args:
        max_history_turns: How many of the most recent question/answer pairs
            are replayed in each prompt. Older turns are dropped so the prompt
            does not grow without bound; ``0`` or less disables history.
    """
    keep = max(int(max_history_turns), 0)
    chat_history = []
    print("Hello, I'm Mehdi's secretary. Ask me anything about Mehdi from his CV.")
    while True:
        try:
            q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session cleanly.
            print("\nBye!")
            break

        if q.lower() in {"exit", "quit"}:
            print("Bye!")
            break
        if not q:
            # Nothing was asked; do not spend a retrieval and an LLM call on it.
            continue

        docs = retriever.invoke(q)  # modern API
        # The system prompt asks the model to mention sources, so each chunk
        # is labelled with the source recorded in its metadata.
        context = _format_context(docs)
        history_text = _format_history(chat_history)

        user_prompt = f"""Context:\n{context}\n\nChat history (if any):\n{history_text}\n\nQuestion: {q}\nAnswer:"""

        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt),
        ]

        resp = llm.invoke(messages)
        answer = resp.content if hasattr(resp, "content") else str(resp)

        chat_history.append((q, answer))
        # One turn is added per iteration, so dropping at most one keeps the bound.
        if len(chat_history) > keep:
            chat_history.pop(0)

        print(f"Bot: {answer}\n")

# Start the interactive session; type "exit" or "quit" to leave the loop.
chat()
