In [None]:
### Prompt I used: Mount my Google Drive. Then pip install pdfminer.six, sentence_transformers, faiss-cpu, transformers, bitsandbytes. Show only a '✅ Ready' print.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!pip -q install pdfminer.six sentence_transformers faiss-cpu transformers bitsandbytes
print('✅ Ready')

In [None]:
### Prompt I used: Use google.colab.files.upload() so I can drop a catalog PDF of any name. Save the first uploaded file's path in CATALOG_PATH and print the filename.
from google.colab import files, output
# Open the Colab upload dialog; the result maps each uploaded filename to its content.
uploaded = files.upload()
if uploaded:
    # Only the first uploaded file is used as the catalog.
    CATALOG_PATH = next(iter(uploaded))
    print(f"📄 Catalog loaded: {CATALOG_PATH}")
else:
    raise ValueError('No file uploaded.')

In [None]:
### Prompt I used: Read the PDF at CATALOG_PATH with pdfminer.high_level.extract_text. Split by the form-feed character '\f' to keep page boundaries and store in list pages. Print 'Loaded X pages'.
from pdfminer.high_level import extract_text
# Extract the whole PDF as one string, then split it back into pages.
raw_text = extract_text(CATALOG_PATH)
# The separator is spelled as an explicit escape: an empty separator makes
# str.split raise ValueError, and an invisible literal form feed is easy to lose.
pages = raw_text.split('\f')
# NOTE(review): pdfminer appears to terminate every page (including the last)
# with a form feed, which leaves one empty trailing entry and inflates the page
# count by one. Drop it, but never reduce the list to zero pages.
if len(pages) > 1 and pages[-1] == '':
    pages.pop()
print(f"📑 Loaded {len(pages)} pages")

In [None]:
### Prompt I used: Define make_chunks(pages, window=120, stride=80) that returns a list of dicts {text, page}. Use a sliding window of 'window' words every 'stride' words within each page. Then build the chunk list.
import re, json, math
def make_chunks(pages, window=120, stride=80):
    """Split each page into overlapping word windows.

    Args:
        pages: iterable of page texts; page numbers are assigned starting at 1
            in iteration order.
        window: maximum number of whitespace-separated words per chunk.
        stride: number of words the window advances between chunks.

    Returns:
        A list of dicts ``{'text': str, 'page': int}``. Every word of every
        page is covered by at least one chunk: a page shorter than ``window``
        yields a single short chunk, and a final partial window is emitted for
        words that do not fill a complete window. Pages without any words
        yield no chunks.

    Raises:
        ValueError: if ``window`` or ``stride`` is smaller than 1.
    """
    if window < 1 or stride < 1:
        raise ValueError('window and stride must be positive integers')
    chunks = []
    for idx, pg in enumerate(pages, start=1):
        words = re.findall(r"\S+", pg)
        if not words:
            continue
        start = 0
        while True:
            segment = ' '.join(words[start:start + window])
            chunks.append({'text': segment, 'page': idx})
            # Stop once this window has reached the end of the page; otherwise
            # keep sliding so trailing words are not dropped.
            if start + window >= len(words):
                break
            start += stride
    return chunks

# Build the retrieval chunks for the whole catalog.
chunks = make_chunks(pages)
# Fail fast with a clear message: with no chunks there is nothing to embed or
# index, and the later cells would otherwise fail in less obvious ways.
if not chunks:
    raise ValueError('No text chunks were produced from the PDF (is it a scanned/image-only file?).')
print(f"🧩 Created {len(chunks)} chunks (~{round(len(chunks)/len(pages),1)} per page)")

In [None]:
### Prompt I used: Load sentence-transformers/all-mpnet-base-v2. Encode each chunk, L2-normalise, save embeddings to catalog_embeds.npy and chunks list to chunks.json. Print sizes.
import numpy as np, torch, json
from sentence_transformers import SentenceTransformer
# Sentence encoder shared by the catalog chunks here and by user questions later.
model_emb = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
model_emb.max_seq_length = 512

# Encode the text of every chunk in one call, asking the encoder for
# normalised vectors, then store them as float32.
chunk_texts = [entry['text'] for entry in chunks]
embeddings = model_emb.encode(
    chunk_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    convert_to_numpy=True,
)
embeddings = embeddings.astype('float32')

# Persist both artefacts to Drive: the vectors and the chunk metadata they
# correspond to (same order).
with open('/content/drive/MyDrive/chunks.json', 'w') as f:
    json.dump(chunks, f)
np.save('/content/drive/MyDrive/catalog_embeds.npy', embeddings)

print(f"🔒 Saved {embeddings.shape[0]} embeddings → catalog_embeds.npy ({embeddings.nbytes/1e6:.1f} MB)")

In [None]:
### Prompt I used: If catalog.index exists in Drive, load it; else create a faiss.IndexFlatIP, add the embeddings, and write it to catalog.index. Print 'Index ready (N vectors)'.
import faiss, os
index_path = '/content/drive/MyDrive/catalog.index'
index = None
if os.path.exists(index_path):
    cached_index = faiss.read_index(index_path)
    # Only reuse the cached index when it matches the embeddings computed in
    # this session. An index left over from a different catalog would pair
    # search results with the wrong entries of `chunks`.
    # NOTE(review): equal size/dimension does not prove it is the same
    # catalog; delete catalog.index manually when in doubt.
    if cached_index.ntotal == embeddings.shape[0] and cached_index.d == embeddings.shape[1]:
        index = cached_index
if index is None:
    # Inner-product index built over the embeddings, then cached to Drive.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)
print(f"⚡ Index ready ({index.ntotal} vectors)")

In [None]:
### Prompt I used: Load 4-bit-quantized microsoft/phi-2 with bitsandbytes (device_map='auto'). Build a text-generation pipeline 'generate' with temperature 0.2, max_new_tokens 300. Print device.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2')
# 4-bit loading is requested through an explicit BitsAndBytesConfig; passing
# load_in_4bit directly to from_pretrained is the deprecated spelling.
model_llm = AutoModelForCausalLM.from_pretrained(
    'microsoft/phi-2',
    device_map='auto',
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    trust_remote_code=True,
)
# Text-generation pipeline used by ask(). do_sample=True is required for the
# temperature setting to take effect; without it decoding is greedy and the
# temperature value is ignored.
generate = pipeline(
    'text-generation',
    model=model_llm,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.2,
    max_new_tokens=300,
    repetition_penalty=1.1,
)
# Report where the first model parameter lives (with device_map='auto' other
# parameters may be placed elsewhere).
device = next(model_llm.parameters()).device
print(f"🤖 LLM loaded on {device}")

In [None]:
### Prompt I used: Implement ask(q, k=7): embed q, search FAISS, build prompt with retrieved chunks (showing page numbers), and answer with generate(). If max similarity < 0.25, reply: 'Συγγνώμη 🙏 – ρώτα με κάτι που υπάρχει στον πανεπιστημιακό κατάλογο! (Sorry – please ask something from the university catalog!)'.  Include course codes, titles, credits exactly as in context, and cite pages as '(see p. 123)'.
def ask(question, k=7, min_sim=0.25):
    """Answer a question from the catalog using retrieval + generation.

    Reads the notebook globals ``model_emb`` (question encoder), ``index``
    (FAISS index), ``chunks`` (list of ``{'text', 'page'}`` dicts aligned with
    the index) and ``generate`` (text-generation pipeline).

    Args:
        question: the user's question.
        k: number of chunks to retrieve from the index.
        min_sim: minimum score the best hit must reach; below it a fixed
            Greek/English apology is returned and the LLM is not called.

    Returns:
        The generated answer text with the prompt removed, or the apology
        string when nothing sufficiently similar was retrieved.
    """
    # 1. embed question
    q_vec = model_emb.encode(question, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
    # 2. search FAISS
    D, I = index.search(q_vec.reshape(1, -1), k)
    top_scores = D[0]
    top_ids = I[0]
    if len(top_scores) == 0 or top_scores[0] < min_sim:
        return (
            'Συγγνώμη 🙏 – ρώτα με κάτι που υπάρχει στον πανεπιστημιακό κατάλογο!\n'
            'Sorry 🙏 – please ask something from the university catalog!'
        )
    # 3. build context with page refs
    ctx_parts = []
    for idx in top_ids:
        idx = int(idx)
        # FAISS pads the result with id -1 when fewer than k vectors exist;
        # without this check -1 would silently select the last chunk.
        if idx < 0:
            continue
        chunk = chunks[idx]
        ctx_parts.append(f"[p.{chunk['page']}] {chunk['text']}")
    context = '\n'.join(ctx_parts)
    # 4. compose prompt
    prompt = (
        'You are UniCatalogBot. Answer ONLY with information found in the CONTEXT. '
        'List course codes, titles, credits & descriptions verbatim when relevant. '
        'Cite page numbers like (see p. 123). If answer not in context, apologise in Greek & English.\n\n'
        f'CONTEXT:\n{context}\n\n'
        f'QUESTION: {question}\n\nANSWER:'
    )
    # 5. generate
    result = generate(prompt)[0]['generated_text']
    # 6. strip the prompt part from the output. Prefer removing the exact
    #    prompt prefix: searching for 'ANSWER:' alone can match an earlier
    #    occurrence inside the retrieved context or the question.
    if result.startswith(prompt):
        return result[len(prompt):].strip()
    answer_start = result.find('ANSWER:')
    answer = result[answer_start + len('ANSWER:'):].strip() if answer_start != -1 else result
    return answer

In [None]:
### Prompt I used: Run ask('I am a Computer Science major. What required and elective courses do I need?') and print the answer.
# Run one representative question through ask() and show the answer.
demo_question = 'I am a Computer Science major. What required and elective courses do I need?'
print(ask(demo_question))

In [None]:
### Prompt I used: Create a simple loop: while True ask for input('📚> '); break on 'quit' or empty; else print ask().
# Interactive prompt: keep answering until the user types 'quit', submits an
# empty line, or the input stream ends.
chatting = True
while chatting:
    try:
        user_q = input('📚> ').strip()
    except EOFError:
        # Input stream closed: leave quietly without a goodbye message.
        chatting = False
    else:
        if user_q.lower() in {'quit', ''}:
            print('👋 Bye!')
            chatting = False
        else:
            print(ask(user_q))