In [3]:
# STEP 1: Install & Import Required Libraries
!pip install -q faiss-cpu sentence-transformers transformers accelerate python-docx PyMuPDF

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import faiss
from google.colab import files
import os
import docx
import fitz  # for PDF

# STEP 2: Upload file, chunk, embed, and build FAISS index
uploaded = files.upload()  # Upload your .txt, .docx, or .pdf file

# Taking the first key of an empty mapping would fail with an unhelpful error,
# so stop early with an actionable message when nothing was uploaded.
if not uploaded:
    raise ValueError("❌ No file was uploaded. Please re-run this cell and choose a .txt, .docx, or .pdf file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
# Lower-cased extension (including the leading dot) of the uploaded file.
ext = os.path.splitext(filename)[-1].lower()

def read_file(file_path):
    """Return the text content of a .txt, .docx, or .pdf file.

    The format is selected from the extension of ``file_path`` itself
    (case-insensitive), so the function does not depend on any module-level
    variable and gives the right result for any path it is called with.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The file's text. For .txt, the UTF-8 decoded content; for .docx,
        the non-blank paragraphs joined with newlines; for .pdf, the text of
        every page joined with newlines.

    Raises:
        ValueError: If the extension is not .txt, .docx, or .pdf.
    """
    file_ext = os.path.splitext(file_path)[-1].lower()
    if file_ext == ".txt":
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    elif file_ext == ".docx":
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    elif file_ext == ".pdf":
        # Open the PDF as a context manager so the document handle is closed
        # as soon as the page text has been extracted.
        with fitz.open(file_path) as pdf:
            return "\n".join(page.get_text() for page in pdf)
    else:
        raise ValueError("❌ Unsupported file format. Please upload a .txt, .docx, or .pdf file.")

# Extract the text of the uploaded file; read_file raises ValueError for
# extensions other than .txt, .docx, or .pdf.
full_text = read_file(filename)

# Split the text into consecutive blocks of at most `max_words` words (100 by default)
def chunk_text(text, max_words=100):
    """Group the whitespace-separated words of ``text`` into fixed-size chunks.

    Args:
        text: The string to split; words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk.

    Returns:
        list[str]: Chunks in document order, each a single-space-joined run of
        up to ``max_words`` words. The final chunk may be shorter. Text with
        no words produces an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(' '.join(window))
    return pieces

chunks = chunk_text(full_text)
print(f"✅ Total Chunks: {len(chunks)}")

# A file with no extractable text (for example an empty document or a scanned,
# image-only PDF) yields zero chunks. Stop here with a clear message instead
# of failing later with an opaque error when the embedding matrix is inspected.
if not chunks:
    raise ValueError("❌ No text could be extracted from the uploaded file, so there is nothing to index.")

# Embed using MiniLM
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# FAISS expects a C-contiguous float32 matrix with one row per chunk, so make
# that layout explicit rather than relying on the encoder's default output.
embeddings = np.ascontiguousarray(embedder.encode(chunks, convert_to_numpy=True), dtype=np.float32)

# Build FAISS index
dimension = embeddings.shape[1]  # embedding width, taken from the encoded matrix
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)
print("✅ FAISS Index ready!")


Saving final_1D.csv to final_1D.csv


ValueError: ❌ Unsupported file format. Please upload a .txt, .docx, or .pdf file.

In [2]:
# STEP 3: Ask Question + Generate Detailed Answer with Phi-2 (~300+ words)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Half precision is only appropriate on the GPU. On the CPU fallback, float16
# kernels are missing or very slow in many PyTorch builds, so load the weights
# in float32 there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load Phi-2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=model_dtype).to(device)
model.eval()  # inference mode: disables dropout and other training-only behavior

# Ask a question
user_question = input(" Ask a question from your notes: ").strip()
# A blank question would still retrieve an arbitrary chunk and produce a
# meaningless answer, so reject it up front.
if not user_question:
    raise ValueError("❌ Please enter a non-empty question.")

# Get top matching chunk
question_embedding = embedder.encode([user_question])
# FAISS searches float32 queries; D holds the distances and I the chunk indices.
D, I = faiss_index.search(np.asarray(question_embedding, dtype=np.float32), k=1)
best_match = int(I[0][0])
# FAISS reports -1 when it has no neighbor to return. Without this check
# chunks[-1] would silently select the last chunk instead.
if best_match < 0:
    raise ValueError("❌ The FAISS index returned no match. Rebuild the index from a document before asking a question.")
retrieved_chunk = chunks[best_match]

# Prompt Phi-2 with retrieved context
prompt = f"""You are a knowledgeable tutor. Using the context provided, write a very detailed and comprehensive answer to the question below. Make sure the answer is clear, complete, and at least 300 words long.

Context: {retrieved_chunk}

Question: {user_question}
Answer:"""

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
# Number of prompt tokens actually fed to the model (after any truncation);
# used below to separate the generated continuation from the prompt.
prompt_token_count = inputs["input_ids"].shape[1]

# Generate long answer (~300+ words = 500+ tokens)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=700,       # enough room for 300+ words
        # min_length counts prompt tokens too, so it did not guarantee a long
        # answer; min_new_tokens applies to the generated tokens only.
        min_new_tokens=450,       # ~300 words minimum
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        # Set explicitly so generate() does not have to fall back to the EOS
        # token and warn about it on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode and print
# Decode only the newly generated tokens. Slicing the decoded text by
# len(prompt) breaks when the prompt was truncated or does not round-trip
# through the tokenizer character-for-character.
generated_ids = outputs[0][prompt_token_count:]
answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
print("\n🧠 Phi-2 Detailed Answer:\n")
print(answer.strip())


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

 Ask a question from your notes: what is learning curve?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🧠 Phi-2 Detailed Answer:

A learning curve is a graphical representation of the performance of a model as it is trained on a dataset. It is a useful tool for diagnosing the quality of a model's training process, as well as for evaluating the model's ability to generalize to new data. 

The learning curve is typically plotted with the number of training examples used on the x-axis and the corresponding performance metric on the y-axis. As the model is trained with more examples, the learning curve should show an improvement in performance. If the performance does not improve with more training data, this may indicate that the model is overfitting to the training data and is not generalizing well to new data.

In addition to evaluating the performance of a model, learning curves can also be used to diagnose the quality of the training dataset. If the learning curve is flat or shows little improvement with more training data, this may indicate that the training dataset is not representat