# EDUTOR PROJECT BOOK PLANNING CLASSES


## Authentication for Gemini

In [1]:
import os

# Expose the credentials file location through the environment variable
# that the Google auth lookup in the next cell relies on.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=r"PATH_FOR_API_KEY")


In [2]:
import google.auth

# Resolve the default Google credentials and report the identity and project in use.
creds, project = google.auth.default()
# Not every credential type exposes `service_account_email`; fall back to a
# placeholder instead of raising AttributeError for those.
print("Authenticated as:", getattr(creds, "service_account_email", "<not a service account>"))
print("Project ID:", project)


Authenticated as: reto-pisa-up-25p@psa-sabia-llm-dv.iam.gserviceaccount.com
Project ID: psa-sabia-llm-dv


In [3]:
import google.generativeai as genai


# Instantiate the Gemini 1.5 Pro generative model client.
model = genai.GenerativeModel("models/gemini-1.5-pro-latest")

# Disabled smoke test: send a short prompt (Spanish for "write a text about
# how AI works") and print the reply.
# response = model.generate_content("escribe un texto sobre como la ia funciona.")
# print(response.text)


  from .autonotebook import tqdm as notebook_tqdm


## Input PDF

In [None]:
import fitz  # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, joined with newlines.

    Args:
        pdf_path: Location of the PDF file, handed to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` result, with a
        newline between consecutive pages.
    """
    # The context manager closes the document even if text extraction
    # fails part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

# Read the whole book into a single string from the local PDF copy.
full_text = extract_text_from_pdf(
    pdf_path=r"C:\Users\Trabajo\Documents\Proyecto_terminal\Edutor\Books\storytelling-with-data-cole-nussbaumer-knaflic.pdf"
)


### Split into chunks (500 characters each, with a 100-character overlap):

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the book text into overlapping chunks: size 500 with an overlap of 100,
# measured by the splitter's length function (character count by default).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
docs = splitter.create_documents(texts=[full_text])


## Create Embeddings and Store Them

### Local Transformers (temporary option)

#### TF-IDF

TF-IDF narrows down the top 50–100 candidate chunks (fast, keyword-based).

Embeddings rank the best ones (accurate, semantic-based).

Top k are passed to Gemini for planning.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Raw text of every chunk produced by the splitter, in their original order
texts = list(chunk.page_content for chunk in docs)

# Learn the TF-IDF vocabulary (English stop words removed) and vectorize
# all chunks in a single pass
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(texts)


#### TF-IDF Pre-filtering

In [8]:
query = "Create a semester plan for teaching this book"

# Map the query into the fitted TF-IDF space and score it against every chunk
query_vec = vectorizer.transform([query])
similarities = cosine_similarity(query_vec, tfidf_matrix).ravel()

# Indices of the 50 best-scoring chunks, highest score first
top_tfidf_indices = similarities.argsort()[-50:][::-1]
candidate_chunks = [texts[idx] for idx in top_tfidf_indices]


#### Embeddings with TF-IDF implementation

In [6]:
from google.cloud import aiplatform
import numpy as np
import faiss
from vertexai.language_models import TextEmbeddingModel
import vertexai

# # Load the embedding model
# embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@latest") 

# # Prepare your document texts
# texts = [doc.page_content for doc in docs]

# # Get embeddings
# embeddings = [embedding.values for embedding in embedding_model.get_embeddings(texts)]

# # Convert to numpy and store in FAISS
# embeddings_np = np.array(embeddings)
# dimension = embeddings_np.shape[1]

# index = faiss.IndexFlatL2(dimension)
# index.add(embeddings_np)

# # Optional mapping to recover text later
# text_index_map = {i: texts[i] for i in range(len(texts))}

# # Save FAISS index
# faiss.write_index(index, "faiss_index/index.bin")

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the query (as a batch of one) and then the TF-IDF candidates,
# holding both results as NumPy arrays
query_embedding = np.array(embedding_model.encode([query])[0])
chunk_embeddings = np.array(embedding_model.encode(candidate_chunks))

# Cosine similarity: dot product divided by the product of the vector norms
chunk_norms = np.linalg.norm(chunk_embeddings, axis=1)
query_norm = np.linalg.norm(query_embedding)
cosine_scores = np.dot(chunk_embeddings, query_embedding) / (chunk_norms * query_norm)


## Define Your RAG Prompt

### Rank the candidate chunks by embedding similarity (the FAISS index is not used yet)

In [10]:
# Pick the five candidates with the highest cosine score, best match first
top_semantic_indices = np.argsort(cosine_scores)[-5:][::-1]
relevant_chunks = list(candidate_chunks[idx] for idx in top_semantic_indices)

# Join them, separated by blank lines, into the context block for the prompt
context = "\n\n".join(relevant_chunks)


### Build the Prompt for Gemini

In [13]:
# Planning prompt: embeds the retrieved excerpts (`context`) and asks for a
# 60-day plan with eight labelled fields per day, restricted to book content.
prompt = f"""
You are an expert high school curriculum planner.

Using ONLY the following excerpts from the book "Storytelling with Data":

{context}

Design a complete 60-day semester plan for teaching this book.

For each day, provide the following:
1. **Day Number**
2. **Topic Title** – what specific concept will be covered that day.
3. **Book Reference** – which chapter, section, or excerpt the content is based on.
4. **Learning Objective** – what students should understand or be able to do by the end of the lesson.
5. **Elaboration Points** – bullet points with key ideas or takeaways the teacher should elaborate on during the lesson.
6. **Suggested Activities** – classroom exercises, discussions, or visual examples.
7. **Homework** (optional) – short assignment based on the day's topic.
8. **Why This Day Matters** – explain briefly why this content appears at this point in the sequence.

Guidelines:
- Keep the content aligned strictly to the book. Do NOT use external or invented topics.
- Elaborate clearly on each point so a new teacher could confidently teach the material.
- Use clear, practical language.
- Avoid listing vague or generic ideas. Ground every suggestion in the book content.
"""


### Send to Gemini 1.5 Pro

In [14]:
from google.generativeai import GenerativeModel

# Create the Gemini client, submit the planning prompt and print the generated plan.
genai_model = GenerativeModel("models/gemini-1.5-pro-latest")
response = genai_model.generate_content(contents=prompt)
print(response.text)


## 60-Day Semester Plan for "Storytelling with Data"

**Part 1: Foundations (Days 1-15)**

**Day 1:** Introduction to Data Storytelling
* **Book Reference:** Introduction, "How this book is organized," "storytellingwithdata.com" excerpt
* **Learning Objective:** Students will understand the importance of data storytelling and the structure of the book.
* **Elaboration Points:**
    * Data visualization is about more than just charts; it's about communicating insights.
    * The book emphasizes practical application over pure theory.
    * The sequential nature of the chapters builds upon previous concepts.
* **Suggested Activities:** Class discussion: Why is data storytelling important in different fields? Brainstorm real-world examples.
* **Why This Day Matters:** Sets the stage for the course and introduces the core philosophy of the book.

**Day 2:** The Storytelling with Data Process
* **Book Reference:** "How this book is organized"
* **Learning Objective:** Students will grasp th

# EDUTOR QUIZ GENERATOR

## Build the Quiz Prompt

In [19]:
def build_quiz_prompt(
    response_quiz: str,
    depth_level: int,
    *,
    course_title: str = "Storytelling with Data",
    total_days: int = 60,
) -> str:
    """Build the prompt that asks the model for two midterm quizzes.

    Args:
        response_quiz: Text of the lesson plan the quizzes must be based on.
        depth_level: Cognitive depth from 1 (basic recall) to 10 (full
            application); selects which instruction tier applies.
        course_title: Course name quoted in the prompt. The default keeps
            the original wording.
        total_days: Length of the lesson plan in days. The first midterm
            covers days 1 to ``total_days // 2`` and the second covers the
            remainder. The default of 60 keeps the original 1–30 / 31–60 split.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``depth_level`` is outside 1–10 or ``total_days`` is
            smaller than 2.
    """
    # The instruction tiers below only cover 1–10; anything else would give
    # the model a level with no matching rule.
    if not 1 <= depth_level <= 10:
        raise ValueError(f"depth_level must be between 1 and 10, got {depth_level!r}")
    # Two midterms need at least one day each.
    if total_days < 2:
        raise ValueError(f"total_days must be at least 2, got {total_days!r}")

    midpoint = total_days // 2
    return f"""
You are a high school assessment designer.

A user has input a cognitive depth level from 1 (basic recall) to 10 (full application). For this case, the level is: **{depth_level}**

Based on the following {total_days}-day lesson plan for the course "{course_title}", generate two midterm quizzes:

Lesson Plan:
{response_quiz}

Generate:
- **Midterm 1 Quiz**: Based on content from Days 1–{midpoint}.
- **Midterm 2 Quiz**: Based on content from Days {midpoint + 1}–{total_days}.

Each quiz should include:
1. 10 multiple-choice questions (with 4 answer options labeled A–D)
2. An answer key at the end
3. Each question should reference the specific **day number** it’s based on

Instructions:
- If the level is **1–3**, write questions and answers that are **directly pulled from the lesson content** (basic recall).
- If the level is **4–7**, include questions that ask students to **interpret or explain concepts** with mild application.
- If the level is **8–10**, write questions that require **higher-order thinking**, such as applying concepts in new situations, comparing alternatives, or making design decisions — **not directly answerable by copying text**.

Ensure that:
- All questions stay grounded in the lesson plan (don’t invent unrelated topics).
- The complexity matches the selected level.
"""


In [20]:
# Keep asking until the user supplies a whole number in the 1–10 range the
# quiz prompt is written for; non-numeric or out-of-range input re-prompts.
while True:
    try:
        depth = int(input("Choose quiz depth (1–10): "))
    except ValueError:
        print("Please enter a whole number between 1 and 10.")
        continue
    if 1 <= depth <= 10:
        break
    print("Depth must be between 1 and 10.")

Choose quiz depth (1–10):  5


In [21]:
from google.generativeai import GenerativeModel

# Build the quiz prompt from the generated lesson plan and the chosen depth,
# then ask Gemini for the two midterm quizzes and print them.
genai_model = GenerativeModel("models/gemini-1.5-pro-latest")
quiz_prompt = build_quiz_prompt(response_quiz=response.text, depth_level=depth)
response_quizz = genai_model.generate_content(contents=quiz_prompt)
print(response_quizz.text)


## Storytelling with Data - Midterm 1 Quiz (Days 1-30) - Cognitive Level 5

**Instructions:** Choose the best answer for each multiple-choice question.

1. **(Day 1)** Which of the following best describes the core philosophy of the "Storytelling with Data" approach?
    A) Prioritizing complex charting techniques over clear communication.
    B) Emphasizing theoretical models of data visualization.
    C) Focusing on practical application and conveying insights.
    D) Presenting data in a purely objective and unemotional manner.

2. **(Day 2)** Why is understanding the "Storytelling with Data" process crucial?
    A) It allows students to skip ahead to advanced topics.
    B) It provides a structured approach to building upon concepts sequentially.
    C) It encourages rote memorization of data visualization techniques.
    D) It allows for a more random and flexible learning experience.

3. **(Day 3)**  Effective communication of data insights starts with:
    A) Selecting the most 