# NOTE: Put your Gemini API key in the secrets with the name "GEMINI-API-KEY" and upload the first file of gem extraction "01 - u1_Past Simple.txt"

In [None]:
!pip install -qU langchain langchain_core langchain_google_genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.8/156.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
from langchain.agents import create_agent
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chat_models import init_chat_model
from dataclasses import dataclass

## Initializing the model

In [6]:
# Importing gemini api key from secrets.
from google.colab import userdata

In [None]:
import getpass
import os

# if "GOOGLE_API_KEY" not in os.environ:
#     os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI-API-KEY')

True

In [None]:
# Chat model used by the step-by-step cells below.
# temperature=0 asks for the least random sampling; max_tokens=None and
# timeout=None pass no explicit limit (presumably the library defaults then
# apply -- TODO confirm against the langchain_google_genai docs).
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# **Main Ideas Extraction**

## Setting response format

In [214]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser

In [None]:
class MainIdea(BaseModel):
    # One extracted concept: a short label plus a self-contained explanation.
    # The `description` texts become part of the JSON schema, which the
    # output parser's format instructions embed in the prompt, so they are
    # effectively prompt text. Comments are used here instead of a class
    # docstring because a docstring would also be added to that schema.
    name: str = Field(..., description="Main idea name")
    summary: str = Field(
        ...,
        description=(
            "Detailed full-sentence summary explaining the concept, its relevance, "
            "any examples or applications, its connections to other ideas, "
            "and its role in understanding the material."
        ),
    )

class MainIdeas(BaseModel):
    # Top-level schema for the reduce stage: a list of MainIdea entries.
    # The field shares the class's name; later cells read it as
    # `reduce_output.MainIdeas` / `reduced.MainIdeas`, so renaming it would
    # break them.
    MainIdeas: List[MainIdea]

## Setting prompt

In [198]:
class StagePrompt(BaseModel):
    """Pair of system/human message texts describing one pipeline stage."""
    system: str  # text used for the ("system", ...) message of the stage
    human: str  # text used for the ("human", ...) message; carries "{context}"

In [199]:
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Prompt for the extraction stage: asks the model for a bullet-point concept
# map of a single excerpt. Each argument is built from adjacent string
# literals, which Python concatenates into one string. "{context}" is kept as
# a literal placeholder; it is filled in when the prompt template built from
# this object is invoked with a {"context": ...} mapping.
extrct_prompt = StagePrompt(
    system="Instructions:\n"
    "You are an expert educator specializing in creating detailed concept maps from academic texts. Given the following excerpt from a longer document, extract the main ideas, detailed concepts, and supporting details that are critical to understanding the material.\n"
    "Focus on identifying:\n\n"
    "- Key concepts or terms introduced in the text.\n"
    "- Definitions or explanations of these concepts.\n"
    "- Relationships between concepts.\n"
    "- Any examples or applications mentioned.\n"
    "Use clear, bullet-point summaries, organized by topic. Never add any commentary\n"
    "",
    human="Here is the excerpt:\n"
    "Context:\n"
    "{context}"
)

In [219]:
extrct_prompt_tmplt = ChatPromptTemplate.from_messages([
    ("system", extrct_prompt.system),
    ("human", extrct_prompt.human)])

## Create and run the chain.

In [257]:
# Create a simple chain with your llm and prompt
extrct_chain = extrct_prompt_tmplt | llm

In [None]:
# Reading text from a file.
# The source text mixes English and Arabic, so decode it explicitly as UTF-8
# rather than relying on the platform's default encoding (which is not UTF-8
# everywhere). Assumes the uploaded file is UTF-8 encoded.
with open('/content/01 - u1_Past Simple.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Run the extraction chain on the whole file text. The chain is only
# `prompt | llm` (no output parser), so the result is the raw model message;
# its text is available as `.content`.
extrct_output = extrct_chain.invoke({"context": text})   # `text` is the file content read above

In [208]:
print(extrct_output.content)

**Past Simple: Form**

*   **Affirmative:**
    *   Structure: Subject + Past Simple form of the verb (التصريف الثانى للفعل)
    *   Example 1: Dalia travelled to Aswan a week ago.
    *   Example 2: Yahya Haqqi wrote many interesting short stories.
*   **Interrogative:**
    *   Structure: Did + subject + infinitive?
    *   Structure: Wh-word + did + subject + infinitive?
    *   Example 1: Did Maher invite you to the party last Friday?
    *   Example 2: What did you study last night?
*   **Negative:**
    *   Structure: Subject + didn’t + infinitive
    *   Example: My son didn’t use mobile phones 10 years ago.
*   **Passive:**
    *   Structure: Object + was/were + Past Participle (P.P.)
    *   Example: The pyramids were built by the ancient Egyptians.

**Past Simple: Usage (االستخدام)**

*   **Completed action in the past with a time indicator:**
    *   Example: It rained heavily yesterday.
*   **Completed action in the past without a time indicator (obvious past):**
    *   Ex

# **Combine & Reduce**

## Combine

In [209]:
# Prompt for the combine stage: merge several per-chunk concept maps into one
# de-duplicated list. Adjacent string literals are concatenated by Python, so
# every fragment must end with its own punctuation/newline; the original
# second task item lacked one and fused into "commentaryRespond with ...".
# "{context}" stays a literal placeholder filled in at invoke time.
combine_prompt = StagePrompt(
    system="Instructions:\n"
    "You are combining multiple concept maps into a single, comprehensive summary while retaining all key ideas and details. Below are several lists of main ideas and concepts extracted from a larger document.\n"
    "Your task is to:\n\n"
    "1. Merge these lists into a single structured list, removing redundancies while keeping all unique and detailed information.\n"
    "2. Ensure all main ideas, relationships, and examples are preserved and clearly organized. Never add any commentary.\n"
    "Respond with the consolidated and organized list of main ideas and concepts.",

    human="Here are the concept maps to combine:\n"
    "Context:\n"
    "{context}"
)

In [210]:
combine_prmpt_tmplt = ChatPromptTemplate.from_messages(
    [
        ("system", combine_prompt.system),
        ("human", combine_prompt.human)
    ]
)

In [258]:
combine_chain = combine_prmpt_tmplt | llm

In [212]:
# Pass only the message text. Formatting the whole message object into the
# "{context}" placeholder would interpolate its string form, which includes
# metadata fields in addition to the extracted concept map.
combine_output = combine_chain.invoke({"context": extrct_output.content})

In [213]:
print(combine_output.content)

**Past Simple**

*   **Form**
    *   **Affirmative:**
        *   Structure: Subject + Past Simple form of the verb (التصريف الثانى للفعل)
        *   Example 1: Dalia travelled to Aswan a week ago.
        *   Example 2: Yahya Haqqi wrote many interesting short stories.
    *   **Interrogative:**
        *   Structure: Did + subject + infinitive?
        *   Structure: Wh-word + did + subject + infinitive?
        *   Example 1: Did Maher invite you to the party last Friday?
        *   Example 2: What did you study last night?
    *   **Negative:**
        *   Structure: Subject + didn’t + infinitive
        *   Example: My son didn’t use mobile phones 10 years ago.
    *   **Passive:**
        *   Structure: Object + was/were + Past Participle (P.P.)
        *   Example: The pyramids were built by the ancient Egyptians.

*   **Usage (االستخدام)**
    *   **Completed action in the past with a time indicator:**
        *   Example: It rained heavily yesterday.
    *   **Completed act

## Reduce

In [None]:
reduce_parser = PydanticOutputParser(pydantic_object=MainIdeas)

In [177]:
# Prompt for the reduce stage: condense the combined concept map into a short
# list of key concepts, answered in the JSON format described by
# `reduce_parser`. The parser's format instructions contain literal braces
# (a JSON schema), so they are doubled to survive the prompt template's
# "{...}" placeholder substitution. The first instruction previously read
# "reducing sets of detailed concept maps, a concise ... list", which
# described the input as already being the concise list; "into" restores the
# intended meaning.
reduce_prompt = StagePrompt(
    system="Instructions:\n"
    "You are reducing sets of detailed concept maps into a concise yet comprehensive list of important concepts, generated by extracting concepts from a document and potentially combining subsets of them that are relevant to each other.\n"
    "The goal is to create a structured resource that fully captures the essence of the material for testing and teaching purposes.\n"
    "Your task is to:\n\n"
    "- Identify the most critical concepts from the detailed concept map.\n"
    "- Provide a full-sentence summary for each concept that explains its significance, its relationship to other concepts, and any relevant examples or applications.\n"
    "- Ensure that the summaries are clear, self-contained, and detailed enough to aid in understanding without requiring additional context.\n"
    "- If necessary, combine related concepts into a single summary. Some of the concept maps have broader headings that can be used to guide this process.\n"
    "Answer in the following format:\n"
    + reduce_parser.get_format_instructions().replace('{', '{{').replace('}', '}}'),

    human="Here is the detailed concept map:\n"
    "Context:\n"
    "{context}\n"
)

In [178]:
reduce_prmpt_tmplt = ChatPromptTemplate.from_messages(
    [
        ("system", reduce_prompt.system),
        ("human", reduce_prompt.human)
    ]
)

In [259]:
reduce_chain = reduce_prmpt_tmplt | llm | reduce_parser

In [180]:
# Feed the combined concept map's text, not the message object itself, so the
# prompt does not also receive the message's metadata in string form.
reduce_output = reduce_chain.invoke({"context": combine_output.content})

In [181]:
reduce_output

MainIdeas(MainIdeas=[MainIdea(name='Past Simple Form', summary="The Past Simple tense is constructed in several forms: for affirmative sentences, it uses the subject followed by the past participle (V2), as in 'Dalia travelled to Aswan a week ago'; for interrogative sentences, it uses 'Did' + subject + infinitive or 'Wh-word' + 'did' + subject + infinitive, such as 'Did Maher invite you to the party last Friday?'; for negative sentences, it follows the structure subject + 'didn’t' + infinitive, like 'My son didn’t use mobile phones 10 years ago'; and in the passive voice, it uses object + 'was/were' + past participle, as demonstrated by 'The pyramids were built by the ancient Egyptians'."), MainIdea(name='Core Usage of Past Simple', summary="The Past Simple tense is primarily used to describe actions that were completed in the past, either with a clear time indicator (e.g., 'It rained heavily yesterday') or when the past context is evident without one (e.g., 'Graham Bell invented the t

In [183]:
# Show every reduced idea under its zero-based position.
for position, main_idea in enumerate(reduce_output.MainIdeas):
    print("Idea", position)
    print(main_idea)

Idea 0
name='Past Simple Form' summary="The Past Simple tense is constructed in several forms: for affirmative sentences, it uses the subject followed by the past participle (V2), as in 'Dalia travelled to Aswan a week ago'; for interrogative sentences, it uses 'Did' + subject + infinitive or 'Wh-word' + 'did' + subject + infinitive, such as 'Did Maher invite you to the party last Friday?'; for negative sentences, it follows the structure subject + 'didn’t' + infinitive, like 'My son didn’t use mobile phones 10 years ago'; and in the passive voice, it uses object + 'was/were' + past participle, as demonstrated by 'The pyramids were built by the ancient Egyptians'."
Idea 1
name='Core Usage of Past Simple' summary="The Past Simple tense is primarily used to describe actions that were completed in the past, either with a clear time indicator (e.g., 'It rained heavily yesterday') or when the past context is evident without one (e.g., 'Graham Bell invented the telephone'). It is also employ

# **Ranking**

In [184]:
class Ranking(BaseModel):
    # Schema for the ranking stage's answer: one integer per main idea.
    # Documented with comments rather than a docstring so the JSON schema
    # embedded in the prompt's format instructions stays unchanged.
    ranking: List[int] = Field(
        ...,
        description="Rank corresponding to each idea",
    )

In [185]:
rank_parser = PydanticOutputParser(pydantic_object=Ranking)

In [186]:
print(rank_parser.get_format_instructions().replace('{', '{{').replace('}', '}}'))

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {{"properties": {{"foo": {{"title": "Foo", "description": "a list of strings", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}}
the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of the schema. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted.

Here is the output schema:
```
{{"properties": {{"ranking": {{"description": "Rank corresponding to each idea", "items": {{"type": "integer"}}, "title": "Ranking", "type": "array"}}}}, "required": ["ranking"]}}
```


In [187]:
# Prompt for the ranking stage. The model must return one rank per main idea,
# positionally aligned with the input, because `reranked` places idea i at
# position ranking[i] - 1 and requires both lists to have the same length.
# Fixes relative to the earlier wording:
#   * "fromat" typo, and a missing ":\n" that fused the sentence with the
#     parser's format instructions;
#   * a bullet copied from the reduce prompt told the model to merge related
#     concepts, which contradicts the one-rank-per-idea contract.
# The format instructions contain literal JSON braces, so they are doubled to
# survive the prompt template's "{...}" substitution.
rank_prompt = StagePrompt(
    system="Instructions:\n"
    "Given the following groups of main ideas extracted from a text, rank them in order of importance, with the most important main idea receiving a rank of 1 and lower ranks for less important ideas.\n"
    "Focus on the most important aspects of the text and the main ideas that are critical to understanding the material.\n"
    "While sometimes important, background information or less critical ideas should be ranked lower.\n"
    "When ranking:\n\n"
    "- Assign a unique number to each main idea, starting from 1.\n"
    "- Ensure that the most important main idea is ranked first.\n"
    "- Rank the main ideas based on their relevance and significance.\n"
    "- Return exactly one rank per main idea, in the same order as the ideas appear in the input. Do not merge, drop, or add ideas.\n\n"
    "Example:\n"
    "Input: [Main Idea 1, Main Idea 2, Main Idea 3]\n"
    "Output: [2, 1, 3]\n\n"
    "Respond in the following format:\n"
    + rank_parser.get_format_instructions().replace('{', '{{').replace('}', '}}'),

    human="Main Ideas:\n"
    "{context}",
)

In [188]:
rank_prmpt_tmplt = ChatPromptTemplate.from_messages(
    [
        ("system", rank_prompt.system),
        ("human", rank_prompt.human)
    ]
)

In [260]:
rank_chain = rank_prmpt_tmplt | llm | rank_parser

In [None]:
rank_output = rank_chain.invoke({"context": reduce_output})
rank_output

Ranking(ranking=[1, 3, 2, 4, 5, 6, 7, 8, 9])

In [269]:
def reranked(ideas, ranking):
    """Return *ideas* reordered so that the idea ranked 1 comes first.

    Args:
        ideas: Sequence of ideas, in their original order.
        ranking: Sequence of 1-based integer ranks; ``ranking[i]`` is the
            rank assigned to ``ideas[i]``.

    Returns:
        A new list in which the idea with rank ``r`` sits at index ``r - 1``.

    Raises:
        AssertionError: If the two sequences differ in length.
        ValueError: If ``ranking`` is not a permutation of
            ``1..len(ideas)`` (duplicate, missing, zero, negative or
            out-of-range ranks).
    """
    assert len(ideas) == len(ranking), "ideas and ranking lists are not of the same length"

    # The ranking typically comes from a model, so validate it. Without this
    # check a duplicate rank silently overwrites an idea and leaves a None
    # hole, and a rank of 0 or below writes through a negative index.
    expected = list(range(1, len(ideas) + 1))
    if sorted(ranking) != expected:
        raise ValueError(
            f"ranking must be a permutation of 1..{len(ideas)}, got {list(ranking)}"
        )

    ranked_ideas = [None] * len(ranking)
    for idea, rank in zip(ideas, ranking):
        ranked_ideas[rank - 1] = idea

    return ranked_ideas

# **Running Example**

In [221]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [272]:
# Decode explicitly as UTF-8: the document contains Arabic text and the
# platform default encoding is not UTF-8 everywhere. Assumes the file is
# UTF-8 encoded.
# NOTE(review): this path is relative ('content/...') whereas the earlier
# read cell uses the absolute '/content/...'; confirm which one matches the
# working directory this cell is run from.
with open('content/01 - u1_Past Simple.txt', encoding='utf-8') as f:
    doc = f.read()

# Split into ~500-character chunks with a 40-character overlap between
# neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
chunks = splitter.split_text(doc)
print(f"Extracted {len(chunks)} chunks")
print(chunks)

Extracted 7 chunks
['Past Simple  الماضى البسيط1\nForm\nSubj. + التصريف الثانى للفعل\n •Dalia travelled to Aswan a week ago.\n• Yahya Haqqi wrote many interesting short stories.\nInterrogative\nDid + subj. + inf.?/Wh-word + did + subj. + inf.?\n• Did Maher invite you to the party last Friday?\n• What did you study last night?\nNegative\nSubj. + didn’t + inf.\n• My son didn’t use mobile phones 10 years ago. \nPassive \nObj. + was/were + P.P.\n• The pyramids were built by the ancient Egyptians.', '.حدث تم وانتهى فى الماضى ويوجد ما يدل عليه من كلمات It rained heavily yesterday.\n1\n .)حدث تم وانتهى فى الماضى وال يوجد ما يدل عليه من كلمات (الماضى البديهى\n Graham Bell invented the telephone.\n2\n.لتتابع األحداث فى الماضى\n Jehan wiped the table after dinner, took a shower and went to bed.\n.)عادات الماضى (وقبل أن تكون عادات فى الماضى كانت عادات فى المضارع\n People in the past travelled on camels.\n When I was on holiday, I went to the beach every day.\n3', '3\n . لعادات الماضى وليس لحدث من

In [None]:
# Switch to a lighter model for the multi-request running example.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-lite", # up to 30 requests per minute allowed
    temperature=0,
    max_tokens=None,
    timeout=None,
)

# Rebinding `llm` does not change chains that were already composed with the
# previous model object, so rebuild every stage chain against the new model.
# Without this, running the notebook top to bottom would keep using the
# earlier model for the cells below.
extrct_chain = extrct_prompt_tmplt | llm
combine_chain = combine_prmpt_tmplt | llm
reduce_chain = reduce_prmpt_tmplt | llm | reduce_parser
rank_chain = rank_prmpt_tmplt | llm | rank_parser

In [273]:
import asyncio

async def process_chunk(chunk):
    """Run the extraction chain on one text chunk and return the reply text.

    Args:
        chunk: Text that fills the prompt's ``{context}`` placeholder.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    response = await extrct_chain.ainvoke(input={"context": chunk})
    return response.content

# Process chunks in parallel
async def process_all_chunks(chunks):
    """Extract ideas from every chunk concurrently and return them as one text.

    Args:
        chunks: Iterable of text chunks to send through ``process_chunk``.

    Returns:
        The per-chunk results joined in input order (``asyncio.gather``
        preserves the order of the awaitables it is given), separated by a
        blank line. An empty input yields an empty string.
    """
    tasks = [process_chunk(chunk) for chunk in chunks]
    results = await asyncio.gather(*tasks)
    # Separate the per-chunk concept maps with a blank line. Joining with ""
    # glued the last line of one map onto the first heading of the next,
    # blurring the boundaries the combine stage is asked to merge across.
    return "\n\n".join(results)

# Run the async function
collected_ideas = await process_all_chunks(chunks)

In [274]:
# Combine -> reduce -> rank over the ideas collected from all chunks.
combined = combine_chain.invoke(input={'context': collected_ideas})
# `combined` is the raw model message (combine_chain has no output parser);
# pass its text only, so the reduce prompt does not also receive the
# message's metadata in string form.
reduced = reduce_chain.invoke(input={'context': combined.content})
# `reduced` is the parsed MainIdeas object; it is formatted into the ranking
# prompt via its string form.
ranked = rank_chain.invoke(input={'context': reduced})

In [275]:
reranked(ideas= reduced.MainIdeas, ranking=ranked.ranking)

[MainIdea(name='Past Simple Tense', summary='The Past Simple tense is used to describe completed actions in the past. It is formed with the subject and the second form of the verb, such as "Dalia travelled to Aswan." Questions are formed using "Did + subject + infinitive," like "Did Maher invite you?" Negatives use "didn’t + infinitive," as in "My son didn’t use mobile phones." The passive voice uses "was/were + Past Participle," for example, "The pyramids were built." It is used to provide details of events with the present perfect tense, for storytelling, with specific time expressions (e.g., yesterday, last week, ago), and with frequency adverbs. It can also be used if the action took a specific duration in the past. Examples of time expressions include "yesterday," "last week," "a week ago," and "in 2002."'),
 MainIdea(name='Past Tense Events', summary='Past tense events describe actions that occurred and finished in the past. These events can be explicitly marked with time indicat