In [17]:
import getpass
import json
import os
import random
from pprint import pprint
from typing import List

import requests
from bs4 import BeautifulSoup
from langchain.prompts import PromptTemplate
from langchain.tools import Tool
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

In [14]:
# Credentials are taken from the process environment rather than written into
# the notebook. Any variable that is not already set is requested with a hidden
# prompt, so the secret never ends up in the saved file or its outputs.
for _credential in ("GOOGLE_CSE_ID", "GOOGLE_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_credential):
        os.environ[_credential] = getpass.getpass(f"{_credential}: ")

In [3]:
# Country code substituted into the `geo` query parameter of the trends feed URL.
TREND_LOC = "FR"

## Get recent search trends

In [4]:
# Daily trending-searches RSS feed for the configured country.
url = f"https://trends.google.fr/trends/trendingsearches/daily/rss?geo={TREND_LOC}"

# A timeout keeps the cell from hanging on a stalled connection
# (requests waits indefinitely when none is given).
trend_response = requests.get(url=url, timeout=10)
# Fail here on an HTTP error instead of parsing an error page as if it were the feed.
trend_response.raise_for_status()

In [5]:
# Parse the response body with the XML parser so the feed's <item>/<title>
# elements are handled as XML rather than HTML.
soup = BeautifulSoup(trend_response.text, "xml")

# `find_all` is the supported spelling; `findAll` is a deprecated alias.
items = soup.find_all("item")

# One theme per feed entry, taken from the entry's title text.
themes = [item.title.text for item in items]
if not themes:
    # Without this check the failure only shows up later as an opaque
    # IndexError when a random theme is picked from an empty list.
    raise ValueError("no trending themes found in the feed response")

for theme in themes:
    print(theme)


Ligue 2
Top 14
Sabalenka
Roi Charles III
Taylor Swift
Unrwa
Budapest
Tottenham
CIJ
Luka Doncic
Handball
Ligue 1
Djokovic
Zverev
Jurgen Klopp
Loi Egalim
Laurent Fabius
Roy Orbison
Sylvain Tesson poetes
Philippe Caverivière RTL


## Searching for information on the selected topic on Wikipedia

In [6]:
# Pick one of the trending themes at random; no seed is set in the visible
# cells, so the chosen topic may differ from run to run.
topic = random.choice(themes)
print(f"search info for {topic}")

# Restrict the Google query to Wikipedia pages with the `site:` operator.
google_request = f"{topic} site:wikipedia.org"

search info for Unrwa


In [7]:
# Google search client built with constructor defaults.
# NOTE(review): presumably reads the GOOGLE_* environment variables set in an
# earlier cell - confirm against the wrapper's documentation.
search = GoogleSearchAPIWrapper()

def top3_results(query, num_results=3):
    """Return the first Google results for ``query``.

    Args:
        query: Search string forwarded to the module-level ``search`` client.
        num_results: Number of results to request. Defaults to 3, the value
            that was previously hard-coded, so existing single-argument
            callers are unaffected.

    Returns:
        Whatever ``search.results`` returns for the query; the notebook
        output shows a list of dicts with ``title``, ``link`` and ``snippet``.
    """
    return search.results(query, num_results)

# Wrap the search helper in a LangChain Tool so it can be invoked with `tool.run`.
tool = Tool(
    name="Google Search",
    description="Search Google for recent results.",
    func=top3_results,  # called with the query string
)

In [8]:
# Run the Wikipedia-restricted query through the search tool.
sources = tool.run(google_request)
# Bare expression so the notebook displays the raw search results.
sources

[{'title': 'UNRWA - Wikipedia',
  'link': 'https://en.wikipedia.org/wiki/UNRWA',
  'snippet': 'UNRWA · The United Nations Relief and Works Agency for Palestine Refugees in the Near East (UNRWA, pronounced /ˈʌnrə/ UN-rə) · UNRWA was established in 1949 by\xa0...'},
 {'title': 'Al-Maghazi UNRWA school airstrike - Wikipedia',
  'link': 'https://en.wikipedia.org/wiki/Al-Maghazi_UNRWA_school_airstrike',
  'snippet': 'Al-Maghazi UNRWA school airstrike ... On 17 October 2023, an airstrike conducted by Israel Defense Forces struck a United Nations Relief and Works Agency for\xa0...'},
 {'title': 'Philippe Lazzarini - Wikipedia',
  'link': 'https://en.wikipedia.org/wiki/Philippe_Lazzarini',
  'snippet': 'Philippe Lazzarini (born 1964) is a national of Switzerland and Italy who has been serving as Commissioner-General of the United Nations Relief and Works\xa0...'}]

In [9]:
def get_context(sources, max_chars=2000, timeout=10):
    """Download each source page and return the start of its paragraph text.

    Args:
        sources: Iterable of search-result dicts. Each entry is expected to
            carry the page URL under ``"link"``; entries without a usable
            link (e.g. a placeholder entry for an empty search - TODO confirm
            against the search wrapper) are skipped instead of raising
            ``KeyError``.
        max_chars: Maximum number of characters kept per page. Defaults to
            2000, the value that was previously hard-coded.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of strings, one per page that was fetched successfully: the
        space-joined text of the page's ``<p>`` elements, truncated to
        ``max_chars`` characters.
    """
    texts = []
    for source in sources:
        link = source.get("link")
        if not link:
            # Not a downloadable search hit: nothing to fetch.
            continue

        try:
            page = requests.get(link, timeout=timeout)
            page.raise_for_status()
        except requests.RequestException as error:
            # Best effort: one unreachable page must not discard the others,
            # and an HTTP error page must not be used as context.
            print(f"skipping {link}: {error}")
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        text = " ".join(paragraph.text for paragraph in soup.find_all("p"))
        texts.append(text[:max_chars])
    return texts

# Truncated page texts for the search results; joined into the LLM context later.
texts = get_context(sources)

## Generating riddles from the retrieved context with an LLM

In [18]:
# Schema for a single generated riddle. The Field descriptions are part of the
# schema handed to the output parser, whose format instructions are inserted
# into the prompt.
class Question(BaseModel):
    # Difficulty rating requested from the model (1-10 per the description).
    level: int = Field(description="difficulty of the question between 1 and 10, 10 is the most difficult")
    # The riddle text itself.
    setup: str = Field(description="riddle related to the context")
    # Two progressive hints for solving the riddle.
    hint1: str = Field(description="a first hint to resolve the riddle")
    hint2: str = Field(description="a second hint to resolve the riddle")
    # Expected solution.
    answer: str = Field(description="answer to the riddle")

# Top-level object requested from the model: a list of Question entries.
class Questions(BaseModel):
    questions: List[Question]

# Prompt text: the retrieved context is wrapped in <context> tags, followed by
# the parser's format instructions and the riddle request for the topic.
template = """Generate your answer based only on the following context (delimited by <context> ... </context> tags):
<context>
{context}
</context>
{format_instructions}
Question: Generate 3 riddle about {topic}
"""
# JSON output parser configured with the Questions schema.
parser = JsonOutputParser(pydantic_object=Questions)

# `context` and `topic` are supplied at invoke time; the format instructions
# are fixed up front as a partial variable.
prompt = PromptTemplate(
    template=template,
    input_variables=["context","topic"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Chat model created with library defaults (no model name or temperature given).
model = ChatOpenAI()
# Pipeline: fill the prompt -> call the model -> parse the reply with `parser`.
chain = prompt | model | parser

# Merge the per-page extracts into a single context string for the prompt.
context = "\n".join(texts)
llm_output = chain.invoke({"context": context, "topic": topic})

# pprint shows the topic in its quoted repr form.
pprint(topic)
# ensure_ascii=False keeps accented characters (frequent in French trends)
# readable instead of emitting \uXXXX escape sequences.
output = json.dumps(llm_output, indent=4, ensure_ascii=False)
print(output)

'Unrwa'
{
    "questions": [
        {
            "level": 5,
            "setup": "I am an organization that supports the relief and development of Palestinian refugees. What am I?",
            "hint1": "I am a UN agency.",
            "hint2": "My mandate includes providing education, health care, and social services to Palestinian refugees.",
            "answer": "UNRWA"
        },
        {
            "level": 7,
            "setup": "I was established in 1949 by the UN General Assembly. I provide relief to refugees from the 1948 conflict. What am I?",
            "hint1": "I operate in Jordan, Lebanon, Syria, the Gaza Strip, and the West Bank.",
            "hint2": "I am the only UN agency dedicated to helping refugees from a specific region or conflict.",
            "answer": "UNRWA"
        },
        {
            "level": 3,
            "setup": "I am a person who has been serving as the Commissioner-General of UNRWA since 2020. Who am I?",
            "hint1": "I am a n