In [1]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field
from typing import Optional
from lmformatenforcer import JsonSchemaParser
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
from datetime import datetime
from helper import *

In [2]:
class Conference(BaseModel):
    name: str = Field(description="Name of the conference")
    location: Optional[str] = Field(description="Location of the conference")
    date: Optional[str] = Field(description="Date of the location in the following format DD/MM/YYYY")

class Conferences(BaseModel):
    conferences: list[Conference] = Field(description="List of Conferences")

def ask_llm_for_conferences(llm: Llama, markdown_texts: list[str]) -> list[Conference]:
    conferences = dict()
    for idx, text in enumerate(markdown_texts):
        print(f"Working on {idx + 1}/{len(markdown_texts)}:")
        prompt = f"""You are an AI assistant. Based on the scraped search result below, extract and provide the top academic AI conferences in 2025 in the format of a JSON list.
                    Scraped Content:
                    {text[:llm.n_ctx() - 1000]}
                    You MUST answer using the following json schema: {Conferences.model_json_schema()}"""
        try:
            output = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(Conferences.model_json_schema()))
            output_conferences = Conferences.model_validate_json(output.replace("\r\n", "\n"))
            for new_conference in output_conferences.conferences:
                if new_conference.name in conferences:
                    existing_conference = conferences[new_conference.name]
                    # Replace None values in existing_conference with values from new_conference
                    for field, value in new_conference.model_dump().items():
                        if getattr(existing_conference, field) is None and value is not None:
                            setattr(existing_conference, field, value)
                else:
                    conferences[new_conference.name] = new_conference
            print("Finished successful.")
        except Exception as e:
            print(f"Error parsing JSON: {e}")
    
    return list(conferences.values())

In [3]:
downloaded_model_path = hf_hub_download(repo_id="bartowski/Mistral-NeMo-Minitron-8B-Instruct-GGUF", filename="Mistral-NeMo-Minitron-8B-Instruct-Q4_K_L.gguf")
llm = Llama(model_path=downloaded_model_path, n_ctx=16384, n_gpu_layers=-1, verbose=False)

In [4]:
prompt = """Generate five different Google search queries to find information on the top academic AI conferences in 2025.
            These queries should be tailored to discover high-impact conferences covering fields like machine learning, natural language processing,
            computer vision, and other areas within AI."""
queries = llm_generate_search_queries(llm, prompt)

In [5]:
urls = list({url['url'] for query in queries for url in get_top_urls_for_google_search_query(query)})
markdowns = scrape_and_convert_to_markdown(urls[:10])
conferences : list[Conference] = ask_llm_for_conferences(llm, markdowns)
conferences

Working on 1/10:
Finished successful.
Working on 2/10:
Finished successful.
Working on 3/10:
Finished successful.
Working on 4/10:
Finished successful.
Working on 5/10:
Finished successful.
Working on 6/10:
Finished successful.
Working on 7/10:
Finished successful.
Working on 8/10:
Finished successful.
Working on 9/10:
Finished successful.
Working on 10/10:
Finished successful.


[Conference(name='Clean Architecture Masterclass', location='Online', date='2024-11-28'),
 Conference(name='Clean Code: The Next Level', location='Online', date='2024-12-05'),
 Conference(name='The Principal Dev – Masterclass for Tech Leads', location='Online', date='2024-12-15'),
 Conference(name='Modern Software Development with TDD', location='Online', date='2024-12-16'),
 Conference(name='CDAO Nordics', location='Stockholm', date='2024-11-13'),
 Conference(name='GDG Bucharest DevFest 2024', location='Bucharest', date='2024-11-13'),
 Conference(name='Commvault SHIFT Zurich 2024', location='Zurich', date='2024-11-13'),
 Conference(name='empowerHER+ Conference', location='Potsdam', date='2024-11-14'),
 Conference(name='AI [in Prod] London by Weavite', location='London', date='2024-11-18'),
 Conference(name='Machine Learning Week Europe', location='Munich', date='2024-11-18'),
 Conference(name='DSC Europe 24', location='Belgrade', date='2024-11-18'),
 Conference(name='Big Data Conferen

In [6]:
# Write conferences to file
output_path = os.path.join(os.getcwd(), "conferences")
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, f"conferences_{llm.metadata['general.name'].replace(' ', '_')}.json"), 'w', encoding='utf-8') as f:
    f.write(Conferences(conferences=conferences).model_dump_json(indent=4))

## Sort Conferences

In [None]:
download_path = os.path.join(os.getcwd(), "downloads")
os.makedirs(download_path, exist_ok=True)

options = webdriver.FirefoxOptions()
options.set_preference("browser.download.folderList", 2)  # custom location
options.set_preference("browser.download.dir", download_path)
options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
options.set_preference("browser.download.manager.showWhenStarting", False)
options.add_argument("--headless")

driver = webdriver.Firefox(options=options)
driver.implicitly_wait(5)

def get_number_of_searches(query: str):
    url = f"https://trends.google.com/trends/explore?date=today%201-m&q={query.replace(' ', '%20')}"
    driver.get(url)
    download_button = None
    for i in range(5):
        try:
            download_button = driver.find_element(By.XPATH, "//button[@class='widget-actions-item export' and @title='CSV']")
            break
        except NoSuchElementException:
            time.sleep(1)
            driver.refresh()
            
    if download_button:
        try:
            download_button.click()
        except:
            print("Button click failed!")
            return 0    
    else:
        print("Download button not found after multiple attempts.")
        return 0 

    csv_file_path = os.path.join(download_path, 'multiTimeline.csv')
    timeout = 5
    start_time = time.time()
    while not os.path.exists(csv_file_path):
        if time.time() - start_time > timeout:
            print("CSV file download timed out.")
            return 0 
        time.sleep(1)

    df = pd.read_csv(csv_file_path, skiprows=2) # first two rows are header and not csv
    total_searches = df.iloc[:, 1].sum()
    os.remove(csv_file_path)
    return total_searches

### Conferences by Searches

In [25]:
conferences_popularity :dict[str, tuple[Conference, int]]= dict()
for conference in conferences:
    total_searches = get_number_of_searches(conference.name)
    conferences_popularity[conference.name] = (conference, total_searches)
    
conferences_sorted_by_searches = sorted(conferences_popularity.values(), key=lambda x: x[1], reverse=True)
for conference, searches in conferences_sorted_by_searches:
    print(f"Conference: {conference.name}, Location: {conference.location}, Date: {conference.date}, Searches: {searches}")

<selenium.webdriver.remote.webelement.WebElement (session="6bf7e991-b567-4b7e-b0b7-30bae9386a56", element="ddab62ed-623b-43f2-8841-cc73bb9ffa2b")>
<selenium.webdriver.remote.webelement.WebElement (session="6bf7e991-b567-4b7e-b0b7-30bae9386a56", element="7da08d5f-5314-47d0-bc37-7eb10a3fa55d")>
<selenium.webdriver.remote.webelement.WebElement (session="6bf7e991-b567-4b7e-b0b7-30bae9386a56", element="1f3beee9-301a-427d-8b75-6087163e6e6c")>


ElementClickInterceptedException: Message: Element <button class="widget-actions-item export"> is not clickable at point (1127,671) because another element <a class="cookieBarButton cookieBarMoreButton" href="https://policies.google.com/technologies/cookies?hl=de"> obscures it
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementClickInterceptedError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:337:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:177:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:136:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:205:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:85:31


### Conferences by Date

In [None]:
# TODO got through all conferences and ask llm to convert all dates to one format

In [19]:
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%d/%m/%Y") if date_str else None
    except ValueError:
        return None

valid_conferences = [
    conference for conference in conferences if parse_date(conference.date) is not None
]
conferences_sorted_by_date = sorted(valid_conferences, key=lambda conf: parse_date(conf.date))
for conference in conferences_sorted_by_date:
    print(f"Conference: {conference.name}, Location: {conference.location}, Date: {conference.date}")

Conference: International Conference on Artificial Intelligence and Machine Learning, Location: null, Date: 12/05/2025
Conference: ICML 2025, Location: Sydney, Australia, Date: 07/11/2025


In [26]:
driver.quit()
llm.close()