In [1]:
import os
import time
from datetime import datetime
from typing import Optional
from urllib.parse import quote

import markdownify
import pandas as pd
import requests
from bs4 import BeautifulSoup
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp import LogitsProcessorList
from lmformatenforcer import CharacterLevelParser
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor
from pydantic import BaseModel, Field
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [None]:
class Conference(BaseModel):
    # One conference record as extracted by the LLM.
    #
    # The Field descriptions end up in Conferences.model_json_schema(), which is
    # pasted into the extraction prompt, so they double as instructions to the
    # model and must describe the field accurately.
    # Documented with comments rather than a class docstring because pydantic
    # copies a model's docstring into the generated schema, which would alter
    # the prompt text.
    name: str = Field(description="Name of the conference")
    # Required-but-nullable: there is no default, so the key must be present.
    location: Optional[str] = Field(description="Location of the conference")
    # Kept as a string; downstream parsing expects DD/MM/YYYY.
    date: Optional[str] = Field(description="Date of the conference in the following format DD/MM/YYYY")

class Conferences(BaseModel):
    # Top-level container the LLM must emit: an object holding a list of
    # Conference entries. Documented with comments (not a docstring) so the
    # generated JSON schema stays exactly as before.
    conferences: list[Conference] = Field(
        description="List of Conferences",
    )

class Queries(BaseModel):
    # Container for the search queries generated by the LLM. Documented with
    # comments (not a docstring) so the generated JSON schema stays exactly as
    # before.
    queries: list[str] = Field(
        description="Google Search Query for Topic",
    )

# Browser-like HTTP headers sent with every requests.get call in this notebook.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) "
        "Gecko/20100101 Firefox/132.0"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def get_top_urls_for_google_search_query(query: str, number: int = 5, timeout: float = 10.0) -> list[dict]:
    """Scrape the first Google result links for a search query.

    Args:
        query: Free-text search query. It is passed to ``requests`` as a query
            parameter, so characters such as ``&``, ``#`` or ``+`` are encoded
            instead of corrupting the URL.
        number: Maximum number of results to return. Values <= 0 yield ``[]``.
        timeout: Seconds to wait for Google before ``requests`` raises.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per anchor inside
        the ``div#rso`` results container that wraps an ``<h3>`` title. Empty
        when the container is missing from the response.
    """
    if number <= 0:
        return []

    response = requests.get(
        "https://www.google.com/search",
        params={"q": query},
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, "html.parser")

    results_div = soup.find("div", id="rso")  # list of urls inside this elem
    if not results_div:
        print("FAILED BECAUSE GOOGLE SEARCH OUTPUT CHANGED!")
        return []

    output = []
    for a_tag in results_div.find_all("a", href=True):
        title_tag = a_tag.find("h3")
        if not title_tag:
            # Anchors without a heading are not treated as result titles.
            continue
        output.append({"title": title_tag.get_text(), "url": a_tag["href"]})
        if len(output) >= number:
            break

    return output

def scrape_and_convert_to_markdown(urls: list[str], timeout: float = 10.0) -> list[str]:
    """Download each URL and convert its HTML to Markdown.

    Fetching is best-effort: a URL that cannot be downloaded, answers with an
    HTTP error status, or cannot be converted is reported on stdout and
    skipped, so one bad page no longer aborts the whole batch.

    Args:
        urls: URLs to fetch; any iterable of strings works.
        timeout: Seconds to wait for each server before giving up on that URL.

    Returns:
        Markdown text of every page that was processed successfully, in
        iteration order. Failed URLs leave no placeholder.
    """
    markdown_texts = []
    for url in urls:
        try:
            # The request lives inside the try so network failures reach the
            # "Error fetching" handler below.
            response = requests.get(url, headers=headers, timeout=timeout)
            # Skip 4xx/5xx responses rather than converting an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            markdown_texts.append(markdownify.markdownify(str(soup)))
        except Exception as e:
            print(f"Error fetching {url}: {e}")

    return markdown_texts
    
def llamacpp_with_character_level_parser(
    llm: Llama,
    prompt: str,
    character_level_parser: Optional[CharacterLevelParser] = None,
    max_tokens: int = 4096,
) -> str:
    """Run a completion, optionally constrained by a character-level parser.

    Args:
        llm: Loaded llama.cpp model; it is called directly with the prompt.
        prompt: Full prompt text.
        character_level_parser: When given, it is turned into a logits
            processor via ``build_llamacpp_logits_processor`` and passed to the
            model. When falsy, generation is unconstrained.
        max_tokens: Generation limit forwarded to the model call.

    Returns:
        The ``text`` of the first entry in the completion's ``choices``.
    """
    logits_processors: Optional[LogitsProcessorList] = None
    if character_level_parser:
        logits_processors = LogitsProcessorList(
            [build_llamacpp_logits_processor(llm, character_level_parser)]
        )

    output = llm(prompt, logits_processor=logits_processors, max_tokens=max_tokens)
    text: str = output["choices"][0]["text"]
    return text

def ask_llm_for_conferences(llm: Llama, markdown_texts: list[str]) -> list[Conference]:
    """Extract conferences from scraped pages with schema-constrained generation.

    Each text is sent to the model separately, and the output is validated
    against the ``Conferences`` schema. Results are merged by conference name;
    when the same name appears in several texts, the entry from the later text
    replaces the earlier one. A text whose generation or validation fails is
    reported on stdout and skipped.

    Args:
        llm: Loaded llama.cpp model.
        markdown_texts: Markdown renderings of the scraped pages.

    Returns:
        The merged conferences, one per distinct name.
    """
    # The schema does not depend on the text, so build it once for all prompts.
    schema = Conferences.model_json_schema()
    total = len(markdown_texts)

    conferences = dict()
    # Count from 1 so the progress line reads "1/N" ... "N/N".
    for idx, text in enumerate(markdown_texts, start=1):
        print(f"Working on {idx}/{total}:")
        prompt = f"""You are an AI assistant. Based on the scraped search result below, extract and provide the top academic AI conferences in the format of a JSON list.
                    Scraped Content:
                    {text}
                    You MUST answer using the following json schema: {schema}"""
        try:
            output = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(schema))
            output_conferences = Conferences.model_validate_json(output)
            for conference in output_conferences.conferences:
                conferences[conference.name] = conference
            print("Finished successful.")
        except Exception as e:
            # Generation can fail here as well as JSON validation, so the
            # message no longer blames parsing alone.
            print(f"Error extracting conferences from text {idx}: {e}")

    return list(conferences.values())

In [None]:
# Download the GGUF model file from the Hugging Face Hub, then load it with
# llama.cpp using the context size, thread count and GPU-layer setting below.
downloaded_model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q6_K_L.gguf",
)
llm = Llama(
    model_path=downloaded_model_path,
    n_ctx=16384,
    n_threads=8,
    n_gpu_layers=-1,
    verbose=False,
)

In [4]:
# Ask the model for search queries, constraining its output to the Queries
# schema so the response can be validated directly.
prompt = f"""Generate five different Google search queries to find information on the top academic AI conferences in 2025.
            These queries should be tailored to discover high-impact conferences covering fields like machine learning, natural language processing,
            computer vision, and other areas within AI.
            You MUST answer using the following json schema: {Queries.model_json_schema()}"""
queries = Queries.model_validate_json(
    llamacpp_with_character_level_parser(
        llm,
        prompt,
        JsonSchemaParser(Queries.model_json_schema()),
    )
).queries
queries

['top AI conferences 2025',
 'best AI conferences 2025',
 'leading AI conferences 2025',
 'top-tier AI conferences 2025',
 'notable AI conferences 2025']

In [None]:
# Gather result URLs for every query. dict.fromkeys drops duplicates while
# keeping first-seen order; a set would iterate in an order that can differ
# between kernel runs, which changes which duplicate conference entry wins.
top_urls = list(dict.fromkeys(
    result["url"]
    for query in queries
    for result in get_top_urls_for_google_search_query(query)
))
markdowns = scrape_and_convert_to_markdown(top_urls)
conferences: list[Conference] = ask_llm_for_conferences(llm, markdowns)
conferences

In [None]:
# Save the extracted conferences as indented JSON under ./conferences, in a
# file named after the model's 'general.name' metadata entry.
output_path = os.path.join(os.getcwd(), "conferences")
os.makedirs(output_path, exist_ok=True)
output_file = os.path.join(output_path, f"conferences_{llm.metadata['general.name']}.json")
with open(output_file, mode='w', encoding='utf-8') as json_file:
    json_file.write(Conferences(conferences=conferences).model_dump_json(indent=4))

'{"conferences":[{"name":"abc","location":"newyork","date":"11/12/2023"}]}'

## Sort Conferences

In [None]:
# Directory that the browser will save exported CSV files into.
download_path = os.path.join(os.getcwd(), "downloads")
os.makedirs(download_path, exist_ok=True)

# Headless Firefox set up to save CSV downloads into download_path without
# prompting.
options = webdriver.FirefoxOptions()
for pref_name, pref_value in (
    ("browser.download.folderList", 2),  # custom location
    ("browser.download.dir", download_path),
    ("browser.helperApps.neverAsk.saveToDisk", "text/csv"),
    ("browser.download.manager.showWhenStarting", False),
):
    options.set_preference(pref_name, pref_value)
options.add_argument("--headless")

driver = webdriver.Firefox(options=options)
driver.implicitly_wait(5)

def get_number_of_searches(query: str, max_attempts: int = 5, download_timeout: float = 5.0):
    """Sum the Google Trends values exported for ``query``.

    Opens the Trends explore page (``date=today 1-m``) with the module-level
    Selenium ``driver``, clicks the CSV export button and sums the second
    column of the ``multiTimeline.csv`` saved into ``download_path``. The
    file is deleted afterwards.

    Args:
        query: Search term. The whole term is percent-encoded so characters
            such as ``+`` or ``&`` survive in the URL.
        max_attempts: How many times to look for the export button,
            refreshing the page between attempts.
        download_timeout: Seconds to wait for the CSV file to appear.

    Returns:
        The total as a plain ``int``. ``0`` when the export button is not
        found, the download times out, or the CSV has no data column.
    """
    url = f"https://trends.google.com/trends/explore?date=today%201-m&q={quote(query, safe='')}"
    driver.get(url)

    download_button = None
    for _ in range(max_attempts):
        try:
            download_button = driver.find_element(By.XPATH, "//button[@class='widget-actions-item export' and @title='CSV']")
            break
        except NoSuchElementException:
            time.sleep(1)
            driver.refresh()

    if download_button is None:
        print("Download button not found after multiple attempts.")
        return 0

    csv_file_path = os.path.join(download_path, 'multiTimeline.csv')
    # A file left over from an earlier call (e.g. a download that arrived
    # after that call timed out) would otherwise be read as this query's data.
    if os.path.exists(csv_file_path):
        os.remove(csv_file_path)

    download_button.click()

    deadline = time.monotonic() + download_timeout
    while not os.path.exists(csv_file_path):
        if time.monotonic() > deadline:
            print("CSV file download timed out.")
            return 0
        time.sleep(1)

    try:
        df = pd.read_csv(csv_file_path, skiprows=2)  # first two rows are header and not csv
    except pd.errors.EmptyDataError:
        return 0
    finally:
        # Always clean up so the next query starts without a stale file.
        os.remove(csv_file_path)

    if df.shape[1] < 2:
        return 0

    # Non-numeric cells (assumption: Trends can export values like "<1") are
    # counted as 0 rather than breaking the sum.
    counts = pd.to_numeric(df.iloc[:, 1], errors="coerce").fillna(0)
    return int(counts.sum())

### Conferences by Searches

In [None]:
# Look up the search total for every conference, keyed by conference name.
conferences_popularity: dict[str, tuple[Conference, int]] = {
    conf.name: (conf, get_number_of_searches(conf.name)) for conf in conferences
}

# Rank from most to least searched and print one line per conference.
conferences_sorted_by_searches = list(conferences_popularity.values())
conferences_sorted_by_searches.sort(key=lambda entry: entry[1], reverse=True)
for conference, searches in conferences_sorted_by_searches:
    print(f"Conference: {conference.name}, Location: {conference.location}, Date: {conference.date}, Searches: {searches}")

Total searches for 'Data + AI Summit': 1038


### Conferences by Date

In [None]:
def parse_date(date_str):
    """Parse a ``DD/MM/YYYY`` string into a ``datetime``.

    Returns ``None`` when ``date_str`` is falsy (``None`` or ``""``) or does
    not match the expected format.
    """
    if not date_str:
        return None
    try:
        parsed = datetime.strptime(date_str, "%d/%m/%Y")
    except ValueError:
        return None
    return parsed

# Keep only conferences whose date string parses as DD/MM/YYYY.
valid_conferences = []
for candidate in conferences:
    if parse_date(candidate.date) is not None:
        valid_conferences.append(candidate)

# Earliest date first.
conferences_sorted_by_date = list(valid_conferences)
conferences_sorted_by_date.sort(key=lambda conf: parse_date(conf.date))
for conference in conferences_sorted_by_date:
    print(f"Conference: {conference.name}, Location: {conference.location}, Date: {conference.date}")

In [None]:
# Shut down the browser and release the model. The finally clause makes sure
# the model is closed even if quitting the browser raises.
try:
    driver.quit()
finally:
    llm.close()

1030