# AI-Events Extractor

This notebook tests the code that extracts AI-related events from different sources via web scraping. The first source tested is [Unite.AI](https://www.unite.ai/conferences/).

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from loguru import logger
import re
from datetime import datetime
import sys

sys.path.append("../.")

from data_ingestion_pipeline.config import DataSources

In [3]:
# Instantiate the project's DataSources config (imported from data_ingestion_pipeline.config);
# the fetch cell below reads its TOP_AI_CONFERENCES attribute as the URL to scrape.
data_sources = DataSources()

In [4]:
# Step 1: Fetch the page
url = data_sources.TOP_AI_CONFERENCES
# Browser-like User-Agent to avoid receiving incomplete page data from the website
headers = {"User-Agent": "Mozilla/5.0"}
# Without a timeout, requests waits indefinitely if the server stalls
REQUEST_TIMEOUT_SECONDS = 30

logger.info("Sending request to fetch web data")
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

if response.status_code != 200:
    raise ValueError(f"The request to {url} failed. Status code: {response.status_code}, {response.text}")

logger.info("Extracting AI events...")
soup = BeautifulSoup(response.content, "html.parser")


# Step 2: Locate the table body directly
tbody = soup.find("tbody", class_="row-striping")
if tbody is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError(f"No <tbody class='row-striping'> element was found in {url}")
rows = tbody.find_all("tr")

# Step 3: Extract the data (one dict per table row)
conferences = []
for row in rows:
    cols = row.find_all("td")
    # Rows with fewer than three cells do not carry dates/title/location; skip them
    if len(cols) < 3:
        continue

    dates = cols[0].get_text(strip=True)
    title_tag = cols[1].find("a")
    if title_tag is not None:
        # Prefer the anchor text and its target when the title is a link
        title = title_tag.get_text(strip=True)
        link = title_tag.get("href")
    else:
        title = cols[1].get_text(strip=True)
        link = None
    location = cols[2].get_text(strip=True)

    conferences.append({
        "Title": title,
        "Dates": dates,
        "Location": location,
        "Link": link
    })

if conferences:
    logger.info("Retrieval succeeded")
else:
    # An empty result is a problem worth surfacing above info level
    logger.warning("Retrieval failed. No data was fetched")

[32m2025-09-29 12:44:48.889[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSending request to fetch web data[0m
[32m2025-09-29 12:44:49.660[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mExtracting AI events...[0m
[32m2025-09-29 12:44:49.691[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1mRetrieval succeeded[0m


In [5]:
# One row per scraped conference; columns come from the dict keys built above
# (Title, Dates, Location, Link).
ai_events = pd.DataFrame(conferences)

In [6]:
# Preview the last rows of the scraped table as a quick sanity check
ai_events.tail()

Unnamed: 0,Title,Dates,Location,Link
50,European Chatbot & Conversational AI Summit 2026,"March 17 to 19, 2026","Edinburgh, Scotland",https://theeuropeanchatbot.com/
51,Big data and Data science Conference 2026,"April 13 to 14, 2026","Orlando, FL",https://datascience-machinelearning.averconfer...
52,MLcon San Diego,"June 1 to 5, 2026","San Diego, CA",https://mlconference.ai/san-diego/
53,4th Data Science & AI Summit,"June 16 to 17, 2025","London, UK",https://datascience.thepeopleevents.com/
54,2nd International Conference on Artificial Int...,"July 6 to 7, 2026","Singapore, SG",https://artificialintelligence.novelticsconfer...


In [7]:
def format_string_date(date_str: str) -> str:
    """
    Formats different types of date strings to the most common format 'YYYY-mm-dd'.

    Leading and trailing whitespace is ignored. Month names are matched with
    strptime's '%B' / '%b' directives, which follow the active locale.

    Args:
        date_str: str -> Date string with the format 'October 23, 2025', or 'Oct 23, 2025'

    Returns:
        str -> Date string in the format: '2025-10-23'

    Raises:
        ValueError -> If `date_str` is not a string or matches none of the supported formats
    """
    if not isinstance(date_str, str):
        raise ValueError("The date is not a string data type")

    # Scraped text may carry stray spaces/newlines, which strptime would reject
    cleaned = date_str.strip()

    # Try the full month name first, then the abbreviated one
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")

    raise ValueError(f"Unrecognized date format: '{date_str}'")
        




def get_initial_and_final_dates(raw_date: str) -> tuple[str, str]:
    """
    Extracts the initial and final event's date from formats such as:
            - 'October 23 to 27, 2025'
            - 'Oct 23 to 27, 2025'
            - 'Oct 12-20, 2024' (hyphen, en dash or em dash as separator)
            - 'October 23 to November 1, 2025'
            - 'December 30, 2025 to January 2, 2026' (each side with its own year)

    In case there's a single-day event, it needs to have the format 'October 23, 2025' or 'Oct 23, 2025'

    When the start has no year of its own it inherits the year found in the string;
    if that places the start after the end (a range that wraps New Year, e.g.
    'December 30 to January 2, 2026'), the start is moved to the previous year.

    Args:
        raw_date: str -> Date with the formats provided early

    Returns:
        tuple[str, str] -> Initial and final dates, both in the format '%Y-%m-%d'

    Raises:
        ValueError -> If the string matches none of the supported layouts or
                      one of its parts cannot be parsed as a date
    """
    year_pattern = r"\b\d{4}\b"
    text = raw_date.strip()

    if " to " in text:
        # The surrounding spaces matter: "October" itself contains "to"
        parts = text.split(" to ")
    elif re.search("[-\u2013\u2014]", text):
        # No "to" separator: ranges such as "Oct 12-20, 2024" (any dash flavour)
        parts = re.split("[-\u2013\u2014]", text)
    elif re.search(r"[A-Za-z]+ \d+, \d{4}", text):
        # Single-day event: start and end are the same date
        single_day = format_string_date(text)
        return single_day, single_day
    else:
        raise ValueError(f"Unknown date format: {raw_date}")

    start, end = parts[0].strip(), parts[1].strip()
    year_match = re.search(year_pattern, text)
    if not start or not end or year_match is None:
        raise ValueError(f"Unknown date format: {raw_date}")
    year = year_match.group(0)

    # Only append the year when the start part does not already carry one;
    # appending unconditionally would produce e.g. "December 30, 2025, 2025".
    start_year_inferred = re.search(year_pattern, start) is None
    initial_text = f"{start}, {year}" if start_year_inferred else start

    # If no month is defined in the final part, the month of the start is reused
    if not re.match(r"[A-Za-z]", end):
        end = f"{start.split()[0]} {end}"
    # If no year is defined in the final part, the year found in the string is reused
    if re.search(year_pattern, end) is None:
        end = f"{end}, {year}"

    # Format string dates (handles both full and abbreviated month names)
    initial_date = format_string_date(initial_text)
    final_date = format_string_date(end)

    # ISO dates compare correctly as strings. A start after the end, with an
    # inherited year, means the range wraps New Year.
    if start_year_inferred and initial_date > final_date:
        initial_date = format_string_date(f"{start}, {int(year) - 1}")

    return initial_date, final_date
    

In [8]:
# Smoke test: a range spanning two months, written with abbreviated month names
date_example = "Oct 17 to Nov 20, 2026"

dates = get_initial_and_final_dates(date_example)
# The `=` specifier echoes each expression next to its value in the output
print(f"{dates[0]=}, {dates[1]=}")
# Last expression of the cell: displays the type of the returned object
type(dates)

dates[0]='2026-10-17', dates[1]='2026-11-20'


tuple

In [9]:
# Parse every raw "Dates" string into an (initial, final) tuple, then expand the
# tuples into two columns in one step. Building a single DataFrame from the list
# avoids `.apply(pd.Series)`, which constructs a Series object per row.
date_ranges = pd.DataFrame(
    ai_events["Dates"].map(get_initial_and_final_dates).tolist(),
    index=ai_events.index,
    columns=["initial_date", "final_date"],
)
ai_events[["initial_date", "final_date"]] = date_ranges

In [10]:
# Display the frame without the raw "Dates" column.
# NOTE(review): the result is not assigned, so `ai_events` itself still holds "Dates"
# and the Excel export further down will include it — confirm this is intended.
ai_events.drop("Dates", axis = 1)

Unnamed: 0,Title,Location,Link,initial_date,final_date
0,GAI World 2025,"Boston, MA",https://www.gaiworld.com/,2025-09-29,2025-09-30
1,7th Annual Machine Learning in Quantitative Fi...,"New York, NY",https://bit.ly/4mzG8SL,2025-09-29,2025-10-01
2,"2nd Edition AI and ML Model Development, Risk ...","New York, NY",https://www.marcusevans.com/conferences/aifina...,2025-09-29,2025-10-01
3,"3rd Annual Development, Implementation and Man...","New York, NY",https://bit.ly/3SmSfVt,2025-09-29,2025-10-01
4,MLcon New York,"New York, NY",https://mlconference.ai/new-york/,2025-09-30,2025-10-01
5,SDV & AI in Automotive USA 2025,"Ann Arbor, Mi",https://www.automotive-iq.com/events-software-...,2025-09-30,2025-10-02
6,AI in Health summit,"London, UK",http://econimpact.co/UF,2025-10-01,2025-10-01
7,1682: The Business of Innovation Conference,"Philadelphia, PA",https://www.1682conference.com/,2025-10-08,2025-10-08
8,World Summit AI,"Amsterdam, NL",https://worldsummit.ai/,2025-10-08,2025-10-09
9,CDAO Fall,"Boston, MA",https://cdao-fall.coriniumintelligence.com/,2025-10-14,2025-10-15


The following code generates an Excel file whose data is registered as a named table, which Power Automate requires in order to identify the data correctly.

In [13]:
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.utils import get_column_letter

# Write the events to an .xlsx sheet and register the written range as a named Excel table
with pd.ExcelWriter('../data/ai_events2.xlsx', engine='openpyxl') as writer:
    ai_events.to_excel(writer, index=False, sheet_name='AI Events')
    worksheet = writer.sheets["AI Events"]
    max_row, max_col = ai_events.shape

    # Compute the table range in Excel notation (for example, "A1:D10");
    # the +1 accounts for the header row written above the data rows.
    table_ref = f"A1:{get_column_letter(max_col)}{max_row + 1}"

    table = Table(displayName="AIEventsTable", ref=table_ref)
    table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",
        showFirstColumn=False,
        showLastColumn=False,
        showRowStripes=True,
        showColumnStripes=False,
    )
    worksheet.add_table(table)