In [38]:
import pandas as pd
from dotenv import load_dotenv
import geopy
import os
import numpy as np
from time import sleep
import asyncio
import aiohttp
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
from langchain_groq import ChatGroq
from pydantic import BaseModel
from typing import Optional

In [2]:
import logging

# Quiet the chatty client libraries: only WARNING and above get through.
for noisy_logger in ("openai", "httpx", "httpcore"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

In [3]:
# Pull credentials and the model identifier from the local .env file.
# Each value is None when the corresponding variable is not set.
load_dotenv(".env")
HF_TOKEN = os.environ.get("HF_TOKEN")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
GROQ_MODEL = os.environ.get("GROQ_MODEL")

In [4]:
# Number of items sent concurrently in one batch, per pipeline stage.
BATCH_SIZE_FETCH_ORGANIZATION_INFO = 50
BATCH_SIZE_FETCH_WIKI_URL = 100
BATCH_SIZE_FETCH_WIKI_PAGE_HTML = 20
# Fallback wait (seconds) after an HTTP 429 that carries no Retry-After header.
TIME_OUT = 60

In [5]:
async def fetch_batch_async(fetch_function, data: list) -> list[dict]:
    """Run ``fetch_function`` concurrently over every item of ``data``.

    A single ``aiohttp.ClientSession`` is opened for the whole batch and is
    passed as the first argument of each ``fetch_function(session, item)`` call.

    Args:
        fetch_function: Coroutine function taking ``(session, item)``.
        data: Items to process in this batch.

    Returns:
        The results of all calls, in the same order as ``data``.
    """
    async with aiohttp.ClientSession() as session:
        pending = (fetch_function(session, item) for item in data)
        return await asyncio.gather(*pending)

In [6]:
# Load the models table; its "model_name" column is used below to derive author names.
df = pd.read_csv("../../data/models_info.csv")

In [7]:
# The author is whatever precedes the first "/" in the model name.
author_names = df["model_name"].map(lambda model_id: model_id.partition("/")[0])

In [8]:
# pd.unique accepts both a Series and an ndarray, so this cell can be re-run
# safely (the method form fails once author_names is already an ndarray).
author_names = pd.unique(author_names)

In [9]:
async def fetch_organization_info(
    session, organization_name: str, max_retries: int = 3
) -> dict:
    """Fetch the Hugging Face overview of one organization.

    When the API answers 429 (Too Many Requests) the call waits for the number
    of seconds announced in the ``Retry-After`` header (``TIME_OUT`` when the
    header is missing or not an integer) and tries again, up to ``max_retries``
    times. Whatever the last response contains is returned as-is.

    Args:
        session: Open ``aiohttp`` client session used for the requests.
        organization_name: Name inserted into the organization overview URL.
        max_retries: Maximum number of retries after a 429 response.

    Returns:
        The decoded JSON body of the final response.
    """
    url = f"https://huggingface.co/api/organizations/{organization_name}/overview"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    last_attempt = max(max_retries, 0)

    for attempt in range(last_attempt + 1):
        async with session.get(url, headers=headers) as response:
            if response.status != 429 or attempt == last_attempt:
                return await response.json()
            try:
                retry_after = int(response.headers.get("Retry-After", TIME_OUT))
            except (TypeError, ValueError):
                # Retry-After may also be an HTTP date; fall back to the default wait.
                retry_after = TIME_OUT
        # Sleep outside the context manager so the throttled response is
        # released instead of being held open while waiting.
        await asyncio.sleep(max(retry_after, 0))

In [10]:
organization_info = []

# Calculate the number of complete batches
num_complete_batches = len(author_names) // BATCH_SIZE_FETCH_ORGANIZATION_INFO
# Check if there's a remainder that needs an additional batch
has_remainder = len(author_names) % BATCH_SIZE_FETCH_ORGANIZATION_INFO > 0
# Total number of batches needed
total_batches = num_complete_batches + (1 if has_remainder else 0)

for i in tqdm(range(total_batches)):
    start_idx = i * BATCH_SIZE_FETCH_ORGANIZATION_INFO
    end_idx = min((i + 1) * BATCH_SIZE_FETCH_ORGANIZATION_INFO, len(author_names))

    res = await fetch_batch_async(
        fetch_organization_info,
        author_names[start_idx:end_idx],
    )
    organization_info.extend(res)

100%|██████████| 137/137 [03:34<00:00,  1.56s/it]


In [11]:
# Keep only payloads that do not carry an "error" key.
organization_info = [org for org in organization_info if "error" not in org]

In [12]:
# From here on organization_info is a DataFrame built from the fetched payloads.
organization_info = pd.DataFrame(organization_info)

In [13]:
async def get_wikipedia_url(session, organization_name: str) -> str:
    """Look up the best-matching English Wikipedia page for an organization.

    The name is sent through the ``params`` argument so that it is URL-encoded;
    names containing characters such as ``&``, ``#`` or ``+`` would otherwise
    corrupt the query string.

    Args:
        session: Open ``aiohttp`` client session used for the request.
        organization_name: Free-text name used as the search term.

    Returns:
        A ``https://en.wikipedia.org/?curid=<pageid>`` URL for the first search
        hit, or an empty string when the search returns no results.
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": str(organization_name),
        "format": "json",
    }
    async with session.get(
        "https://en.wikipedia.org/w/api.php", params=params
    ) as search_response:
        search_data = await search_response.json()

    hits = search_data.get("query", {}).get("search", [])
    if hits:
        return f"https://en.wikipedia.org/?curid={hits[0]['pageid']}"
    return ""  # No results found

In [14]:
wiki_page_urls = []

# Calculate the number of complete batches
num_complete_batches = len(organization_info) // BATCH_SIZE_FETCH_WIKI_URL
# Check if there's a remainder that needs an additional batch
has_remainder = len(organization_info) % BATCH_SIZE_FETCH_WIKI_URL > 0
# Total number of batches needed
total_batches = num_complete_batches + (1 if has_remainder else 0)

for i in tqdm(range(total_batches)):
    start_idx = i * BATCH_SIZE_FETCH_WIKI_URL
    end_idx = min((i + 1) * BATCH_SIZE_FETCH_WIKI_URL, len(organization_info))

    res = await fetch_batch_async(
        get_wikipedia_url,
        organization_info["name"][start_idx:end_idx],
    )
    wiki_page_urls.extend(res)

100%|██████████| 18/18 [00:27<00:00,  1.53s/it]


In [15]:
# wiki_page_urls was built in row order, so it lines up with the frame positionally.
organization_info["wikipedia_url"] = wiki_page_urls

In [16]:
# Drop organizations without a Wikipedia hit. The explicit copy makes the
# result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
organization_info = organization_info[organization_info["wikipedia_url"] != ""].copy()

In [17]:
def extract_headquarters_from_wikipage(html):
    """Pull the headquarters/location text out of a Wikipedia infobox.

    Only the first ``table.infobox`` is inspected, and only the first row whose
    header cell contains "Headquarters" or "Location" is considered.

    Args:
        html: HTML source of the page.

    Returns:
        The text of that row's data cell with whitespace runs collapsed to
        single spaces, or ``None`` when there is no infobox, no matching row,
        or the matching row has no data cell.
    """
    infobox = BeautifulSoup(html, "html.parser").select_one("table.infobox")
    if not infobox:
        return None

    for row in infobox.find_all("tr"):
        header = row.find("th")
        if not header:
            continue
        if "Headquarters" not in header.text and "Location" not in header.text:
            continue

        # Only the first matching row counts, whether or not it yields a value.
        cell = row.find("td")
        if not cell:
            return None
        raw_text = cell.get_text() or "".join(cell.stripped_strings)
        if not raw_text:
            return raw_text
        return re.sub(r"\s+", " ", raw_text).strip()

    return None

In [18]:
async def get_wikipedia_page_html(session, url: str) -> str:
    """Download ``url`` with ``session`` and return the response body as text."""
    async with session.get(url) as page_response:
        page_html = await page_response.text()
    return page_html

In [19]:
wiki_page_htmls = []

# Calculate the number of complete batches
num_complete_batches = len(organization_info) // BATCH_SIZE_FETCH_WIKI_PAGE_HTML
# Check if there's a remainder that needs an additional batch
has_remainder = len(organization_info) % BATCH_SIZE_FETCH_WIKI_PAGE_HTML > 0

total_batches = num_complete_batches + (1 if has_remainder else 0)

for i in tqdm(range(total_batches)):
    start_idx = i * BATCH_SIZE_FETCH_WIKI_PAGE_HTML
    end_idx = min((i + 1) * BATCH_SIZE_FETCH_WIKI_PAGE_HTML, len(organization_info))

    res = await fetch_batch_async(
        get_wikipedia_page_html,
        organization_info["wikipedia_url"][start_idx:end_idx],
    )
    wiki_page_htmls.extend(res)

100%|██████████| 58/58 [01:33<00:00,  1.62s/it]


In [20]:
# One entry per downloaded page, in download order; None where nothing was found.
headquarters = [
    extract_headquarters_from_wikipage(page_html)
    for page_html in tqdm(wiki_page_htmls)
]

100%|██████████| 1142/1142 [01:15<00:00, 15.09it/s]


In [21]:
# headquarters follows the row order of the frame, so it is attached positionally.
organization_info["headquarters"] = headquarters

In [22]:
# Discard organizations for which no headquarters text was extracted.
organization_info = organization_info.dropna(subset=["headquarters"])

In [24]:
# Keep only organizations above all three size thresholds.
has_enough_users = organization_info["numUsers"] > 2
has_enough_models = organization_info["numModels"] > 1
has_enough_followers = organization_info["numFollowers"] > 200
organization_info = organization_info[
    has_enough_users & has_enough_models & has_enough_followers
]

In [25]:
columns = ["name", "numUsers", "numModels", "numFollowers", "headquarters"]
# Take an explicit copy: this frame gets new columns assigned and a column
# dropped in place further down, which would otherwise raise
# SettingWithCopyWarning on a slice of the previous frame.
organization_info = organization_info[columns].copy()

In [26]:
# Structured result requested from the LLM for one headquarters string.
# Documented with comments on purpose: the class is handed to
# with_structured_output, so its contents are left untouched.
class Location(BaseModel):
    # Required field.
    country: str
    # Optional field; defaults to None when not provided.
    city: Optional[str] = None

In [27]:
# Chat client configured from the GROQ_API_KEY / GROQ_MODEL environment values.
groq_client = ChatGroq(api_key=GROQ_API_KEY, model_name=GROQ_MODEL)

In [28]:
# Build the structured-output runnable from a fresh ChatGroq instance so the
# cell is idempotent: re-running it no longer calls with_structured_output on
# the already wrapped object.
groq_client = ChatGroq(
    api_key=GROQ_API_KEY, model_name=GROQ_MODEL
).with_structured_output(Location)

In [29]:
# Prompt template; the {address} placeholder is filled via str.format for each
# headquarters string.
prompt = """
Extract the country and city (if presented) from the following address:
{address}
"""

In [30]:
# One sequential LLM call per headquarters string, in row order.
locations = [
    groq_client.invoke(prompt.format(address=headquarters_text))
    for headquarters_text in tqdm(organization_info["headquarters"])
]

100%|██████████| 51/51 [02:23<00:00,  2.81s/it]


In [33]:
# Replace the free-text headquarters column with the extracted country/city.
organization_info = organization_info.assign(
    country=[location.country for location in locations],
    city=[location.city for location in locations],
).drop(columns=["headquarters"])

In [22]:
# NOTE(review): this replaces the frame computed above with the contents of the
# CSV that the last cell of this notebook writes. On a fresh run the file must
# already exist, and the freshly computed rows are discarded — confirm whether
# an intermediate save step is missing before this reload.
organization_info = pd.read_csv("../../data/organization_info.csv")

In [39]:
geocoder = geopy.geocoders.Nominatim(user_agent="DWV project")

# Geocode every organization: try the city first and fall back to the country.
# Rows are addressed by index label, so the loop does not require a RangeIndex.
for row_label in organization_info.index:
    candidates = (
        organization_info.loc[row_label, "city"],
        organization_info.loc[row_label, "country"],
    )
    # Missing values come back from the CSV as float NaN; an isinstance check is
    # reliable where an identity comparison against np.nan is not.
    queries = [
        candidate
        for candidate in candidates
        if isinstance(candidate, str) and candidate.strip()
    ]

    match = None
    for query in queries:
        sleep(1)  # pause one second before every geocoder request
        match = geocoder.geocode(query)
        if match is not None:  # geocode() returns None when nothing is found
            break

    if match is None:
        organization_info.loc[row_label, "latitude"] = np.nan
        organization_info.loc[row_label, "longitude"] = np.nan
    else:
        organization_info.loc[row_label, "latitude"] = match.latitude
        organization_info.loc[row_label, "longitude"] = match.longitude

In [41]:
# Persist the enriched table (row index omitted), overwriting the file read above.
organization_info.to_csv("../../data/organization_info.csv", index=False)