# Script to scrape GSOC 2025 organisations and their technologies
**Add (options.add_argument("--headless") if you'd rather run in headless mode)**

## Scraping the links to all organisations

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
import chromedriver_autoinstaller  
from fake_useragent import UserAgent

# List to store extracted href links
href_list = []

# Automatically install chromedriver and get the path
driver_path = chromedriver_autoinstaller.install()

# Setting up Chrome options for headless browsing
options = webdriver.ChromeOptions()
#options.add_argument("--headless")  # Run in headless mode 
ua = UserAgent()
options.add_argument(f"user-agent={ua.random}")  # Set a random user-agent
options.add_argument("--no-sandbox")  # Bypass OS security model
options.add_argument("--disable-dev-shm-usage")  # Avoid memory issues
options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
    
# Initialize the WebDriver
driver = webdriver.Chrome(driver_path, options=options)
wait = WebDriverWait(driver, 10)  # Set explicit wait time

# Open the target webpage
print("Opening Google Summer of Code page...")
driver.get("https://summerofcode.withgoogle.com/programs/2025/organizations")

time.sleep(3)  # Allow page to load

# Scroll down the page a little to trigger lazy loading if needed
for _ in range(1):  # Adjust the range if needed
    driver.execute_script("window.scrollBy(0, 500);")
    time.sleep(1)

try:

    # Click on the first button (modify selector if necessary)
    button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-button-toggle-6-button"]')))
    button.click()
        
    # Wait for elements to load and scroll for additional content
    time.sleep(2)
    for _ in range(10):  # Adjust range to ensure full loading
        driver.execute_script("window.scrollBy(0, 500);")
        time.sleep(1)
        
        # Click the dropdown to select a filter (modify selectors if necessary)
    bt2 = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-select-value-3"]')))
    bt2.click()
            
    bt3 = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-option-7"]')))
    bt3.click()
        
    time.sleep(1)  # Short wait before extracting data
        
    # Find all organization list elements
    orgs = driver.find_elements(By.TAG_NAME, "app-org-list")
        
    for org in orgs:
        try:
            # Extract the href attribute from each organization's link
            link_element = org.find_element(By.TAG_NAME, "a")
            href = link_element.get_attribute("href")
            href_list.append(href)
            print("Found:", href)
        except Exception:
            print("No link found in:", org.get_attribute("outerHTML"))
        
    print("\nTotal links found:", len(href_list))
        
    # Scroll further down for additional results
    for _ in range(10):  # Adjust if needed
        driver.execute_script("window.scrollBy(0, 500);")
        time.sleep(1)
        
    # Click the next page button to load more results
    bt4 = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/app-root/app-layout/mat-sidenav-container/mat-sidenav-content[1]/div/div/main/app-program-organizations/app-orgs-grid/section[2]/div/mat-paginator/div/div/div[2]/button[2]')))
    bt4.click()
    print('Success: Moved to next page')
        
    # Scroll back up slightly
    driver.execute_script("window.scrollBy(0, -500);")
    time.sleep(1)
        
    # Extract links again from the new set of organizations
    orgs2 = driver.find_elements(By.TAG_NAME, "app-org-list")
    for org in orgs2:
        try:
            link_element = org.find_element(By.TAG_NAME, "a")
            href = link_element.get_attribute("href")
            href_list.append(href)
            print("Found:", href)
        except Exception:
            print("No link found in:", org.get_attribute("outerHTML"))
        

except Exception as e:
    print("Error:", e)  # Handle exceptions

finally:
    driver.quit()  # Ensure driver closes properly


In [119]:
print(f"A total of {len(href_list)} organisations were found")

A total of 185 organisations were found


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def scrape_org_details_selenium(href_list):
    options = webdriver.ChromeOptions()

    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 10)

    org_data = {}
    count = 0
    for url in href_list:
        try:
            driver.get(url)
            time.sleep(1)  # Allow time for elements to load

            # Extract organization name
            name_xpath = "/html/body/app-root/app-layout/mat-sidenav-container/mat-sidenav-content[1]/div/div/main/app-program-organization/app-org-page-title/app-feature-banner/section/div/div/app-feature-cta/div/div[1]/div[1]/h2/span"
            name = wait.until(EC.presence_of_element_located((By.XPATH, name_xpath))).text.strip()

            # Extract technologies
            tech_xpath = "/html/body/app-root/app-layout/mat-sidenav-container/mat-sidenav-content[1]/div/div/main/app-program-organization/app-org-info/section/div[2]/div/div/div[1]/div/app-org-info-details/div/div[1]/div[1]/div[2]"
            technologies = wait.until(EC.presence_of_element_located((By.XPATH, tech_xpath))).text.strip()

            org_data[name] = technologies
            
            
            
            count = count + 1
            if (count%10 == 0):
                print(f"Scraped: {count} of 185")
            elif count == 185:
                print("Done")

        except Exception as e:
            print(f"Failed to scrape {url}: {e}")

    driver.quit()
    return org_data


data = scrape_org_details_selenium(href_list)

In [13]:
# Clean technologies in the dictionary
for key, value in data.items():
    cleaned_tech = ", ".join(value.replace(",", " ").split())  # Remove extra commas/spaces
    data[key] = cleaned_tech

# Now create the DataFrame
gsoc_df = pd.DataFrame(list(data.items()), columns=['Organization', 'Technologies'])


gsoc_df


Unnamed: 0,Organization,Technologies
0,LabLua,"lua, luarocks, kernel, lunatik, pallene"
1,ScummVM,"python, opengl, c++, assembly, php"
2,52°North Spatial Information Research GmbH,"javascript, android, java, web, services, ogc,..."
3,AsyncAPI,"javascript, java, go, typescript, RAML"
4,openSUSE Project,"python, c/c++, go, ruby, reactjs, javascript"
...,...,...
180,OpenELIS Global,"postgresql, javascript, java, react, spring"
181,The Rust Foundation,"python, rust"
182,FOSSASIA,"c, python, javascript, django, android"
183,The JPF team,"android, java, distributed, systems, jvm"


In [None]:
# Allowed technologies (normalized)
allowed_techs = {"flask","python", "javascript","react","node.js",'typescript','tailwind','django',"node"," pandas","numpy","api","scikit-learn","scikitlearn","sql","mysql","matplot","seaborn","beautifulsoup","selenium","powerbi","github","git"}  # At least one must be present

# Function to filter organizations where at least one allowed tech exists
def at_least_one_filter_organizations(df):
    filtered_data = {}

    for index, row in df.iterrows():
        # Normalize and split technologies into a set
        techs = {t.strip().lower() for t in row['Technologies'].split(",")}
        
        # Check if there's an intersection between row techs and allowed techs
        if allowed_techs & techs:  # At least one common element
            filtered_data[row['Organization']] = ", ".join(techs)  # Store as a string
    
    return filtered_data

# Apply filtering
filtered_dict = at_least_one_filter_organizations(gsoc_df)

# Display results
print(filtered_dict)

In [None]:
# (lowercase, no spaces)
terms_to_remove = {"go","c++","rust","spring","shell","linux","swift"}  
terms_to_remove = {t.lower().replace(" ", "") for t in terms_to_remove}

# Filter the dictionary
filtered_dict = {
    k: v for k, v in filtered_dict.items()
    if not any(term in v.lower().replace(" ", "") for term in terms_to_remove)
}

# Display the updated dictionary
print(filtered_dict)
print(len(filtered_dict))

In [None]:
# Clean technologies in the dictionary
for key, value in filtered_dict.items():
    cleaned_tech = ", ".join(value.replace(",", " ").split())  # Remove extra commas/spaces
    filtered_dict[key] = cleaned_tech

# Now create the DataFrame
mgsoc_df = pd.DataFrame(list(filtered_dict.items()), columns=['Organization', 'Technologies'])


mgsoc_df

In [None]:
mgsoc_df.to_csv('mgsoc_data.csv')