In [1]:
import requests
import pandas as pd

def fetch_gsoc_organizations(url, timeout=30):
    """Download the organization list from a GSoC API endpoint.

    The function is best-effort: every failure mode (network error, non-200
    status, body that is not valid JSON) is reported on stdout and results in
    an empty list, so callers can always iterate over the return value.

    Args:
        url: Endpoint that is expected to answer with a JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded JSON body on an HTTP 200 response, otherwise ``[]``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP errors.
        print(f"Failed to fetch data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data: HTTP {response.status_code}")
        return []

    try:
        return response.json()
    except ValueError as exc:
        # response.json() raises a ValueError subclass when the body is not JSON.
        print(f"Failed to fetch data: invalid JSON ({exc})")
        return []

def process_organizations(organizations):
    """Flatten organization records into a DataFrame with one row per organization.

    Args:
        organizations: Iterable of dict-like records. Missing keys are
            tolerated; list-valued keys may also be present with a ``None``
            value.

    Returns:
        A ``pandas.DataFrame`` that always has the same fifteen columns, even
        when ``organizations`` is empty. Scalar fields are copied as-is (a
        missing key becomes ``''``); list fields are joined with ``', '``.
    """
    scalar_fields = (
        'name', 'description', 'tagline', 'website_url', 'source_code',
        'ideas_link', 'contributor_guidance_url', 'license', 'logo_url',
    )
    # Lists of plain strings.
    tag_fields = ('categories', 'tech_tags', 'topic_tags')
    # Lists of dicts from which only the 'value' entry is kept.
    link_fields = ('contact_links', 'direct_comm_methods', 'social_comm_methods')

    data = []
    for org in organizations:
        org_data = {field: org.get(field, '') for field in scalar_fields}
        for field in tag_fields:
            # `or []` covers keys that are present but null; .get's default
            # only applies when the key is absent.
            org_data[field] = ', '.join(org.get(field) or [])
        for field in link_fields:
            org_data[field] = ', '.join(
                entry['value']
                for entry in (org.get(field) or [])
                if entry.get('value') is not None  # skip entries without a usable value
            )
        data.append(org_data)

    # Explicit columns keep the schema stable for empty input, so downstream
    # cells can still select columns such as 'ideas_link'.
    return pd.DataFrame(data, columns=[*scalar_fields, *tag_fields, *link_fields])

# URL for GSoC 2024 organizations
url = "https://summerofcode.withgoogle.com/api/program/2024/organizations/"
# Fetch the organization records; fetch_gsoc_organizations returns [] on a non-200 response.
organizations = fetch_gsoc_organizations(url)
# Flatten the records into one DataFrame row per organization.
gsoc_df = process_organizations(organizations)



In [2]:
# Preview the first rows to sanity-check the flattened columns.
gsoc_df.head()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
0,LibreCube Initiative,LibreCube develops an ecosystem of open source...,Open Source Space Exploration,https://librecube.org,https://gitlab.com/librecube,https://librecube.org/google-summer-of-code-2024/,https://librecube.org/google-summer-of-code-pr...,MIT,https://summerofcode.withgoogle.com/media/org/...,Science and medicine,"python, rest api, micropython","automation, space, Communication Protocols, Rover","info@librecube.org, https://app.element.io/#/r...","info@librecube.org, https://app.element.io/#/r...",https://fosstodon.org/@librecube
1,Alaska,Alaska Project Ideas are mentored by the resea...,"Many Traditions, One Alaska",https://www.uaa.alaska.edu/research,https://github.com/uaanchorage/GSoC,https://github.com/uaanchorage/GSoC,https://github.com/uaanchorage/GSoC/blob/main/...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Artificial Intelligence","python, mysql, java, matlab, dicom","deep learning, neuroscience, radiology, heathc...",https://github.com/uaanchorage/GSoC/discussion...,https://github.com/uaanchorage/GSoC/discussions,https://github.com/uaanchorage/GSoC/wiki
2,The Julia Language,"The Julia Language is an open-source, high lev...",A fresh approach to technical computing,https://julialang.org,https://github.com/JuliaLang/julia,https://julialang.org/jsoc/projects/,https://julialang.org/jsoc/guidelines/,MIT,https://summerofcode.withgoogle.com/media/org/...,"Programming languages, Artificial Intelligence","machine learning, julia, data science, compile...","math, machine learning, science, data science,...","https://julialang.org/slack/, https://discours...","https://julialang.org/slack/, https://discours...","https://twitter.com/JuliaLanguage, https://jul..."
3,Graphite,Graphite is an in-development raster and vecto...,Redefining state‑of‑the‑art graphics editing,https://graphite.rs,https://github.com/GraphiteEditor/Graphite,https://graphite.rs/volunteer/guide/projects/s...,https://graphite.rs/volunteer/guide/projects/s...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"End user applications, Media","rust, typescript, ai, webgpu, stable diffusion","graphics, computational geometry, rendering, d...","https://discord.graphite.rs, contact@graphite....","https://discord.graphite.rs, contact@graphite.rs","https://twitter.com/GraphiteEditor/, https://g..."
4,BRL-CAD,<p>This is the place to be if you love compute...,3D CAD & other computer-aided tech (CAx),https://opencax.github.io/,https://github.com/BRL-CAD/brlcad,https://opencax.github.io/project-proposals/,https://opencax.github.io/gsoc_checklist.html,ISC,https://summerofcode.withgoogle.com/media/org/...,"Media, Artificial Intelligence","python, c/c++, opengl, opencl, scripting","geometry, 2d/3d graphics, ray tracing, high-pe...","https://opencax.github.io, devs@brlcad.org, ht...","https://opencax.github.io, devs@brlcad.org, ht...",https://fb.me/BRL-CAD


In [3]:
# Summarise each column (count, unique, top, freq) to spot missing or repeated values.
gsoc_df.describe()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
count,195,195,195,195,195,195,184,195,195,195,195,195,195,195,195.0
unique,195,195,195,195,195,195,184,21,195,64,192,195,195,195,178.0
top,LibreCube Initiative,LibreCube develops an ecosystem of open source...,Open Source Space Exploration,https://librecube.org,https://gitlab.com/librecube,https://librecube.org/google-summer-of-code-2024/,https://librecube.org/google-summer-of-code-pr...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Data","python, machine learning, c++, data analysis, ...","automation, space, Communication Protocols, Rover","info@librecube.org, https://app.element.io/#/r...","info@librecube.org, https://app.element.io/#/r...",
freq,1,1,1,1,1,1,1,46,1,13,2,1,1,1,18.0


In [5]:
# Save the table to gsoc_organizations.csv; index=False leaves the row index out of the file.
gsoc_df.to_csv('gsoc_organizations.csv', index=False)

In [2]:
def classify_ideas_link(ideas_link):
    """Classify an ideas-list link by where it is hosted.

    Args:
        ideas_link: The link text. Non-string values (for example ``None`` or
            ``NaN`` for a missing link) are accepted.

    Returns:
        ``"Google Doc"`` when the link contains ``docs.google.com``;
        ``"GitHub Issues"`` when it contains both ``github.com`` and
        ``/issues``; ``"Webpage"`` when it contains ``http``; otherwise
        ``"Other"`` (including every non-string input).
    """
    # Substring tests below would raise TypeError on None / NaN.
    if not isinstance(ideas_link, str):
        return "Other"
    if "docs.google.com" in ideas_link:
        return "Google Doc"
    if "github.com" in ideas_link and "/issues" in ideas_link:
        return "GitHub Issues"
    # "http" also matches "https", so a single test covers both schemes.
    if "http" in ideas_link:
        return "Webpage"
    return "Other"
    
# Label every organization's ideas link, then count how many fall into each label.
gsoc_df['ideas_link_type'] = gsoc_df['ideas_link'].apply(classify_ideas_link)
gsoc_df['ideas_link_type'].value_counts()

ideas_link_type
Webpage          177
Google Doc        15
GitHub Issues      3
Name: count, dtype: int64

In [4]:
# Show the organizations whose ideas link was labelled 'GitHub Issues'.
gsoc_df[gsoc_df['ideas_link_type'] == 'GitHub Issues']

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods,ideas_link_type
123,cBioPortal for Cancer Genomics,The cBioPortal for Cancer Genomics is a resour...,Aid discovery in complex cancer genomics data,https://www.cbioportal.org/,https://github.com/cBioPortal,https://github.com/cBioPortal/GSoC/issues?q=is...,https://github.com/cBioPortal/GSoC,LGPL-3.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Artificial Intelligence","mysql, javascript, java, react, typescript","genomics, cancer, bioinformatics, big data, pr...","https://slack.cbioportal.org, https://twitter....",https://slack.cbioportal.org,https://twitter.com/cbioportal,GitHub Issues
130,AFLplusplus,We are dedicated to provide the most effective...,State of the art fuzzing for better security,https://aflplus.plus,https://github.com/AFLplusplus,https://github.com/AFLplusplus/LibAFL/issues/119,,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,Security,"llvm, rust, fuzzing, qemu","fuzzing, ci","afl@aflplus.plus, https://twitter.com/aflplusplus",afl@aflplus.plus,https://twitter.com/aflplusplus,GitHub Issues
150,National Resource for Network Biology (NRNB),The National Resource for Network Biology (NRN...,Developing open source tools for network biology,https://nrnb.org/gsoc.html,https://github.com/nrnb,https://github.com/nrnb/GoogleSummerOfCode/issues,https://docs.google.com/document/d/1Zi6L38CHEe...,LGPL-2.1,https://summerofcode.withgoogle.com/media/org/...,Science and medicine,"python, javascript, java, html, r","web application, data science, graphics, scien...",https://github.com/nrnb/GoogleSummerOfCode/iss...,https://github.com/nrnb/GoogleSummerOfCode/iss...,https://twitter.com/cytoscape,GitHub Issues


In [None]:
# TODO: Create a function that processes each type of ideas link and returns the idea details.

# TODO: Process the Google Doc links and extract all the ideas; I want to process all the ideas in the wrangler login.