## The pipeline:
- Fetch the input data which is available from GSoC and parse it into a pandas dataframe.
- Classify the ideas url into Google Doc, GitHub Issues, single web page, and others.
- Using one of the three methods, classify the webpages type into single webpage that contain the ideas or a webpage with links to the ideas details.
- extract the ideas details for each type

In [1]:
import requests
import pandas as pd
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from difflib import SequenceMatcher
from requests_html import HTMLSession
import warnings
import os

warnings.filterwarnings("ignore")



In [2]:
# Fetch the input data which is available from GSoC and parse it into a pandas dataframe.
def process_organizations(organizations):
    data = []
    for org in organizations:
        organization_data = {
            'name': org.get('name', ''),
            'description': org.get('description', ''),
            'tagline': org.get('tagline', ''),
            'website_url': org.get('website_url', ''),
            'source_code': org.get('source_code', ''),
            'ideas_link': org.get('ideas_link', ''),
            'contributor_guidance_url': org.get('contributor_guidance_url', ''),
            'license': org.get('license', ''),
            'logo_url': org.get('logo_url', ''),
            'categories': ', '.join(org.get('categories', [])),
            'tech_tags': ', '.join(org.get('tech_tags', [])),
            'topic_tags': ', '.join(org.get('topic_tags', [])),
            'contact_links': ', '.join([link['value'] for link in org.get('contact_links', [])]),
            'direct_comm_methods': ', '.join([method['value'] for method in org.get('direct_comm_methods', [])]),
            'social_comm_methods': ', '.join([method['value'] for method in org.get('social_comm_methods', [])])
        }
        data.append(organization_data)
    return pd.DataFrame(data)

# URL for GSoC 2024 organizations
url = "https://summerofcode.withgoogle.com/api/program/2024/organizations/"
response = requests.get(url)
if response.status_code == 200:
    organizations = response.json()
else:
    raise requests.exceptions.RequestException("Failed to fetch data")

gsoc_df = process_organizations(organizations)

gsoc_df.head()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
0,Scalable Parallel Computing Laboratory,Scalable Parallel Computing Laboratory (SPCL) ...,High-performance computing for clusters and cl...,https://spcl.inf.ethz.ch/,https://github.com/spcl,https://github.com/spcl/.github/blob/main/prof...,https://github.com/spcl/.github/blob/main/prof...,BSD-3-Clause,https://summerofcode.withgoogle.com/media/org/...,"Programming languages, Infrastructure and cloud","python, c++, mpi, aws, Serverless","programming languages, high-performance comput...",https://chat.spcl.inf.ethz.ch/signup_user_comp...,https://chat.spcl.inf.ethz.ch/signup_user_comp...,https://twitter.com/spcl_eth
1,"Department of Biomedical Informatics, Emory Un...",Biomedical Informatics is a multidisciplinary ...,Biomedical research to advance healthcare,https://med.emory.edu/departments/biomedical-i...,https://github.com/NISYSLAB/Emory-BMI-GSoC/,https://github.com/NISYSLAB/Emory-BMI-GSoC,https://github.com/NISYSLAB/Emory-BMI-GSoC/blo...,BSD-3-Clause,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Artificial Intelligence","python, javascript, java, dicom, Colab","science and medicine, data integration, workfl...","http://bit.ly/emory-bmi, mzeydab@emory.edu, ht...","http://bit.ly/emory-bmi, mzeydab@emory.edu",https://twitter.com/emorybmi
2,Drupal Association,The Drupal Association is the non-profit organ...,The best open source digital experience platform,https://drupal.org,https://git.drupalcode.org/project/drupal.git,https://www.drupal.org/project/issues/gsoc?cat...,https://www.drupal.org/community/contributor-g...,GPL-2.0,https://summerofcode.withgoogle.com/media/org/...,"Web, Social and communication","mysql, javascript, html, php, symfony","web, cloud, DXP, Massive community, Inclusive","https://groups.drupal.org/google-summer-code, ...","https://groups.drupal.org/google-summer-code, ...",https://twitter.com/drupal
3,FreeType,FreeType is a freely available software librar...,FreeType is a software library to render fonts.,https://freetype.org,https://gitlab.freedesktop.org/freetype,https://freetype.org/gsoc.html,https://freetype.org/gsoc.html,GPL-2.0,https://summerofcode.withgoogle.com/media/org/...,"Programming languages, Media","c, autotools, meson","library, rendering, fonts, opentype, truetype",freetype-devel@nongnu.org,freetype-devel@nongnu.org,
4,JdeRobot,Robotics applications are typically distribute...,Toolkit for developing Robotics applications,http://jderobot.github.io,https://github.com/jderobot,https://jderobot.github.io/activities/gsoc/202...,https://jderobot.github.io/activities/gsoc/202...,GPL-3.0,https://summerofcode.withgoogle.com/media/org/...,"Other, Artificial Intelligence","python, ros, gazebo, opencv, tensorflow","education, artificial intelligence, robotics, ...",https://gsyc.urjc.es/cgi-bin/mailman/listinfo/...,https://gsyc.urjc.es/cgi-bin/mailman/listinfo/...,https://twitter.com/JdeRobot


In [4]:
gsoc_df.describe()

Unnamed: 0,name,description,tagline,website_url,source_code,ideas_link,contributor_guidance_url,license,logo_url,categories,tech_tags,topic_tags,contact_links,direct_comm_methods,social_comm_methods
count,195,195,195,195,195,195,184,195,195,195,195,195,195,195,195.0
unique,195,195,195,195,195,195,184,21,195,64,192,195,195,195,178.0
top,Synfig,Synfig is a 2D open-source animation software....,Open-source 2D animation software,https://synfig.org,https://github.com/synfig/synfig,https://synfig-docs-dev.readthedocs.io/en/late...,https://synfig-docs-dev.readthedocs.io/en/late...,Apache-2.0,https://summerofcode.withgoogle.com/media/org/...,"Science and medicine, Data","python, c++","2d/3d graphics, animation, vector graphics",https://forums.synfig.org/t/gsoc-2024-google-s...,https://forums.synfig.org/t/gsoc-2024-google-s...,
freq,1,1,1,1,1,1,1,46,1,13,2,1,1,1,18.0


In [3]:
gsoc_df.to_csv('./gsoc_organizations.csv', index=False)

In [3]:
gsoc_df_copy = gsoc_df.copy()
# Classify the ideas url into Google Doc, GitHub Issues, single web page, and others.
def classify_ideas_link(ideas_link):
    parsed_url = urlparse(ideas_link)
    if "docs.google.com" in parsed_url.netloc:
        return "Google Doc"
    elif "github.com" in parsed_url.netloc and "/issues" in parsed_url.path:
        return "GitHub Issues"
    elif parsed_url.scheme in ["http", "https"]:
        return "Webpage"
    else:
        return "Other"
    
gsoc_df_copy['ideas_link_type'] = gsoc_df_copy['ideas_link'].apply(classify_ideas_link)

gsoc_df_copy.value_counts('ideas_link_type')

ideas_link_type
Webpage          177
Google Doc        15
GitHub Issues      3
Name: count, dtype: int64

In [4]:
# for each ideas link, fetch the data and store the text into .txt file and save it in a directory.
def fetch_ideas_link(ideas_link, org_name):
    try:
        session = HTMLSession()
        response = session.get(ideas_link)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text()
            text = re.sub(r'\n+', '\n', text)
            text = re.sub(r'\n',
                            ' ',
                            text)
            text = re.sub(r'\s+',
                            ' ',
                            text)
            text = text.strip()
            filename = org_name + ".txt"
            with open(f"./ideas_link_data/{filename}", 'w') as file:
                file.write(text)
        else:
            filename = "Not Found"
    except:
        filename = "Not Found"
            
    return filename

# Create a directory to store the ideas link data
if not os.path.exists('ideas_link_data'):
    os.makedirs('ideas_link_data')
    
gsoc_df_copy['ideas_link_file'] = gsoc_df_copy.apply(lambda row: fetch_ideas_link(row['ideas_link'], row['name']), axis=1)

In [5]:
# count the number of words in each ideas file
def count_words_in_file(filename):
    if filename == "Not Found":
        return 0
    
    with open(f"./ideas_link_data/{filename}", 'r') as file:
        text = file.read()
        words = text.split()
        return len(words)
    
gsoc_df_copy['ideas_link_word_count'] = gsoc_df_copy['ideas_link_file'].apply(count_words_in_file)

In [6]:
# drop rows with ideas_link_word_count < 100
gsoc_df_copy.drop(gsoc_df_copy[gsoc_df_copy['ideas_link_word_count'] < 100].index, inplace=True)

In [None]:
gsoc_df_copy.to_csv('./gsoc_organizations_ideas_link.csv', index=False)

### Used DataFrame Structured Data Extraction from [here](https://docs.llamaindex.ai/en/stable/examples/output_parsing/df_program.html):

In [None]:
# %pip install llama-index-llms-openai
# %pip install llama-index-program-evaporate

In [None]:
%load_ext autoreload
%autoreload 2

In [7]:
os.environ["OPENAI_API_KEY"] = "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"

from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.program.evaporate.df import DFFullProgram, DFRowsProgram
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
import pandas as pd

# initialize empty df
df = pd.DataFrame(
    {
        "idea title": pd.Series(dtype="str"),
        "idea description": pd.Series(dtype="str"),
        "skills": pd.Series(dtype="str"),
        "difficulty": pd.Series(dtype="str"),
        "duration": pd.Series(dtype="str"),
    }
)

# initialize program, using existing df as schema
df_rows_program = DFRowsProgram.from_defaults(
    pydantic_program_cls=OpenAIPydanticProgram, df=df
)

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")

# parse text for each text file, using existing df as schema
import nltk
import os
from pathlib import Path

# Create a directory to store the results
Path("./results2").mkdir(parents=True, exist_ok=True)

files = os.listdir('results2')

# for each file name split it using the underscore and the first part is the main name, save them in a list
file_names = [file.split('_')[0] for file in files]
print(file_names)

# For each text file
for index, row in gsoc_df_copy.iterrows():
    if row['ideas_link_file'] == "Not Found" or row['name'] in file_names:
        continue
    
    with open(f"./ideas_link_data/{row['ideas_link_file']}", 'r') as file:
        text = file.read()
        # Tokenize the text
        tokens = nltk.word_tokenize(text)
        # Divide the tokens into chunks of 3000
        chunks = [tokens[i:i + 3000] for i in range(0, len(tokens), 3000)]
        # Process each chunk
        for i, chunk in enumerate(chunks):
            try:
                chunk_text = ' '.join(chunk)
                result_obj = df_rows_program(input_str=chunk_text)
                # Save the result to a .csv file
                result_obj.to_df(existing_df=df).to_csv(f'./results2/{row["name"]}_chunk_{i}.csv', index=False)
            except:
                pass

['OpenRefine', 'Swift', 'Nightwatch.js', 'Nightwatch.js', 'Nightwatch.js', 'Nightwatch.js', 'Open Chemistry', 'Nightwatch.js', 'OWASP Foundation', 'OWASP Foundation', 'Internet Health Report', 'JdeRobot', 'LAPPIS', 'R project for statistical computing', 'Zulip']


Retrying llama_index.llms.openai.base.OpenAI._chat in 0.0077555005521194875 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-V7lZkUuaTH7Fq8vi2FdzVHQh on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._chat in 0.12482114229223507 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-V7lZkUuaTH7Fq8vi2FdzVHQh on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your 

### Using Entity Extractor integration from LlamaIndex Hub:

In [None]:
# TODO:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.extractors.entity import EntityExtractor

reader = SimpleDirectoryReader('files')
documents = reader.load_data()
parser = SentenceSplitter(include_prev_next_rel=True)
nodes = parser.get_nodes_from_documents(documents)

entity_extractor = EntityExtractor(
    label_entities = True,
    device = "cpu"
)
metadata_list = entity_extractor.extract(nodes)

print(metadata_list)


In [None]:
# TODO
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import BaseExtractor
from typing import List, Dict

class CustomExtractor(BaseExtractor):
    async def aextract(self, nodes) -> List[Dict]:
        metadata_list = [
            {
                "node_length":  str(len(node.text))
            }
            for node in nodes
        ]
        return metadata_list

reader = SimpleDirectoryReader('files')
documents = reader.load_data()

parser = SentenceSplitter(include_prev_next_rel=True)
nodes = parser.get_nodes_from_documents(documents)

extractor = CustomExtractor()
print(extractor.extract(nodes))