## Data Extractor with other tools
- I tried LlamaIndex, but the results were not satisfactory, so here I try different approaches.

### Data Extractor with Gemini

In [None]:
# GSoC 2024 ideas extraction using a JSON-only prompt
import google.generativeai as genai
import textwrap
import pandas as pd
import os
import json


# Read the API key from the environment rather than embedding it in the
# notebook. When GOOGLE_API_KEY is unset this falls back to the empty string
# that was hard-coded here before, so existing behaviour is unchanged.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Sampling / output-length settings passed to every generate_content call
# made through `model` below.
generation_config = {
    "temperature": 0.9,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 2048,
}

# Apply the same blocking threshold to each harm category.
safety_settings = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

model = genai.GenerativeModel(model_name="gemini-1.0-pro",
                              generation_config=generation_config,
                              safety_settings=safety_settings)

gsoc_df = pd.read_csv("gsoc_organizations_ideas_link.csv")

# Make sure the output directory exists so that neither the listing below nor
# the writes inside the loop fail on a fresh checkout.
os.makedirs('results2', exist_ok=True)
files = os.listdir('results2')

# for each file name split it using the underscore and the first part is the main name, save them in a list
file_names = [file.split('_')[0] for file in files]
print(file_names)
# Set copy of the names above for O(1) "already processed" checks in the loop.
processed_names = set(file_names)

# The instruction part of the prompt never changes, so build it once instead
# of on every iteration. The file content is appended per organisation.
extraction_prompt = textwrap.dedent("""\
          Please return JSON descriping the the ideas from the given content using the following schema:

          {"ideas": list[IDEA]}

          IDEA = {"title": str, "description": str, "skills": str, "difficulty": str, "duration": str,  "related_url": list[str]}

          All fields are required. the related_url field, whitch is the related URLs.

          Important: Only return a single piece of valid JSON text.

          Here is the content:

          """)

# For each text file
for index, row in gsoc_df.iterrows():
    # Skip organisations without a scraped ideas page and those that already
    # have an output file in results2.
    if row['ideas_link_file'] == "Not Found" or row['name'] in processed_names:
        continue

    with open(f"./ideas_link_data/{row['ideas_link_file']}", 'r') as file:
        content = file.read()

    response = model.generate_content(extraction_prompt + content)

    try:
        # Strip a surrounding Markdown code fence (```json ... ```) if present.
        json_text = response.text.strip('`\r\n ').removeprefix('json')
        # Round-trip through the parser to validate and pretty-print.
        data = json.dumps(json.loads(json_text), indent=4)
    except ValueError as err:
        # `response.text` raises ValueError when the response holds no usable
        # text, and json.JSONDecodeError is a ValueError subclass. Skip this
        # organisation instead of aborting the whole batch.
        print(f"Skipping {row['name']}: could not parse model output ({err})")
        continue

    with open(f"results2/{row['name']}_ideas.json", 'w') as file:
        file.write(data)
    print(f"Done with {row['name']}")
print("All done")

In [None]:
import google.ai.generativelanguage as glm
import google.generativeai as genai

def _string_field():
    """Return a new schema node describing a single string value."""
    return glm.Schema(type=glm.Type.STRING)


def _string_list_field():
    """Return a new schema node describing an array of strings."""
    return glm.Schema(type=glm.Type.ARRAY, items=_string_field())


# Schema of one extracted idea; `related_url` is the only optional property.
_idea_properties = {
    'title': _string_field(),
    'description': _string_field(),
    'skills': _string_list_field(),
    'duration': _string_field(),
    'difficulty': _string_field(),
    'related_url': _string_list_field(),
}
_idea_required = ['title', 'description', 'skills', 'duration', 'difficulty']

gsoc_idea = glm.Schema(
    type=glm.Type.OBJECT,
    properties=_idea_properties,
    required=_idea_required,
)

# An array whose items are individual ideas.
gsoc_ideas = glm.Schema(type=glm.Type.ARRAY, items=gsoc_idea)

# Function declaration offered to the model as a tool: its single argument
# `gsoc_ideas` carries the array of ideas defined above.
_add_to_database_description = textwrap.dedent("""\
        Adds entities to the database.
        """)
_add_to_database_parameters = glm.Schema(
    type=glm.Type.OBJECT,
    properties={'gsoc_ideas': gsoc_ideas},
)

add_to_database = glm.FunctionDeclaration(
    name="add_to_database",
    description=_add_to_database_description,
    parameters=_add_to_database_parameters,
)

# Model configured with the `add_to_database` function declaration as a tool.
model = genai.GenerativeModel(
    model_name='gemini-1.0-pro',
    tools=[add_to_database])

gsoc_df = pd.read_csv("gsoc_organizations_ideas_link.csv")

# Make sure the output directory exists so that neither the listing below nor
# the writes inside the loop fail on a fresh checkout.
os.makedirs('results2', exist_ok=True)
files = os.listdir('results2')

# for each file name split it using the underscore and the first part is the main name, save them in a list
file_names = [file.split('_')[0] for file in files]
print(file_names)
# Set copy of the names above for O(1) "already processed" checks in the loop.
processed_names = set(file_names)

# For each text file
for index, row in gsoc_df.iterrows():
    # Skip organisations without a scraped ideas page and those that already
    # have an output file in results2.
    if row['ideas_link_file'] == "Not Found" or row['name'] in processed_names:
        continue

    with open(f"./ideas_link_data/{row['ideas_link_file']}", 'r') as file:
        content = file.read()

    result = model.generate_content(f"""
    Please add the ideas from this content to the database:

    {content}
    """)

    # A response may come back with no candidates or no parts; indexing [0]
    # unguarded would raise IndexError and abort the whole batch.
    candidates = result.candidates
    parts = candidates[0].content.parts if candidates else []
    if not parts or 'function_call' not in parts[0]:
        print(f"No function call returned for {row['name']}, nothing saved")
        continue

    fc = parts[0].function_call
    # Convert the function-call message to a plain dict so it can be dumped
    # as JSON.
    data = json.dumps(type(fc).to_dict(fc), indent=4)
    with open(f"results2/{row['name']}_add_to_database.json", 'w') as file:
        file.write(data)
    print(f"Done with {row['name']}")
print("All done")


### Data Extractor with Cloudflare AI worker

In [10]:
import requests
import pandas as pd
from IPython.display import Markdown, display
import json

# Model identifier string.
# NOTE(review): the request URL built later in this cell hard-codes
# "@cf/meta/llama-2-7b-chat-fp16" and does not reference this variable, so
# changing it here has no effect on which model is called — confirm intent.
model = "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"

# JSON schema
# Kept as a raw string (not a dict) because it is interpolated verbatim into
# `system_prompt` below. It describes a single project idea; "duration" and
# "relatedLinks" are the only properties not listed under "required".
json_schema = """
{
    "title": "GSoC Project Idea",
    "description": "An idea from an organization from Google Summer of Code",
    "type": "object",
    "properties": {
        "organizationName": {
            "description": "The name of the organization proposing the project",
            "type": "string"
        },
        "projectTitle": {
            "description": "The title of the project",
            "type": "string"
        },
        "projectDescription": {
            "description": "A brief description of the project",
            "type": "string"
        },
        "skills": {
            "description": "Skills or tech stack required for the project",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "difficulty": {
            "description": "The difficulty level of the project",
            "type": "string"
        },
        "duration": {
            "description": "The estimated duration of the project in hours",
            "type": "integer"
        },
        "relatedLinks": {
            "description": "Related links for the project",
            "type": "array",
            "items": {
                "type": "string"
            }
        }
    },
    "required": ["organizationName", "projectTitle", "projectDescription", "skills", "difficulty"]
}
"""

# System message sent with every request: it embeds the schema above between
# explicit BEGIN/END markers and instructs the model to answer with JSON only.
system_prompt = f"""
The text below describes a project idea for Google Summer of Code. Create a JSON object from the description to match the JSON schema provided. The description may have more than one idea and you should give me all the available objects.

<BEGIN JSON SCHEMA>
{json_schema}
<END JSON SCHEMA>

Return JSON only. Do not explain or provide usage examples. just give me the JSON object that you can return and for parts you don't know just leave them empty.
"""

import os

gsoc_df = pd.read_csv("gsoc_organizations_ideas_link.csv")

# Credentials are read from the environment so that secrets are never stored
# in the notebook or committed to version control.
# NOTE(review): an API token was previously embedded in this cell; it should
# be treated as compromised and rotated.
account_id = os.environ.get("CLOUDFLARE_ACCOUNT_ID")
api_token = os.environ.get("CLOUDFLARE_API_TOKEN")
if not account_id or not api_token:
    raise RuntimeError(
        "Set the CLOUDFLARE_ACCOUNT_ID and CLOUDFLARE_API_TOKEN environment "
        "variables before running this cell."
    )

# Maximum number of characters of source text sent per request.
CHUNK_SIZE = 2000
# Seconds to wait for the API before giving up on a chunk.
REQUEST_TIMEOUT = 60

# NOTE(review): the endpoint hard-codes the llama-2 chat model; the `model`
# variable defined earlier in this cell is not used — confirm which is meant.
api_url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/@cf/meta/llama-2-7b-chat-fp16"
request_headers = {"Authorization": f"Bearer {api_token}"}

# Names of organisations that already have an output file in results2: the
# part of each file name before the first underscore.
os.makedirs('results2', exist_ok=True)
files = os.listdir('results2')
file_names = [file.split('_')[0] for file in files]
print(file_names)
processed_names = set(file_names)

# Parsed idea objects are collected here and turned into a DataFrame once at
# the end; DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every call before that.
records = []

for index, row in gsoc_df.iterrows():
    # Same skip rule as the Gemini cells: no scraped ideas page, or the
    # organisation already has results.
    if row['ideas_link_file'] == "Not Found" or row['name'] in processed_names:
        continue

    with open(f"./ideas_link_data/{row['ideas_link_file']}", 'r') as file:
        prompt = file.read()

    print(f"Processing idea {index + 1}...")

    # Split the prompt into chunks of CHUNK_SIZE characters
    chunks = [prompt[i:i + CHUNK_SIZE] for i in range(0, len(prompt), CHUNK_SIZE)]

    for chunk in chunks:
        try:
            response = requests.post(
                api_url,
                headers=request_headers,
                json={"messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": chunk}
                ]},
                timeout=REQUEST_TIMEOUT,
            )
            inference = response.json()
            # On API errors "result" may be missing or null, which surfaces
            # here as KeyError / TypeError.
            answer = (inference["result"]["response"] or "").strip()
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            print(f"Request failed for idea {index + 1}: {err}")
            continue

        print(answer)
        try:
            # Parse the JSON response
            user_info = json.loads(answer)
        except json.JSONDecodeError:
            print(f"Failed to process idea {index + 1}!")
            continue

        # The model may return one object or a list of objects; keep only
        # dict-shaped entries, which map to DataFrame rows.
        candidates = user_info if isinstance(user_info, list) else [user_info]
        ideas = [item for item in candidates if isinstance(item, dict)]
        if not ideas:
            print(f"Failed to process idea {index + 1}!")
            continue

        records.extend(ideas)
        print(f"Successfully processed idea {index + 1}!")

# Build the DataFrame of parsed JSON responses in one step.
parsed_df = pd.DataFrame(records)

# Save the parsed DataFrame to a CSV file
parsed_df.to_csv('parsed_data.csv', index=False)

# Display the parsed DataFrame
display(parsed_df)


['OpenRefine', 'Swift', 'Nightwatch.js', 'Nightwatch.js', 'Nightwatch.js', 'Nightwatch.js', 'Open Chemistry', 'Nightwatch.js', 'OWASP Foundation', 'OWASP Foundation', 'Internet Health Report', 'JdeRobot', 'LAPPIS', 'R project for statistical computing', 'Zulip']
Processing idea 1...
Here are the available JSON objects based on the provided schema:
{
"organizationName": "Graphite",
"projectTitle": "Student projects",
"projectDescription": "Graphite offers a number of opportunities for students to contribute by building a self-contained project as part of a structured format. These projects are designed to be completed over several months and are ideal for Google Summer of Code or similar internship programs, solo or group university capstone projects, and other arrangements. Each project has a distinct focus and is a great way to make a meaningful contribution to open source over the length of the program while receiving mentorship and guidance from the Graphite team.",
"skills": [
"str

KeyboardInterrupt: 