In [1]:
# reload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import gradio as gr
import pandas as pd
from typing import List, Dict
from tools.logger import _logger

# Notebook-wide logger; its records are tagged with the name 'main_notebook'.
logger = _logger('main_notebook')

Loaded 11 environment variables from .env


In [3]:
# Free-text product description; it is the only argument handed to
# identify_requirements in the next cell.
INPUT = "A security auditing service for organizations utilizing smart contracts."

In [4]:
from search.inference import StackRequirements, identify_requirements


# Derive the stack requirements for the product described by INPUT.
stack_requirements: StackRequirements = identify_requirements(INPUT)

# Keep the framework names both as a list (for the loader) and as a single
# comma-separated string (for logging).
framework_list = stack_requirements.frameworks
search_terms = ', '.join(framework_list)

[32m[INFO:url_document_loader:] scrapingbee client params: {'render_js': 'True'}[0m




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo identify the specific stack requirements for a security auditing service focusing on smart contracts, we need to consider the nature of smart contracts themselves, which are primarily deployed on blockchain platforms. The most common blockchain for deploying smart contracts is Ethereum, but there are others like Binance Smart Chain, Polkadot, and Solana that are also popular. Given this, the stack requirements would likely involve tools and frameworks for analyzing and interacting with smart contracts across various blockchains, as well as security analysis tools specifically designed for these types of contracts.

Thought: Since the product is a security auditing service for smart contracts, the stack requirements will likely include blockchain development frameworks, smart contract analysis tools, and possibly blockchain simulation environments for testing. Ethereum being the most popular platform for smart contracts, to

In [5]:
# Log the comma-separated framework names for a quick visual check.
logger.info(f"Search terms (frameworks): {search_terms}")

[32m[INFO:main_notebook:] Search terms (frameworks): Echidna, Slither, Mythril, Securify, Diligence Fuzzing, Cyfrin Aderyn, Medusa[0m


In [6]:
from loader.inference import AccountData, load_account_data_list


# Load the account data for the identified frameworks, with verbose logging on.
account_data_list: List[AccountData] = load_account_data_list(framework_list, verbose=True)

[32m[INFO:github_loader:] Searching for repositories for ['Echidna', 'Slither', 'Mythril', 'Securify', 'Diligence Fuzzing', 'Cyfrin Aderyn', 'Medusa'][0m
[32m[INFO:github_loader:] Found 30 repositories for Echidna[0m
[32m[INFO:github_loader:] Found 30 repositories for Slither[0m
[32m[INFO:github_loader:] Found 30 repositories for Mythril[0m
[32m[INFO:github_loader:] Found 30 repositories for Securify[0m
[32m[INFO:github_loader:] Found 2 repositories for Diligence Fuzzing[0m
[32m[INFO:github_loader:] Found 0 repositories for Cyfrin Aderyn[0m
[32m[INFO:github_loader:] Found 30 repositories for Medusa[0m
[32m[INFO:github_loader:] 152 total repositories.[0m
[32m[INFO:github_loader:] 152 unique repositories.[0m
[32m[INFO:github_loader:] 146 repositories with issues.[0m
[32m[INFO:github_loader:] 81 repositories with open issues.[0m
[32m[INFO:github_loader:] 37 repositories with 100+ stars.[0m
[32m[INFO:github_loader:] Found 37 repositories for ['Echidna', 'Slither'

In [7]:
# Log how many accounts were loaded.
logger.info(f"Loaded {len(account_data_list)} accounts")

[32m[INFO:main_notebook:] Loaded 51 accounts[0m


In [12]:
import yaml
from copy import deepcopy
from labelling.inference import agenerate_labels_for_github_issue_list


def annotate_account_data_labels(account_data_list: List[AccountData]) -> List[AccountData]:
    """
    Return a labelled deep copy of ``account_data_list``.

    >> Each document gets ``metadata['labels']``, generated from its title.
    >> Each account gets ``metadata['labels']``: the labels of its documents,
       flattened with duplicates kept. Its ``page_content`` is then re-dumped
       as YAML from the updated metadata.

    The list passed in is not modified; all changes happen on the copy.
    """

    # Work on a private copy so the caller's objects stay untouched.
    annotated = deepcopy(account_data_list)

    # Flatten every account's documents into one list (account order preserved).
    all_issues = [doc for entry in annotated for doc in entry.documents]

    # One labelling call over all issue titles.
    titles = [doc.metadata['title'] for doc in all_issues]
    labels_per_issue = agenerate_labels_for_github_issue_list(titles)

    # Attach the labels back onto the issues by position.
    for position in range(len(all_issues)):
        all_issues[position].metadata['labels'] = labels_per_issue[position]

    for entry in annotated:
        login = entry.account.metadata['login']

        # Re-select this account's documents by matching creator to login.
        authored = [doc for doc in all_issues if doc.metadata['creator'] == login]
        entry.documents = authored

        # Merge the per-issue labels into one flat list (no de-duplication).
        merged = []
        for doc in authored:
            merged.extend(doc.metadata['labels'])
        entry.account.metadata['labels'] = merged

        # Keep page_content in sync with the updated metadata.
        entry.account.page_content = yaml.dump(entry.account.metadata, default_flow_style=False)

    return annotated


# Label a deep copy of the loaded accounts; account_data_list itself is left unchanged.
labelled_account_data_list = annotate_account_data_labels(account_data_list)

labelling (gpt3) (num_proc=16):   0%|          | 0/99 [00:00<?, ? examples/s]

In [13]:
import pandas as pd
# Collect one metadata mapping per labelled account and preview them as a table.
accounts = [entry.account.metadata for entry in labelled_account_data_list]
accounts_df = pd.DataFrame(accounts)
accounts_df.head()

Unnamed: 0,login,url,type,site_admin,name,company,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at,labels,blog,location,email,hireable
0,rappie,https://api.github.com/users/rappie,User,False,Rappie,@perimetersec,Smart contract fuzzing specialist,rappie_eth,35,1,34,80,2012-02-12T13:16:43Z,2024-08-30T15:51:43Z,"[security_and_compliance, integration_and_cust...",,,,
1,0xicingdeath,https://api.github.com/users/0xicingdeath,User,False,0xicingdeath,,blockchain nerd \r\n,0xicingdeath,39,9,122,57,2016-01-16T19:18:52Z,2024-07-22T17:28:28Z,"[analytics_and_advanced_monitoring, integratio...",,,,
2,aviggiano,https://api.github.com/users/aviggiano,User,False,Antonio Viggiano,,I help protocols improve their invariant tests,agfviggiano,149,4,198,102,2012-12-12T22:15:52Z,2024-09-17T15:11:46Z,"[integration_and_customizations, scalability, ...",https://allthingsfuzzy.substack.com/,"São Paulo, Brazil",,
3,0xalpharush,https://api.github.com/users/0xalpharush,User,False,alpharush,@asymmetric-research,,0xalpharush,62,11,504,171,2021-07-13T15:45:17Z,2024-09-30T23:04:52Z,"[integration_and_customizations, scalability, ...",https://0xalpharush.github.io/,,0xalpharush@protonmail.com,
4,0xPhaze,https://api.github.com/users/0xPhaze,User,False,,,,lovethewired,41,1,80,13,2022-04-06T10:36:55Z,2024-10-01T14:20:13Z,"[basic_setup_onboarding, integration_and_custo...",lovethewired.github.io,,,


In [10]:
from typing import List, Dict
from pydantic import BaseModel


class Contact(BaseModel):
    """Display record for a single GitHub account.

    Every field is declared without a default, so all six must be supplied
    when a Contact is constructed.
    """

    url: str                    # HTML (browser) URL of the GitHub account.

    title: str                  # Display name: the account's name or username.

    caption: str                # Short blurb; this notebook fills it from the account bio.
                                # NOTE(review): bio is blank for some accounts in the table
                                # shown earlier; confirm a missing bio still validates as str.

    labels: List[str]           # Merged labels of the account (duplicates are currently kept).

    company_name_or_url: str    # Company / organization; filled from the account's 'company' value.
                                # NOTE(review): company is blank for some accounts; confirm as above.

    timeline: List[Dict]        # Timeline entries; this notebook passes account_data.documents.
                                # NOTE(review): confirm those documents validate as dicts.


def contact_from_account_data(account_data: AccountData) -> Contact:
    """Build the display-ready ``Contact`` for one GitHub account.

    Args:
        account_data: Account bundle. ``account_data.account.metadata`` supplies
            the profile fields and ``account_data.documents`` the timeline.

    Returns:
        A ``Contact`` whose title is ``"<login> | <name>"`` when a name is set
        (otherwise just the login). Missing or empty bio / company values
        become empty strings, and missing labels become an empty list.

    Raises:
        KeyError: if the account metadata has no ``'login'`` entry.
    """

    # presumably metadata is a plain dict (it is indexed and YAML-dumped
    # elsewhere in this notebook) - confirm it supports .get()
    metadata = account_data.account.metadata
    documents = account_data.documents

    login = metadata['login']

    # Prefer the browser-facing profile link. The account table shown earlier
    # in this notebook only lists the API 'url', so fall back to the public
    # profile address built from the login instead of raising KeyError.
    url = metadata.get('html_url') or f"https://github.com/{login}"

    ## main display title: login, plus the human-readable name when available
    title = login
    if metadata.get('name'):
        title = f"{title} | {metadata['name']}"

    ## main display caption
    # Contact declares caption/company as str, while GitHub profiles may leave
    # bio/company unset; normalise those to '' so validation does not fail.
    contact = Contact(
        url=url,
        title=title,
        caption=metadata.get('bio') or '',
        labels=list(metadata.get('labels') or []),
        company_name_or_url=metadata.get('company') or '',
        # NOTE(review): passed through unchanged; confirm these documents
        # validate against Contact.timeline (List[Dict]).
        timeline=documents,
    )

    return contact


def get_contacts_list_from_account_data_list(account_data_list: List[AccountData]) -> List[Contact]:
    """Parse contacts for UI display from a list of AccountData.

    Args:
        account_data_list: Accounts to convert; may be empty.

    Returns:
        One ``Contact`` per input account, in the same order. An empty input
        yields an empty list.
    """
    # Delegate the per-account conversion so both entry points stay consistent.
    return [contact_from_account_data(account_data) for account_data in account_data_list]





# Smoke test: build and log the Contact for the first labelled account only.
# The conversion goes through contact_from_account_data rather than repeating
# the Contact(...) construction inline, so there is a single place to maintain.
for account_data in labelled_account_data_list[:1]:
    contact = contact_from_account_data(account_data)
    logger.info(contact)


Unnamed: 0,login,labels
0,rappie,"[security_and_compliance, integration_and_cust..."
1,0xicingdeath,"[analytics_and_advanced_monitoring, integratio..."
2,aviggiano,"[integration_and_customizations, scalability, ..."
3,0xalpharush,"[integration_and_customizations, scalability, ..."
4,0xPhaze,"[basic_setup_onboarding, integration_and_custo..."
