In [1]:
# reload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
from typing import List, Dict
from tools.logger import _logger

logger = _logger('main_notebook')

Loaded 12 environment variables from .env


In [3]:
# inputs


VERBOSE = True

INPUT = "A security auditing service for organizations utilizing smart contracts."

In [4]:
from search.inference import StackRequirements, identify_requirements


stack_requirements: StackRequirements = identify_requirements(
        INPUT,
        verbose=VERBOSE
)

framework_list = stack_requirements.frameworks

search_terms = ', '.join(framework_list)

[32m[INFO:url_document_loader:] scrapingbee client params: {'render_js': 'True'}[0m




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo identify the specific stack requirements for a security auditing service focusing on smart contracts, we need to consider the technologies and frameworks that are commonly used in the development, deployment, and auditing of smart contracts. Given the emphasis on popular open source frameworks, we'll focus on those that are widely recognized and used within the blockchain and smart contract development communities.

Thought: The product description points towards a need for technologies related to blockchain, smart contracts, and security auditing. Given the focus on smart contracts, Ethereum is likely a primary blockchain of interest, but other blockchains supporting smart contracts should also be considered. Tools and frameworks for analyzing and auditing the code of smart contracts for vulnerabilities are essential. Additionally, considering the open-source preference, we should look for tools that are widely adopted by

In [7]:
logger.info(f"Search terms (frameworks): {search_terms}")

[32m[INFO:main_notebook:] Search terms (frameworks): Solidity, Truffle Suite, Mythril, Slither, Echidna, OpenZeppelin, Web3.js[0m


In [8]:
from loader.inference import AccountData, load_account_data_list


account_data_list: List[AccountData] = load_account_data_list(
    framework_list, verbose=VERBOSE
)

[32m[INFO:github_loader:] Searching for repositories for ['Solidity', 'Truffle Suite', 'Mythril', 'Slither', 'Echidna', 'OpenZeppelin', 'Web3.js'][0m


[32m[INFO:github_loader:] Found 30 repositories for Solidity[0m
[32m[INFO:github_loader:] Found 30 repositories for Truffle Suite[0m
[32m[INFO:github_loader:] Found 30 repositories for Mythril[0m
[32m[INFO:github_loader:] Found 30 repositories for Slither[0m
[32m[INFO:github_loader:] Found 30 repositories for Echidna[0m
[32m[INFO:github_loader:] Found 30 repositories for OpenZeppelin[0m
[32m[INFO:github_loader:] Found 30 repositories for Web3.js[0m
[32m[INFO:github_loader:] 210 total repositories.[0m
[32m[INFO:github_loader:] 209 unique repositories.[0m
[32m[INFO:github_loader:] 200 repositories with issues.[0m
[32m[INFO:github_loader:] 138 repositories with open issues.[0m
[32m[INFO:github_loader:] 72 repositories with 100+ stars.[0m
[32m[INFO:github_loader:] Found 72 repositories for ['Solidity', 'Truffle Suite', 'Mythril', 'Slither', 'Echidna', 'OpenZeppelin', 'Web3.js'][0m
[32m[INFO:github_loader:] Loaded 30 contributors from crytic/echidna[0m
[32m[INFO

In [9]:
logger.info(f"Loaded {len(account_data_list)} accounts")

[32m[INFO:main_notebook:] Loaded 41 accounts[0m


In [10]:
import yaml
from copy import deepcopy
from labelling.inference import agenerate_labels_for_github_issue_list


def annotate_account_data_labels(account_data_list: List[AccountData]) -> List[AccountData]:
    """
    >> Adds labels to account_data.documents.metadata
    >> labels to account_data.account.metadata
    """


    # deep copy account_data_list

    account_data_list = deepcopy(account_data_list)


    # unpack issues (merge back on username / login (creator))

    github_issues_list = []
    for account_data in account_data_list:
        issues = account_data.documents
        github_issues_list.extend(issues)


    # get contents and generate labels

    contents = [issue.metadata['title'] for issue in github_issues_list] # first 300...?

    github_issue_labels_list = agenerate_labels_for_github_issue_list(contents)


    # merge back into github_issues_list

    for i, issue in enumerate(github_issues_list):
        issue.metadata['labels'] = github_issue_labels_list[i]


    # update documents in github accounts again

    for account in account_data_list:

        # technical update issue only.

        login = account.account.metadata['login']

        issues = [issue for issue in github_issues_list if issue.metadata['creator'] == login]

        account.documents = issues


        # update global merged labels (count?...score...?)

        labels = [label for issue in issues for label in issue.metadata['labels']]

        account.account.metadata['labels'] = labels

        account.account.page_content = yaml.dump(account.account.metadata, default_flow_style=False)

    return account_data_list


labelled_account_data_list = annotate_account_data_labels(account_data_list)



labelling (gpt3) (num_proc=16):   0%|          | 0/95 [00:00<?, ? examples/s]

In [21]:
import pandas as pd
accounts = [a.account.metadata for a in labelled_account_data_list]
accounts_df = pd.DataFrame(accounts) #[['login', 'labels']]   #...caption...
accounts_df.head()

Unnamed: 0,login,url,type,site_admin,name,company,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at,labels,blog,location,email,hireable
0,rappie,https://api.github.com/users/rappie,User,False,Rappie,@perimetersec,Smart contract fuzzing specialist,rappie_eth,35,1,34,80,2012-02-12T13:16:43Z,2024-08-30T15:51:43Z,"[security_and_compliance, integration_and_cust...",,,,
1,0xicingdeath,https://api.github.com/users/0xicingdeath,User,False,0xicingdeath,,blockchain nerd \r\n,0xicingdeath,39,9,122,57,2016-01-16T19:18:52Z,2024-07-22T17:28:28Z,"[analytics_and_advanced_monitoring, integratio...",,,,
2,aviggiano,https://api.github.com/users/aviggiano,User,False,Antonio Viggiano,,I help protocols improve their invariant tests,agfviggiano,149,4,198,102,2012-12-12T22:15:52Z,2024-09-17T15:11:46Z,"[integration_and_customizations, scalability, ...",https://allthingsfuzzy.substack.com/,"São Paulo, Brazil",,
3,0xalpharush,https://api.github.com/users/0xalpharush,User,False,alpharush,@asymmetric-research,,0xalpharush,62,11,504,171,2021-07-13T15:45:17Z,2024-09-30T23:04:52Z,"[integration_and_customizations, scalability, ...",https://0xalpharush.github.io/,,0xalpharush@protonmail.com,
4,0xPhaze,https://api.github.com/users/0xPhaze,User,False,,,,lovethewired,41,1,80,13,2022-04-06T10:36:55Z,2024-10-01T14:20:13Z,"[basic_setup_onboarding, integration_and_custo...",lovethewired.github.io,,,


In [29]:
from typing import List, Dict
from pydantic import BaseModel
from collections import Counter


from processor.inference import generate_caption_for_account

from labelling.inference import compute_label_intent_score, normalize_list_of_numbers



class Contact(BaseModel):

    url: str = None                    # html url of github account.
    title: str = None                  # name or username (display name)
    labels: List[str] = None           # merged labels (duplicated... or top by count?)
    caption: str = None                # bio / concerns / activity / why is he a good sale.
    confidence: float = 0.0     # confidence score (from labels...)

    # links: List[Dict] = []          # links to company / socials / other 
    # timeline: List[Dict] = []
    # company_name_or_url: str    # company / organization (object???...)



def contact_from_account_data(account_data: AccountData) -> Contact:
    """Parse contact for display from AccountData for UI display."""

    account = account_data.account          # issues
    documents = account_data.documents      # issues... to timeline?

    ## github profile url
    url = f"https://github.com/{account.metadata['login']}"


    ## company links_and_socials (twitter_username, ) and other links??? all in one??? with icon??!!


    ## main display title
    title = account.metadata['login']
    if account.metadata.get('name', None):
        title = f"{title} | {account.metadata['name']}"
    

    ## main display caption (or llm generate if none?)
    caption = account.metadata.get('bio', 'Developer') # location...
    if account.metadata.get('company', None):
        caption = f"{account.metadata['company']} | {caption}"
    if account.metadata.get('location', None):
        caption = f"{caption} | {account.metadata['location']}"
    ## slow llm fallback..
    # if not caption:
    #     caption = generate_caption_for_account(
    #         account.page_content,
    #         product_description=INPUT
    #     )
    

    ## get company name or url (email / bio / company / blog)

    ## labels (top 3 max)
    labels = account.metadata.get('labels', [])
    labels = [l for l in labels if 'other' not in l]
    top_k_labels = [l for l, _ in Counter(labels).most_common(3)]


    ### confidence score
    confidence = compute_label_intent_score(labels)


    contact = Contact(
        url=url,

        title=title,

        caption=caption,

        labels=top_k_labels,

        confidence=confidence,

    )

    return contact



### Format, Normalize, Sort


# format contacts...

contacts = [contact_from_account_data(account_data) for account_data in labelled_account_data_list]


# normalize scores...

normalized_confidence_scores = normalize_list_of_numbers([c.confidence for c in contacts])

for c, score in zip(contacts, normalized_confidence_scores):
    c.confidence = min(0.95, score) # cap score at 0.95 + some std??? egh


# sort by score...

contacts = sorted(contacts, key=lambda c: c.confidence, reverse=True) # should never be 1


pd.DataFrame([c.model_dump() for c in contacts])

Unnamed: 0,url,title,caption,labels,confidence
0,https://github.com/rappie,rappie | Rappie,@perimetersec | Smart contract fuzzing specia...,"[integration_and_customizations, scalability, ...",0.95
1,https://github.com/0xicingdeath,0xicingdeath | 0xicingdeath,blockchain nerd \r\n,"[integration_and_customizations, analytics_and...",0.95
2,https://github.com/aviggiano,aviggiano | Antonio Viggiano,I help protocols improve their invariant tests...,"[integration_and_customizations, scalability, ...",0.95
3,https://github.com/OnlyOneJMJQ,OnlyOneJMJQ | Josh Quintal,"ConsenSys | Developer | Chicago, IL","[integration_and_customizations, scalability, ...",0.95
4,https://github.com/emilyJLin95,emilyJLin95,Developer,"[integration_and_customizations, scalability]",0.95
5,https://github.com/webthethird,webthethird | William E Bodell III,Smart contract developer and security research...,"[integration_and_customizations, scalability]",0.95
6,https://github.com/wbt,wbt,Blessed/cursed to frequently find bugs in soft...,"[integration_and_customizations, scalability]",0.95
7,https://github.com/shaka0x,shaka0x | Shaka,Developer,"[integration_and_customizations, scalability]",0.95
8,https://github.com/DanielGelfand,DanielGelfand | Daniel Gelfand,Developer,"[integration_and_customizations, analytics_and...",0.95
9,https://github.com/ShaneDuncan602,ShaneDuncan602 | Shane Duncan,Thrackle | Senior Software Engineer with over ...,"[integration_and_customizations, scalability]",0.95
