In [14]:
import json
from dataclasses import dataclass, asdict
from domainer.config import DATA_DIR
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
@dataclass
class Company:
    # TODO: add more validations to the domain
    domain: str
    description: str

companies = []

In [3]:
# crawled with crawlers/antler.ipynb
with open(DATA_DIR / "antler_final.json") as f:
    antler_data = json.load(f)

antler_data[0]

{'name': '12iD',
 'website': 'https://www.12iD.com',
 'location': ['Sweden'],
 'sector': ['Fintech'],
 'description': ['12iD provides a solution to issue digital identities and to identify users remotely and uniquely.']}

In [4]:
# cleaning the data to only leave the top and second level domain as well as 
# removing any instances of second level domain from the description

def website_to_levels(website: str) -> tuple[str, str]:
    lvl2, lvl1 = website.split(".")[-2:]
    lvl2 = lvl2.replace("https://", "").replace("http://", "")
    lvl1 = lvl1.replace("/", "")
    return lvl1, lvl2

def clean_description(description: str, domain_level_2: str) -> str:
    # If the company already has a name, then using that as the domain name is probably best.
    # description = description.replace(domain_level_2, "")
    # description = re.sub(re.escape(domain_level_2), "", description, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', description).strip()
    

def clean_antler(data: dict[str, str]) -> Company:
    lvl1, lvl2 = website_to_levels(data["website"])
    description = clean_description(data["description"][0], lvl2)
    domain_name = f"{lvl2}.{lvl1}"
    return Company(domain_name, description)

for data in antler_data:
    companies.append(clean_antler(data))

In [5]:
companies[:10]

[Company(domain='12iD.com', description='12iD provides a solution to issue digital identities and to identify users remotely and uniquely.'),
 Company(domain='913.ai', description='A unified platform to build, test, and integrate custom AI agents to handle all your workflows and knowledge.'),
 Company(domain='ai-fluence.com', description="Africa's 1st AI-Driven Influencer Marketing Platform"),
 Company(domain='aikacollectible.com', description='Aika collectible allows artists, athletes and creators to fuel their growth by tapping into their communities, and for the fansâ\x80\x94to invest in their favorites and be part of their success story.'),
 Company(domain='myalt.shop', description='Meta search engine for Fashion.'),
 Company(domain='arxsky.com', description='A recoverable self-custody wallet that sets up custom recovery schemes using a unique social recovery protocol based on threshold cryptography.'),
 Company(domain='avay.se', description='Investment platform for vacation rental

In [6]:
# crawled with crawlers/yc.ipynb
with open(DATA_DIR / "yc_final.json") as f:
    yc_data = json.load(f)

yc_data[0]

{'one_liner': 'VMware for GPUs',
 'website': 'https://outerport.com',
 'long_description': 'Outerport lets companies use their GPUs more efficiently by making it easy for them to be swapped from task to task. Just like how VMWare made it easy to put multiple users on a single server machine, we make it easy to put multiple AI models on a single GPU. Rather than having separate sets of GPUs for each task, you can buy less GPUs and make better use of it.\r\n\r\nHot swap foundation model weights instantly, minimize cold starts, scale horizontally, maintain version control, secure your models on a central registry, perform A/B tests, and save 40% on GPU costs. '}

In [7]:
def clean_yc(data: dict[str, str]) -> list[Company]:
    if not data["website"]:
        return []
    
    try:
        lvl1, lvl2 = website_to_levels(data["website"])
    except Exception:
        return []
    
    out = []
    domain_name = f"{lvl2}.{lvl1}"

    if desc:=data["long_description"]:
        long_description = clean_description(desc, lvl2)
        out.append(Company(domain_name, long_description))

    if desc:=data["one_liner"]:
        short_description = clean_description(desc, lvl2)
        out.append(Company(domain_name, short_description))

    return out

for data in yc_data:
    companies.extend(clean_yc(data))

len(companies)

10279

In [8]:
companies[-10:]

[Company(domain='snipshot.com', description='We sold Snipshot to Ansa in 2013.'),
 Company(domain='reddit.com', description='Founded by Steve Huffman and Alexis Ohanian in 2005, Reddit is an online community where users submit, vote, and comment on content, news, and discussions. Nicknamed "the front page of the internet,"\u200b Reddit is one of the top ten sites in the United States (source: Alexa), with hundreds of millions of users each month on desktop, mobile web, and our official Android/iOS apps. Interested in joining our growing team? Check out about.reddit.com/careers'),
 Company(domain='reddit.com', description='The frontpage of the internet.'),
 Company(domain='kiko.com', description='Kiko is a web calendar for anyone who wants to keep and share a calendar online. Kiko excels at three main things: * Universal access: Kiko integrates with your mobile phone and AIM and lets you access your calendar from any computer. * Share your calendar: Invite anyone to events (not just oth

In [9]:
# crawled with crawlers/ef.ipynb
with open(DATA_DIR / "ef_final.json") as f:
    ef_data = json.load(f)

ef_data[0]

{'name': 'CodeAnt AI',
 'ef_link': 'https://www.joinef.com/companies/codeant-ai/',
 'tags': ['https://www.joinef.com/industry/ai/',
  'https://www.joinef.com/industry/developer-tools/'],
 'tagline': 'AI to auto-fix bad code and security vulnerabilities.',
 'website': ['https://www.codeant.ai/'],
 'description': ['CodeAnt AI has developed a',
  'line-by-line code reviewer',
  'that saves developers time, and targets security vulnerabilities.',
  'Their AI Code Reviewer detects and auto-fixes code quality issues, highlighting best practices and anti-pattern violations for',
  'over 30 languages.',
  'Using an integration from developers’ IDEs to Pull Requests, CodeAnt AI’s code-quality tool acts as a',
  'clean code enforcer,',
  'ensuring bad code is never pushed again.']}

In [10]:
def clean_ef(data: dict[str, str]) -> list[Company]:
    if not data["website"]:
        return []
    
    lvl1, lvl2 = website_to_levels(data["website"][0])
    long_description = clean_description(" ".join(data["description"]), lvl2)
    short_description = clean_description(data["tagline"], lvl2)
    domain_name = f"{lvl2}.{lvl1}"
    return [Company(domain_name, long_description), Company(domain_name, short_description)]

for data in ef_data:
    companies.extend(clean_ef(data))

len(companies)

11083

In [11]:
companies[-10:]

[Company(domain='converge.io', description='Converge optimizes concrete operations and materials for the site, factory, and ready mixer by digitizing the physical world of construction with sensors and AI technologies. Construction is one of the world’s most crucial industries, but it’s incredibly inefficient and unsustainable. It accounts for about 8% of global CO 2 emissions, partly due to its heavy reliance on concrete. Low levels of digitization throughout the industry have created massive inefficiencies, contributing to safety issues, errors, and wasted resources. Converge optimizes concrete operations. Using sensors and intelligence platforms, they increase efficiencies across the construction site. These efficiencies include faster build times, reduced material usage, lower embodied carbon, and increased worker safety. They bridge the gap between sustainability and business viability by enabling customers to save money and the planet at the same time.'),
 Company(domain='converg

In [None]:
# TODO: ad linkedin dataset to the data

In [16]:
# splitting the data to train, test, val and saving to csv
data = [asdict(company) for company in companies]
df = pd.DataFrame(data)

train_data, temp_data = train_test_split(df, test_size=0.2)
val_data, test_data = train_test_split(temp_data, test_size=0.5)

train_data.to_csv(DATA_DIR / "train.csv", index=False)
test_data.to_csv(DATA_DIR / "test.csv", index=False)
val_data.to_csv(DATA_DIR / "val.csv", index=False)