# Sector Classificaiton LLM

In [53]:
import pandas as pd
import numpy as np
import awswrangler as wr
import boto3

pd.options.display.max_columns = None

In [2]:
# Load icms_issuer data
SCHEMA_NAME = "datalake-curated-production"

query_stry = """
SELECT * 
FROM icms_issuer 
where 1=1
"""

df = wr.athena.read_sql_query(sql=query_stry, database=SCHEMA_NAME)
df.head(5)

Unnamed: 0,id,subsector_id,crunchbase_url,description,override_valuation_dollars,year_founded,lifecycle_status,exclude_from_data_products,archived_at,total_funding_dollars,...,name,cap_iq_id,updated_at,sharex_id,sector,sub_sector,legal_address,phone_number,domicile_code,distribute_forge_price
0,5006,,https://www.crunchbase.com/organization/procyrion,"Procyrion is aiding the Aortix™ device, design...",,2005,,True,NaT,,...,Procyrion,,2024-04-02 22:54:11.927,100005205,Healthcare,Medical Devices,,,,False
1,2567,,https://www.crunchbase.com/organization/quanti...,Quantifind is a the developer of an AI platfor...,,2009,,True,NaT,,...,Quantifind,,2024-03-07 23:01:24.143,100001748,Enterprise Software,Data Intelligence,,,,False
2,3124,,https://www.crunchbase.com/organization/taskra...,TaskRabbit is the developer of a two-sided mar...,,2008,,False,2021-05-25 19:12:13.491,,...,TaskRabbit,,2024-01-18 00:00:00.000,10111,Consumer & Lifestyle,E-commerce,,,,False
3,3959,,crunchbase.com/organization/knoetic,"Knoetic, founded in 2020, is a technology comp...",,2017,,False,NaT,,...,Knoetic,0.0,2024-01-18 00:00:00.000,100004158,Enterprise Software,Human Resources,,,,False
4,2241,,,Olo is the developer of an on-demand ordering ...,,2005,,False,NaT,,...,Olo,,2024-01-18 00:00:00.000,100002141,Transportation,Delivery services,,,,False


In [3]:
df.to_csv("icms_issuers_data.csv", index=False)

## Sector and subsectors

##### 11 Sectors and 78 subsectors
#####  342 issuers missing sector classification

In [4]:
data = df[(df["sector"] != "Missing") | (df["sub_sector"] != "Missing")][
    ["sector", "sub_sector"]
].drop_duplicates()

sector_list = list(np.unique(data["sector"]))

for sector in sector_list:
    sub_sector_list = list(np.unique(data[data["sector"] == sector]["sub_sector"]))

    print(sector)
    for sub_sector in sub_sector_list:
        print("    ", sub_sector)

Consumer & Lifestyle
     Automotive
     Cannabis & Tobacco
     Clothing, Fashion, Beauty & Apparel
     Consumer & Lifestyle Enterprise Software
     Consumer Applications
     Consumer Electronics
     E-commerce
     E-commerce software
     Entertainment
     Fitness
     Gaming
     Home Products
     Media
     Other Consumer & Lifestyle
     Pets
     Restaurants & Coffee
     Social
     Travel & Hospitality
Education
     E-Learning
     Education software
     Other Education
Energy
     Clean Energy
     Energy Storage
     Energy efficiency
     Other Energy
Enterprise Software
     Business Operations
     Cloud/Networking Infrastructure
     Communication & Collaboration software
     Cybersecurity
     Data Intelligence
     Data Management/Storage
     Human Resources
     Other Enterprise Software
     Productivity
     Sales & Marketing / Adtech
     Software Development
FinTech
     Blockchain
     Blockchain software
     Capital Markets
     Digital Banking
     

## Templates

In [5]:
# sector descriptions generated via LLM Chat
sector_descriptions = """
Consumer & Lifestyle: This sector encompasses businesses that cater to individual needs and desires, offering products and services that enhance our daily lives, from apparel and entertainment to personal care and leisure activities.

Education: The education sector focuses on providing learning opportunities and knowledge dissemination through schools, universities, online platforms, and educational materials. It plays a crucial role in shaping individuals and the workforce.

Energy: This sector is responsible for powering our world, encompassing businesses involved in the exploration, production, distribution, and trading of various energy sources, like oil, gas, renewables, and nuclear power.

Enterprise Software:  The enterprise software sector provides businesses with the digital tools and applications they need to operate efficiently. This includes software for project management, customer relationship management, data analytics, and cybersecurity.

FinTech: The FinTech sector combines finance and technology, offering innovative solutions for financial services like mobile payments, online banking, cryptocurrency, and wealth management.

FoodTech: This sector applies technology to revolutionize the food industry, encompassing businesses involved in areas like online food delivery, farm automation, alternative proteins, and personalized nutrition.

Healthcare: The healthcare sector encompasses businesses and professionals involved in providing medical services, treatments, and products to diagnose, prevent, and treat illnesses, ensuring overall health and well-being.

Industrial: This sector focuses on large-scale production and manufacturing of goods, including machinery, heavy equipment, chemicals, and construction materials. It plays a vital role in building infrastructure and supporting other industries.

Real Estate: The real estate sector deals with the buying, selling, renting, and management of properties, including residential, commercial, and industrial spaces. It encompasses businesses like real estate agencies, developers, property managers, and landlords.

Technology Hardware: This sector focuses on the physical components of technological devices, encompassing businesses that design, manufacture, and sell computer hardware, mobile devices, networking equipment, and other electronic devices.

Transportation: The transportation sector is responsible for the movement of people and goods, including businesses involved in airlines, railways, shipping, trucking, and ride-sharing services. It keeps our world connected and facilitates global trade.
"""

In [6]:
# sector descriptions generated via LLM Chat
sector_descriptions_w_examples = """
Consumer & Lifestyle:  This sector encompasses businesses that cater to individual needs and desires, offering products and services that enhance our daily lives, from apparel and entertainment (Nike, Netflix) to personal care and leisure activities (Peloton, Ulta Beauty).

Education: The education sector focuses on providing learning opportunities and knowledge dissemination through schools (K-12), universities (Harvard, MIT), online platforms (Coursera, Udemy), and educational materials publishers (McGraw-Hill, Pearson).

Energy: This sector is responsible for powering our world, encompassing businesses involved in the exploration (ExxonMobil, Shell), production (SunPower, Tesla), distribution (National Grid, Consolidated Edison), and trading of various energy sources, like oil, gas, renewables, and nuclear power.

Enterprise Software: The enterprise software sector provides businesses with the digital tools and applications they need to operate efficiently. This includes software for project management (Asana, Trello), customer relationship management (Salesforce, Zendesk), data analytics (Tableau, Microsoft Power BI), and cybersecurity (CrowdStrike, Palo Alto Networks).

FinTech: The FinTech sector combines finance and technology, offering innovative solutions for financial services like mobile payments (Apple Pay, Venmo), online banking (Chime, Ally Bank), cryptocurrency (Coinbase, Gemini), and wealth management (Betterment, Wealthfront).

FoodTech: This sector applies technology to revolutionize the food industry, encompassing businesses involved in areas like online food delivery (DoorDash, Grubhub), farm automation (John Deere, DeLaval), alternative proteins (Impossible Foods, Beyond Meat), and personalized nutrition (Habit, Noom).

Healthcare: The healthcare sector encompasses businesses and professionals involved in providing medical services, treatments, and products to diagnose, prevent, and treat illnesses, ensuring overall health and well-being. This includes hospitals (Mayo Clinic, Cleveland Clinic), pharmaceutical companies (Pfizer, Merck), medical device manufacturers (Medtronic, Abbott Laboratories), and health insurance providers (Aetna, UnitedHealthcare).

Industrial: This sector focuses on large-scale production and manufacturing of goods, including machinery (Caterpillar, Siemens), heavy equipment (Komatsu, Hitachi), chemicals (Dow, BASF), and construction materials (Cemex, HeidelbergCement). It plays a vital role in building infrastructure and supporting other industries.

Real Estate: The real estate sector deals with the buying, selling, renting, and management of properties, including residential (Realtor.com, Zillow), commercial (Cushman & Wakefield, JLL), and industrial spaces (Prologis, Duke Realty). It encompasses businesses like real estate agencies, developers, property managers, and landlords.

Technology Hardware: This sector focuses on the physical components of technological devices, encompassing businesses that design, manufacture, and sell computer hardware (Apple, Dell), mobile devices (Samsung, Xiaomi), networking equipment (Cisco, Juniper Networks), and other electronic devices (Sony, LG).

Transportation: The transportation sector is responsible for the movement of people and goods, including businesses involved in airlines (American Airlines, Emirates), railways (Union Pacific, Deutsche Bahn), shipping (Maersk, CMA CGM), trucking (FedEx, UPS), and ride-sharing services (Uber, Lyft). It keeps our world connected and facilitates global trade.
"""

In [8]:
# sector descriptions generated via LLM Chat
sector_sub_sector_descriptions = """
Consumer & Lifestyle: This sector caters to the needs and desires of individual consumers, encompassing a wide range of products and services that enhance our daily lives.
- Automotive: Deals with the design, development, manufacturing, marketing, and sale of motor vehicles like cars, trucks, motorcycles, and related parts and services.
- Cannabis & Tobacco: Focuses on the production, distribution, and sale of cannabis and tobacco products, with increasing emphasis on regulations and alternative consumption methods.
- Clothing, Fashion, Beauty & Apparel: Encompasses businesses involved in the design, manufacturing, and retailing of clothing, footwear, accessories, cosmetics, and other personal care products.
- Consumer & Lifestyle Enterprise Software: Provides software solutions for businesses within the Consumer & Lifestyle sector, such as inventory management for fashion retailers or customer relationship management for restaurants.
- Consumer Applications: Refers to mobile apps and software applications designed for personal use, covering areas like entertainment, social media, productivity, and shopping.
- Consumer Electronics: Deals with electronic devices for personal use, including televisions, smartphones, tablets, gaming consoles, and home appliances.
- E-commerce: Focuses on the online buying and selling of goods and services, encompassing online retailers, marketplaces, and platforms facilitating digital transactions.
- E-commerce software: Provides tools and platforms for businesses to run online stores, manage inventory, process payments, and fulfill orders.
- Entertainment: Encompasses businesses involved in creating and distributing entertainment content, including movies, music, television shows, video games, and theme parks.
- Fitness: This sub-sector deals with products and services related to physical health and well-being, including fitness centers, wearable devices, fitness apps, and nutritional products.
- Gaming: Focuses on video games, consoles, esports, and the development and distribution of interactive entertainment software.
- Home Products: Encompasses businesses involved in the design, manufacturing, and retailing of furniture, appliances, housewares, and other products for the home environment.
- Media: Refers to various channels of communication, including traditional media like newspapers and television, and digital media like online publications and social media platforms.
- Other Consumer & Lifestyle: Includes businesses not easily categorized elsewhere, such as pet care products and services, adult entertainment, and luxury goods.
- Pets: Deals with products and services related to pet ownership, including food, toys, accessories, veterinary care, and pet grooming.
- Restaurants & Coffee: Encompasses businesses involved in preparing and serving food and beverages to consumers, including full-service restaurants, fast food chains, cafes, and coffee shops.
- Social: Refers to online platforms and applications that enable users to connect and share information with each other, including social media networks and dating apps.
- Travel & Hospitality: Focuses on businesses involved in the travel industry, including airlines, hotels, travel agencies, and online booking platforms.


Education: This sector focuses on providing learning opportunities and knowledge dissemination.
- E-Learning: Refers to the use of electronic technologies to deliver educational content and instruction, encompassing online courses, educational apps, and learning management systems.
- Education software: Provides tools and platforms for educational institutions and educators to manage learning activities, deliver course content, and assess student progress.
- Other Education: Includes businesses not easily categorized elsewhere, such as educational toys and games, educational consulting services, and standardized testing companies.


Energy: This sector deals with the exploration, production, distribution, and consumption of energy resources.
- Clean Energy: Focuses on renewable energy sources like solar, wind, geothermal, and hydropower, as well as energy efficiency technologies to reduce reliance on fossil fuels.
- Energy Storage: Deals with technologies that store energy for later use, playing a crucial role in integrating renewable energy sources into the grid.
- Energy efficiency: Promotes practices and technologies that reduce energy consumption without sacrificing functionality or comfort.
- Other Energy: Includes businesses involved in traditional energy sources like oil, gas, and nuclear power, as well as energy trading and transportation.


FinTech: This sector combines finance and technology to offer innovative solutions for financial services.
- Blockchain: A distributed ledger technology that facilitates secure, transparent, and tamper-proof record-keeping, with potential applications in various financial services.
- Blockchain software: Provides tools and platforms for businesses to build and integrate blockchain technology into their financial operations.
- Capital Markets: Focuses on the trading and financing of securities, including stocks, bonds, and derivatives, using electronic platforms and innovative technologies.
- Digital Banking: Refers to the use of online and mobile banking platforms to access and manage financial accounts, conduct transactions, and receive financial services.
- FinTech software: Encompasses software solutions specifically designed for financial institutions and businesses in the FinTech sector, such as fraud detection tools or robo-advisors.
- Insurance: Deals with businesses that provide financial protection against risks and unforeseen events, with FinTech offering new ways to underwrite, distribute, and manage insurance policies.
- Lending: Focuses on businesses that provide loans to individuals and businesses, with FinTech platforms streamlining the loan application process and offering alternative lending models.
- Other FinTech: Includes businesses involved in financial services not easily categorized elsewhere, such as crowdfunding platforms, wealth management startups using AI, or cryptocurrency exchanges.
- Payments: Deals with the electronic transfer of money between parties, with FinTech companies offering innovative payment solutions like mobile wallets and contactless payments.
- Personal Finance: Encompasses tools and services that help individuals manage their finances, including budgeting apps, investment platforms, and robo-advisors that provide automated financial advice.


Enterprise Software: This sector provides businesses with the software tools and applications they need to operate efficiently.
- Business Operations: Encompasses software solutions for various business functions, such as accounting (financial management software), human resources (applicant tracking systems, payroll software), project management (scheduling tools, collaboration platforms), and supply chain management (inventory management, logistics software).
- Cloud/Networking Infrastructure: Provides cloud computing services (rental access to computing resources), networking equipment (routers, switches, firewalls), and software for businesses to build and manage their IT infrastructure (data centers, networks).
- Communication & Collaboration software: Focuses on tools that facilitate communication and collaboration within and between organizations, including video conferencing platforms (Zoom, Microsoft Teams), instant messaging applications (Slack, Microsoft Teams chat), and project management platforms with collaboration features (Asana, Trello).
- Cybersecurity: Deals with software and services that protect computer systems and networks from cyberattacks and data breaches, including firewalls, intrusion detection systems, anti-malware software, and data encryption tools.
- Data Intelligence: Focuses on tools and technologies that help businesses gather, analyze, and interpret data to gain insights, improve decision-making, and optimize operations. This includes business intelligence (BI) platforms, data warehousing, and data analytics tools.
- Data Management/Storage: Provides solutions for storing, organizing, protecting, and accessing large volumes of data. This includes database management systems (DBMS), data lakes, and cloud storage solutions.
- Human Resources: Encompasses software solutions specifically designed for HR departments, such as applicant tracking systems (ATS) for recruiting, payroll software, performance management tools, and employee benefits administration platforms.
- Other Enterprise Software: Includes businesses involved in enterprise software not easily categorized elsewhere, such as content management systems (CMS) for managing websites, enterprise resource planning (ERP) systems that integrate various business functions, and low-code development platforms that allow businesses to build custom applications without extensive coding.
- Productivity: Focuses on software applications that help individuals and teams be more productive, including task management tools (To-Do lists), time management software, communication and collaboration platforms (mentioned previously), and document management systems.
- Sales & Marketing / Adtech: Encompasses software solutions used by sales and marketing teams to manage customer relationships, automate marketing campaigns, track leads, and personalize customer experiences. This includes customer relationship management (CRM) software, marketing automation platforms, and advertising technology (AdTech) tools.
- Software Development: Provides tools and platforms used by software developers to build, test, deploy, and maintain software applications. This includes integrated development environments (IDEs), version control systems (Git), code repositories (GitHub), and application lifecycle management (ALM) tools.


FoodTech: This sector applies technology to revolutionize the food industry, encompassing a wide range of areas.
- AgTech: Focuses on technology solutions for agricultural production, including precision farming techniques, automation tools, and data analytics platforms that optimize crop yields and resource management.
- Animal-free protein: Deals with companies developing and producing alternative protein sources that do not involve animal agriculture, such as plant-based meat substitutes or lab-grown meat.
- Food Products & Services: Encompasses businesses involved in the production, processing, distribution, and delivery of food products, with FoodTech companies using technology to improve efficiency, transparency, and access to food.
- Other FoodTech: Includes businesses in the food industry that leverage technology but don't fit neatly into other categories. This could involve areas like personalized nutrition services, food waste reduction solutions, or vertical farming technology.


Healthcare: This sector encompasses businesses and professionals involved in providing medical services, treatments, and products to diagnose, prevent, and treat illnesses, ensuring overall health and well-being.
- BioTech & Pharma: Focuses on the research, development, and production of biological drugs, vaccines, and therapies, as well as companies involved in genetic engineering and personalized medicine.
- Digital Health: Refers to the use of technology in healthcare delivery, including telehealth platforms for remote consultations, wearable devices that track health data, and mobile apps for medication management or mental health support.
- Healthcare providers & services: Encompasses hospitals, clinics, doctor's offices, and other facilities and professionals that provide medical care to patients, with some incorporating technology into their services.
- Healthcare software: Provides tools and platforms for managing healthcare operations, electronic health records (EHR) systems, medical billing and coding software, and patient engagement tools.
- Medical Devices: Deals with the development, manufacturing, and distribution of medical equipment used for diagnosis, treatment, and monitoring of patients, including implants, prosthetics, and diagnostic imaging devices.
- Other Healthcare: Includes businesses involved in healthcare that don't fit neatly into other categories. This could involve areas like medical tourism, health insurance companies, or medical research organizations.


Industrial: This sector focuses on large-scale production and manufacturing of goods, with a growing emphasis on automation and technology integration.
- Aerospace & Defense: Encompasses businesses involved in the design, development, manufacturing, and maintenance of aircraft, spacecraft, weapons systems, and other technologies used in the aerospace and defense industries.
- Augmented/Virtual Reality (AR/VR): Focuses on technologies that overlay digital information onto the real world (AR) or create fully immersive virtual environments (VR), with potential applications in industrial training, design, and maintenance.
- Construction: Deals with the planning, design, and building of infrastructure and structures, with Industrial businesses providing advanced materials, construction equipment, and software solutions for efficiency and safety.
- Industrial equipment: Focuses on the design, manufacture, and sale of machinery and equipment used in large-scale industrial processes, such as machine tools, robots, assembly lines, and heavy machinery used in mining or construction.
- Industrial software: Provides tools and platforms for managing industrial operations, including computer-aided design (CAD) software for product design, manufacturing execution systems (MES) for production planning and control, and enterprise resource planning (ERP) systems for overall business management.
- Logistics: Deals with the planning, organization, and execution of the movement and storage of goods, with Industrial companies providing technology solutions for optimizing supply chains, warehousing, and transportation.
- Manufacturing: Encompasses the large-scale production of goods using various processes, including assembly lines, fabrication, and chemical processing. Industrial businesses focus on improving efficiency, automation, and sustainability within manufacturing processes.
- Materials: Focuses on the development, production, and processing of raw materials and advanced materials used in various industries, including metals, plastics, composites, and new materials with unique properties.
- Other Industrial: Includes businesses involved in the industrial sector that don't fit neatly into other categories. This could involve areas like industrial waste management, industrial automation solutions beyond robotics, or specialized industrial services.
- Robotics: Deals with the design, development, and use of robots for various industrial applications, including automation of tasks, material handling, assembly line operations, and complex manufacturing processes.


Real Estate: This sector deals with the buying, selling, renting, and management of properties.
- Other Real Estate: Includes businesses involved in real estate that don't fit neatly into other categories. This could involve areas like real estate investment trusts (REITs), crowdfunding platforms for real estate investments, or property appraisal services.
- Real Estate Products & Services: Encompasses businesses that provide products and services related to real estate transactions, such as title companies, mortgage lenders, property management companies, and real estate listing platforms.
- Real estate software: Provides tools and platforms for managing real estate activities, including property listing platforms, property management software, and data analytics tools for market research and investment decisions.


Technology Hardware: This sector focuses on the physical components of technological devices.
- Communications equipment: Deals with the hardware used for telecommunications and data transmission, including routers, switches, fiber optic cables, and cellular network infrastructure.
- Computing hardware: Encompasses the physical components of computers, including central processing units (CPUs), graphics processing units (GPUs), memory (RAM), storage devices (hard drives, solid-state drives), and computer peripherals (monitors, keyboards, mice).
- Other Technology Hardware: Includes businesses involved in hardware for technology applications beyond computers and communication, such as sensors used in the Internet of Things (IoT), wearable device components, or specialized hardware for industrial applications.


Transportation: This sector focuses on the movement of people and goods.
- Autonomous Vehicles: Deals with the development and deployment of self-driving vehicles, including cars, trucks, and delivery drones, using various technologies like sensors, cameras, and artificial intelligence (AI).
- Delivery services: Encompasses businesses involved in the last-mile delivery of goods to consumers, including traditional delivery companies, on-demand delivery platforms, and drone delivery services.
- Electric Vehicles (EVs): Focuses on the design, production, and sale of vehicles powered by electricity, including electric cars, buses, trucks, and motorcycles, with a focus on sustainability and reducing reliance on fossil fuels.
- Other Transportation: Includes businesses involved in transportation that don't fit neatly into other categories. This could involve areas like maritime shipping, public transportation systems, or air traffic control technology companies.
- Ridesharing: Refers to companies that provide app-based ride-hailing services, connecting passengers with drivers for on-demand transportation.
- Transportation software: Provides tools and platforms for managing transportation operations, including logistics software for optimizing delivery routes, fleet management software for tracking vehicles and drivers, and passenger booking platforms for public transportation systems.
"""

In [9]:
template = """
You are a helpful assistant. Please classify companies into a sector and sub sector based on their company description. You are a helpful assistant. Please classify companies into a sector and sub sector based on their company description. The sectors and sub sectors, and company description are provided in the the XML tags below.

Provide only the final answer in a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```": {output_format}

<company_description>
{company_description}
</company_description>

<sectors>
{sectors}
</sectors>

<sub_sectors>
{sub_sectors}
</sub_sectors>

<final_answer>"""

template_descriptions = """
You are a helpful assistant. Please classify companies into a sector and sub sector based on their company description. The sectors and sub sectors, and company description are provided in the the XML tags below. We have add descriptions to the sector list to help you classify companies.

Provide only the final answer in a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```": {output_format}

<company_description>
{company_description}
</company_description>

<sectors>
{sectors}
</sectors>

<sub_sectors>
{sub_sectors}
</sub_sectors>

<final_answer>"""

template_combined = """
You are a helpful assistant. Please classify companies into a sector and sub sector based on their company description. The sectors and sub sectors, and company description are provided in the the XML tags below. We have add descriptions to both the sector and sub sector list to help you classify companies.

Provide only the final answer in a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```": {output_format}

<company_description>
{company_description}
</company_description>

<sector_sub_sector_information>
{sector_sub_sector_information}
</sector_sub_sector_information>

<final_answer>"""

output_format = """
{
    "sector": string,
    "sub_sector": string
}
"""

## LLMs

In [10]:
import boto3
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import json
import pandas as pd
import numpy as np
from datetime import datetime
import time
import re

In [45]:
df_llm = (
    df[
        ~(
            (pd.isnull(df["description"]))
            | (df["sector"] == "Missing")
            | (df["sub_sector"] == "Missing")
        )
    ][["slug", "name", "description", "sector", "sub_sector"]]
    .reset_index(drop=True)
    .copy()
)
print(df_llm.shape)
df_llm.head()

(4486, 5)


Unnamed: 0,slug,name,description,sector,sub_sector
0,procyrion,Procyrion,"Procyrion is aiding the Aortix™ device, design...",Healthcare,Medical Devices
1,quantifind,Quantifind,Quantifind is a the developer of an AI platfor...,Enterprise Software,Data Intelligence
2,taskrabbit,TaskRabbit,TaskRabbit is the developer of a two-sided mar...,Consumer & Lifestyle,E-commerce
3,knoetic,Knoetic,"Knoetic, founded in 2020, is a technology comp...",Enterprise Software,Human Resources
4,olo,Olo,Olo is the developer of an on-demand ordering ...,Transportation,Delivery services


In [48]:
pd.isnull(df_llm).sum()

slug           0
name           0
description    0
sector         0
sub_sector     0
dtype: int64

In [60]:
# Define a function to remove trailing commas from JSON-like strings
def remove_trailing_commas(json_like_str):
    # Regular expression to find trailing commas before a closing bracket or brace
    pattern = re.compile(r",\s*([}\]])")
    # Replace instances found by the pattern with the captured group without the comma
    return pattern.sub(r"\1", json_like_str)


def extract_json_obj(input_str):
    start_index = input_str.find("`json\n") + len("`json\n")
    end_index = input_str.rfind("```")
    json_string = input_str[start_index:end_index]
    cleaned_json_string = remove_trailing_commas(json_string)

    try:
        json_obj = json.loads(cleaned_json_string)
    except:
        json_obj = json.loads(cleaned_json_string + "}")

    return json_obj

In [61]:
bedrock = boto3.client("bedrock-runtime", region_name="us-west-2")


def generate_llm_output(
    company_description, template, input_dict, model_id="anthropic.claude-v2"
):

    input_dict["company_description"] = company_description

    input_list = list(input_dict.keys())
    prompt = PromptTemplate(
        template=template,
        input_variables=input_list,
    )

    bedrock_llm = Bedrock(client=bedrock, model_id=model_id)

    llm_chain = LLMChain(
        prompt=prompt,
        llm=bedrock_llm,
        llm_kwargs={"max_tokens_to_sample": 5000, "temperature": 0},
    )
    output = llm_chain.run(input_dict)

    return output

In [62]:
def run_sector_class(df, template, input_dict, model_id="anthropic.claude-v2"):

    output_dict = {}

    count = 0
    start_time = time.time()
    for idx, row in df.iterrows():

        company_description = row["description"]
        output = generate_llm_output(
            company_description, template, input_dict, model_id="anthropic.claude-v2"
        )
        try:
            output_obj = extract_json_obj(output)

            output_dict[row["slug"]] = {
                "name": row["name"],
                "description": row["description"],
                "sector": row["sector"],
                "sub_sector": row["sub_sector"],
                "llm_sector": output_obj["sector"],
                "llm_sub_sector": output_obj["sub_sector"],
            }
        except:
            print(f"Error occurred (index: {idx}):", output)
            pass
        
        count += 1
        if count % 100 == 0:
            end_time = time.time()
            print(f"{count} | {np.round(end_time - start_time,3)}")
            start_time = end_time

    return output_dict

### Sample Issuers
- Generating a sample of issuers to test different templates and prompts
- Time to run across entire dataset would take a long time, when a random sampling likely will yield similar results

In [63]:
df_sample = df_llm.sample(n=250, random_state=23)

In [64]:
# Template and prompt contains only a list of the sectors and subsectors to choose from
sector_list = list(np.unique(df_llm["sector"]))
sub_sector_list = list(np.unique(df_llm["sub_sector"]))

input_dict = {
    "sectors": sector_list,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_sample, template, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 145.326
200 | 143.875
250
Sector accuracy: 0.596
Sub sector accuracy: 0.436


In [65]:
# Template and prompt contains sector descriptions and a list of subsectors to choose from

input_dict = {
    "sectors": sector_descriptions,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_sample, template_descriptions, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 177.119
200 | 178.717
250
Sector accuracy: 0.712
Sub sector accuracy: 0.472


In [66]:
# Template and prompt contains sector descriptions with examples and a list of subsectors to choose from

input_dict = {
    "sectors": sector_descriptions_w_examples,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_sample, template_descriptions, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 231.628
200 | 235.806
250
Sector accuracy: 0.692
Sub sector accuracy: 0.468


In [67]:
# Template and prompt contains sector and subsector descriptions
input_dict = {
    "sector_sub_sector_information": sector_sub_sector_descriptions,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_sample, template_combined, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 525.698
200 | 476.281
250
Sector accuracy: 0.74
Sub sector accuracy: 0.444


### Priority 1 Issuers
- Testing different templates and prompts for priority 1 issuers

In [72]:
imp_issuers = pd.read_csv('top_tier_issuer_list.csv')
df_ii = df_llm.merge(imp_issuers, how='inner', left_on='slug', right_on='issuerKey')

In [73]:
# Template and prompt contains only a list of the sectors and subsectors to choose from
sector_list = list(np.unique(df_llm["sector"]))
sub_sector_list = list(np.unique(df_llm["sub_sector"]))

input_dict = {
    "sectors": sector_list,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_ii, template, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 166.242
200 | 174.454
279
Sector accuracy: 0.5591397849462365
Sub sector accuracy: 0.5017921146953405


In [74]:
# Template and prompt contains sector descriptions and a list of subsectors to choose from
input_dict = {
    "sectors": sector_descriptions,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_ii, template_descriptions, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 185.579
200 | 186.062
279
Sector accuracy: 0.6881720430107527
Sub sector accuracy: 0.4444444444444444


In [75]:
# Template and prompt contains sector descriptions with examples and a list of subsectors to choose from
input_dict = {
    "sectors": sector_descriptions_w_examples,
    "sub_sectors": sub_sector_list,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_ii, template_descriptions, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

100 | 240.52
200 | 235.003
279
Sector accuracy: 0.7132616487455197
Sub sector accuracy: 0.4838709677419355


In [76]:
# Template and prompt contains sector and subsector descriptions
input_dict = {
    "sector_sub_sector_information": sector_sub_sector_descriptions,
    "output_format": output_format,
}
output_dict = run_sector_class(
    df_ii, template_combined, input_dict, model_id="anthropic.claude-v2"
)

dfo = (
    pd.DataFrame.from_dict(output_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "slug"})
)
dfo["sector_match"] = dfo["sector"] == dfo["llm_sector"]
dfo["sub_sector_match"] = dfo["sub_sector"] == dfo["llm_sub_sector"]

print(len(dfo))
print(f"Sector accuracy: {np.mean(dfo['sector_match'])}")
print(f"Sub sector accuracy: {np.mean(dfo['sub_sector_match'])}")

Error occurred (index: 8):  Based on the company description, I would classify WHOOP in the following sector and sub-sector:

```json
{
  "sector": "Consumer & Lifestyle", 
  "sub_sector": "Fitness"
}
```

The description indicates WHOOP develops wearable devices and software to monitor health metrics like sleep and recovery to optimize performance. This aligns with the Fitness sub-sector under Consumer & Lifestyle, which covers products and services related to physical health and well-being.
```
Error occurred (index: 62):  Based on the company description, I would classify Checkr in the following sector and sub-sector:

```json
{
  "sector": "Enterprise Software", 
  "sub_sector": "Human Resources"
}
```

Checkr provides background check services and related software tools for employers evaluating job candidates. This aligns with the Enterprise Software sector and specifically the Human Resources sub-sector, which encompasses software solutions designed for HR departments.
```
100 | 