In [1]:
import pandas as pd
import os
import json
import asyncio
import re

from openai import OpenAI
from openai import AsyncOpenAI

# Disable pandas' chained-assignment warning and show every column when a frame is displayed.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# Read the API key from the environment so the secret never has to be pasted into
# (and saved with) the notebook. Falls back to the previous empty-string default.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")

# Labeling

In [2]:
# One (cnpj, nome, cnae) record per company. Keeping the three fields side by side
# makes it impossible for the columns to drift out of alignment.
# All values are strings so leading zeros in CNPJs are preserved.
empresas = [
    ("38220040000111", "LASTLINK", "6203100"),
    ("41160388000175", "THE MEMBERS", "7490104"),
    ("31975959000176", "GREENN PAGAMENTOS E TECNOLOGIA LTDA", "8599604"),
    ("10497494000115", "PROESC.COM", "6202300"),
    ("11615383000129", "ESCOLAWEB", "6202300"),
    ("38008510000188", "ISAAC", "8291100"),
    ("24582955000162", "CLIPESCOLA", "6201501"),
    ("32390384000192", "PROVI", "6619302"),
    ("26228525000172", "SUNO RESEARCH", "6319400"),
    ("09573540000139", "EDUZZ", "8599699"),
    ("36062381000180", "CHATPAY TECNOLOGIA LTDA", "6203100"),
    ("37365788000140", "PROFITFY.ME", "9511800"),
    ("52835814000140", "PAGMEX SOLUCOES DE PAGAMENTOS LTDA", "8599604"),
    ("55803947000122", "ROYALTY PAY LTDA", "8211300"),
    ("55352047000106", "P2PAY PAGAMENTOS LTDA", "6619302"),
    ("52965952000143", "DIGITO PAY TECNOLOGIA EM PAGAMENTOS LTDA", "6203100"),
    ("34266553000102", "ALICE OPERADORA LTDA", "6550200"),
    ("21892103000183", "BIOLOGIX SISTEMAS LTDA", "6202300"),
    ("29983313000199", "PSICO GESTOR TECNOLOGIA LTDA", "6201501"),
    ("34251650000121", "UOON", "7420001"),
]

# Split the records back into the three column lists the rest of the notebook knows.
cnpjs, nomes, cnaes = (list(coluna) for coluna in zip(*empresas))

df = pd.DataFrame({
    "cnpj": cnpjs,
    "nome": nomes,
    "cnae": cnaes,
})

df.head(25)

Unnamed: 0,cnpj,nome,cnae
0,38220040000111,LASTLINK,6203100
1,41160388000175,THE MEMBERS,7490104
2,31975959000176,GREENN PAGAMENTOS E TECNOLOGIA LTDA,8599604
3,10497494000115,PROESC.COM,6202300
4,11615383000129,ESCOLAWEB,6202300
5,38008510000188,ISAAC,8291100
6,24582955000162,CLIPESCOLA,6201501
7,32390384000192,PROVI,6619302
8,26228525000172,SUNO RESEARCH,6319400
9,9573540000139,EDUZZ,8599699


In [3]:
# Preview the first five companies.
df.head(n=5)

Unnamed: 0,cnpj,nome,cnae
0,38220040000111,LASTLINK,6203100
1,41160388000175,THE MEMBERS,7490104
2,31975959000176,GREENN PAGAMENTOS E TECNOLOGIA LTDA,8599604
3,10497494000115,PROESC.COM,6202300
4,11615383000129,ESCOLAWEB,6202300


In [4]:
# Count missing values per column.
df.isna().sum()

cnpj    0
nome    0
cnae    0
dtype: int64

In [None]:
# Number of rows per classification batch (passed as `batch_size` to `annotate` in the run cell).
BATCH_SIZE = 50
# NOTE(review): not referenced in the visible cells — presumably a row cap for test runs; confirm or remove.
TEST_LEN = 250
# File that accumulates classified batches; it is also read back at start-up to skip CNPJs already processed.
BACKUP_PATH = "../data/sample_classified.csv"

class Classifier:
    """Labels companies with a segment and tech niche by prompting an OpenAI chat model.

    Each input row is expected to expose ``cnpj``, ``nome`` and ``cnae`` (missing
    values are rendered as "Não informado" in the prompt). Every successfully
    parsed answer is a dict with exactly the keys listed in ``OUTPUT_KEYS``.
    """

    # Request settings shared by the synchronous and asynchronous code paths.
    MODEL = "gpt-4o-mini"
    TEMPERATURE = 0
    MAX_TOKENS = 100

    # Keys of a parsed record. They match what the prompt asks for and what the
    # downstream cells read ("cnpj" for de-duplication/merging, the other two for filtering).
    OUTPUT_KEYS = ("cnpj", "Segmento", "Nicho Tech")

    def __init__(self, api_key):
        """Create the synchronous OpenAI client used by `classify`."""
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _safe_get(value):
        """Return `value`, or "Não informado" when it is missing.

        Missing means None, an empty string, or a pandas/NumPy null (NaN, NaT,
        pd.NA) — the latter is how absent values appear in DataFrame rows.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "Não informado"
        if isinstance(value, str) and value == "":
            return "Não informado"
        return value

    @staticmethod
    def _normalize_key(key):
        """Lower-case a JSON key and collapse spaces/underscores for tolerant matching."""
        return re.sub(r"[\s_]+", " ", str(key)).strip().lower()

    @staticmethod
    def _output_parser(content):
        """Extract the classification record from the model's raw text answer.

        The first ``{...}`` span (greedy, across lines) is parsed as JSON and
        its keys are matched case-insensitively, treating underscores and spaces
        alike, against ``OUTPUT_KEYS``.

        Returns:
            dict | None: a dict with exactly the keys in ``OUTPUT_KEYS`` (absent
            keys map to None), or None when no valid JSON object is found.
        """
        match = re.search(r'\{.*\}', content or "", re.DOTALL)
        if not match:
            print("No JSON found")
            return None

        try:
            payload = json.loads(match.group(0))
        except json.JSONDecodeError as exc:
            # Truncated or chatty answers must not abort a whole batch.
            print(f"Invalid JSON in model response: {exc}")
            return None

        normalized = {Classifier._normalize_key(k): v for k, v in payload.items()}
        return {
            key: normalized.get(Classifier._normalize_key(key))
            for key in Classifier.OUTPUT_KEYS
        }

    def prompt(self, row: pd.Series):
        """Build the Portuguese classification prompt for one company row.

        NOTE(review): the rules paragraph tells the model to consult the company's
        website HTML and LinkedIn description/slogan, but only CNPJ, name and CNAE
        are actually included in the prompt — confirm whether those fields should
        be added or the rule reworded.
        """
        return f""" 
        Analise as informações abaixo sobre uma empresa e classifique-a em um dos seguintes subsegmentos segundo a tabela:

        | Segmento 	|  Nicho Tech 	|
        |:--------:	|:-----------:	|
        |    Bet   	|   Bettech   	|
        | Educação 	|    Edtech   	|
        |   Saas   	|    Adtech   	|
        |   Saas   	|   Agrotech  	|
        |   Saas   	|   Biotech   	|
        |   Saas   	| Construtech 	|
        |   Saas   	|  Energytech 	|
        |   Saas   	|   Fintech   	|
        |   Saas   	|   Foodtech  	|
        |   Saas   	|   Govtech   	|
        |   Saas   	|    Hrtech   	|
        |   Saas   	|   Indtech   	|
        |   Saas   	|  Insurtech  	|
        |   Saas   	|  Legaltech  	|
        |   Saas   	|   Proptech  	|
        |   Saas   	|  Retailtech 	|
        |   Saas   	|  Sporttech  	|
        |   Saas   	|  Outra tech 	|
        |   Saúde  	|  Healthech  	|

        Dados da empresa:

        CNPJ: {self._safe_get(row.get('cnpj'))}
        Nome: {self._safe_get(row.get('nome'))}
        CNAE: {self._safe_get(row.get('cnae'))}

        Regras para classificação: para tomar a sua decisão consulte apenas o conteúdo HTML do site da empresa, juntamente com a sua descrição e o slogan da empresa no LinkedIn, se disponível. Se não se encaixar em nenhum segmento ou nicho tech, retorne "Não listado" para ambos.

        Responda o CNPJ da empresa, o segmento e o nicho tech, em JSON com exatamente as chaves "cnpj", "Segmento" e "Nicho Tech", base exclusivamente na estrutura da tabela apresentada.

        Resposta:
        """

    def _completion_kwargs(self, row: pd.Series):
        """Keyword arguments for one chat-completion request about `row`."""
        return {
            "messages": [{"role": "user", "content": self.prompt(row)}],
            "model": self.MODEL,
            "temperature": self.TEMPERATURE,
            "max_tokens": self.MAX_TOKENS,
        }

    def _parse_for_row(self, row: pd.Series, content):
        """Parse a model answer and pin its CNPJ to the one we asked about.

        The input CNPJ is authoritative: the model may echo it reformatted or as
        a number (losing leading zeros), which would break later joins.
        """
        data = self._output_parser(content)
        if data is not None:
            known_cnpj = row.get("cnpj")
            if known_cnpj is not None:
                data["cnpj"] = known_cnpj
        return data

    def classify(self, row: pd.Series):
        """Classify a single company synchronously.

        Returns:
            dict | None: the parsed record, or None if the answer had no valid JSON.
        """
        chat_completion = self.client.chat.completions.create(**self._completion_kwargs(row))
        return self._parse_for_row(row, chat_completion.choices[0].message.content)

    async def parallel_classify(self, X: pd.DataFrame, batch_size: int=10, sleep=0.0):
        """Classify `X` in batches, issuing each batch's requests concurrently.

        This is an async generator: after every batch it yields a DataFrame
        indexed by the labels of the classified rows, with the columns in
        ``OUTPUT_KEYS``. Rows whose answer could not be parsed are left out of
        the yielded frame (so a resumed run will retry them).

        Args:
            X: rows to classify.
            batch_size: number of concurrent requests per batch.
            sleep: seconds to wait after each batch.
        """
        async_client = AsyncOpenAI(api_key=self.client.api_key)

        for batch_start in range(0, len(X), batch_size):
            rows = list(X.iloc[batch_start:batch_start + batch_size].iterrows())

            tasks = [
                async_client.chat.completions.create(**self._completion_kwargs(row))
                for _, row in rows
            ]

            # Execute batch asynchronously; results come back in request order.
            batch_responses = await asyncio.gather(*tasks)

            batch_results = {}
            for (idx, row), response in zip(rows, batch_responses):
                data = self._parse_for_row(row, response.choices[0].message.content)
                if data is None:
                    print(f"Skipping row {idx}: no parsable JSON in the model response")
                    continue
                batch_results[idx] = data

            # reindex guarantees the expected columns even when every row was skipped.
            yield pd.DataFrame.from_dict(batch_results, orient='index').reindex(
                columns=list(self.OUTPUT_KEYS)
            )

            # Delay between batches
            await asyncio.sleep(sleep)

async def annotate(df, classifier, batch_size=50):
    """Stream classified batches for `df`, one DataFrame per batch.

    Each yielded frame holds the columns produced by
    `classifier.parallel_classify`, indexed by the labels of the `df` rows the
    batch refers to. Looking the labels up in `df` raises KeyError if the
    classifier returns a label that `df` does not contain.
    """
    async for classified in classifier.parallel_classify(df, batch_size=batch_size):
        # Resolve the batch labels against the source frame before re-wrapping.
        source_index = df.loc[classified.index].index
        yield pd.DataFrame(classified, index=source_index)

def append_if_exists(df: pd.DataFrame, path: str):
    """Append `df` to the table stored at `path`, creating the file if needed.

    Any existing content is loaded with `load_file_if_exists`, concatenated
    with `df` (index reset), and the combined table is written back to `path`.
    CSV files are written with ';' as separator and without the index.

    Args:
        df (pd.DataFrame): The rows to append.
        path (str): Destination path; must end in .csv or .parquet.

    Returns:
        pd.DataFrame: The full table that was written (existing rows + `df`).

    Raises:
        ValueError: If `path` does not end in .csv or .parquet.
    """
    is_csv = path.endswith(".csv")
    # Reject unsupported targets before doing any I/O.
    if not (is_csv or path.endswith(".parquet")):
        raise ValueError("Unsupported file format. Use .csv or .parquet")

    existing_df = load_file_if_exists(path)
    if existing_df is not None:
        df = pd.concat([existing_df, df], ignore_index=True)

    # The first save would otherwise fail when the target folder is missing.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if is_csv:
        df.to_csv(path, index=False, sep=';')
    else:
        df.to_parquet(path, index=False)

    # Report only after the write succeeded.
    print('saved at', path)

    return df

def load_file_if_exists(path: str):
    """Load the table stored at `path`, or return None when nothing is there.

    The format is chosen from the extension: .csv files are read with ';' as
    separator, .parquet paths (file or folder) are read with `pd.read_parquet`.

    Args:
        path (str): The file path, should end in .csv or .parquet.

    Returns:
        pd.DataFrame | None: The loaded table, or None if `path` does not exist.

    Raises:
        ValueError: If `path` exists but has neither a .csv nor a .parquet extension.
    """
    # Nothing on disk (neither a file nor a Parquet folder): signal it with None.
    if not os.path.exists(path) and not os.path.isdir(path):
        return None

    if path.endswith(".csv"):
        return pd.read_csv(path, sep=';')

    if path.endswith(".parquet"):
        return pd.read_parquet(path)

    raise ValueError("Unsupported file format. Use .csv or .parquet")

In [None]:
classifier = Classifier(api_key=OPENAI_KEY)
samples = []
df_copy = df.copy()
existing_samples = load_file_if_exists(BACKUP_PATH)

if not existing_samples is None:
    # excluding the samples that already were processed
    unique_domains = existing_samples["cnpj"].unique().tolist()
    df = df[~df["cnpj"].isin(unique_domains)].reset_index(drop=True)

async for batch in annotate(df, classifier, batch_size=BATCH_SIZE):
    samples.append(batch)
    append_if_exists(batch, BACKUP_PATH)  # Append in batches
    print(f'{min(BATCH_SIZE, len(batch))} classified samples were added to {BACKUP_PATH}')

labeled_df = pd.concat(samples, axis=0)
labeled_df = labeled_df[labeled_df["Segmento"] != "Não listado"].reset_index(drop=True)
labeled_df = labeled_df[labeled_df["Nicho Tech"] != "Não listado"].reset_index(drop=True)

if df.shape[0] != df_copy.shape[0]:
    labeled_df = pd.merge(left=df_copy, right=labeled_df, how="inner")
else:
    labeled_df = pd.merge(left=df, right=labeled_df, how="inner")

labeled_df