# Imports and df reading

In [None]:
import pandas as pd
import os
import json
import asyncio

from openai import OpenAI
from openai import AsyncOpenAI

# Disable pandas' chained-assignment warning and show every column when a
# frame is displayed in the notebook.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# Read the API key from the environment so the secret never has to be pasted
# into (and saved with) the notebook. Falls back to "" exactly as before.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")

# Feature Engineering

In [None]:
def check_integrity(dataframe):
    """Validate and clean the raw DataFrame before feature generation.

    Steps, in order:
      1. require the 'domain' and 'html' columns;
      2. drop rows whose 'domain' or 'html' is null;
      3. cast 'html' to ``str``;
      4. drop rows whose HTML is '' or '[]' or whose domain does not end
         with '.br';
      5. drop fully duplicated rows.

    A warning is printed for every step that removes rows. The input frame
    is not modified; a cleaned frame is returned.

    Args:
        dataframe: DataFrame holding at least the 'domain' and 'html' columns.

    Returns:
        The cleaned DataFrame.

    Raises:
        Exception: if a required column is missing or any step fails.
    """
    try:
        columns_expected = [
            'domain',
            'html',
        ]

        if not all(item in dataframe.columns for item in columns_expected):
            raise Exception('Missing required columns. Columns expected:\n' + str(columns_expected))

        # Nulls must be removed BEFORE the string cast: astype(str) turns
        # None/NaN into the literal strings 'None'/'nan', which would then
        # slip through every later check.
        for column in columns_expected:
            nulls = dataframe[column].isnull().sum()
            if nulls > 0:
                print(f"WARNING: column '{column}' has {nulls} empty values. Removing those entries.")
                dataframe = dataframe.dropna(subset=[column])

        # assign() returns a new frame, so the caller's data is left untouched.
        dataframe = dataframe.assign(html=dataframe['html'].astype(str))

        valid_rows = (
            (dataframe['html'] != '[]')
            & (dataframe['html'] != '')
            # na=False: a non-string domain counts as "does not end with .br"
            & dataframe['domain'].str.endswith('.br', na=False)
        )
        dataframe_filtered = dataframe[valid_rows]
        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with empty HTML and/or does not ends with '.br'. Removing those entries.")
            dataframe = dataframe_filtered

        dataframe_filtered = dataframe.drop_duplicates()
        if len(dataframe) != len(dataframe_filtered):
            count = len(dataframe) - len(dataframe_filtered)
            print(f"WARNING: dataframe has {count} entries with duplicates values. Removing those entries.")
            dataframe = dataframe_filtered

        return dataframe
    except Exception as e:
        raise Exception('Failed in integrity check.\nError:\n' + str(e)) from e

In [None]:
def build_lemmatizer_pt_dict():
    """Build a Portuguese ``{inflected_form: lemma}`` lookup table.

    The lemmatization list is downloaded once to ``lemmatization-pt.txt`` in
    the current working directory and reused on later calls. Each line with
    exactly two whitespace-separated fields is read as ``lemma form``; other
    lines are ignored.

    If anything fails, the local file is removed so a partial or corrupt
    download is not reused, and the error is re-raised.

    Returns:
        dict mapping each inflected form to its lemma.

    Raises:
        Exception: if the download or the parsing fails.
    """
    import os

    url = "https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt"
    file_name = "lemmatization-pt.txt"

    try:
        # Download only when there is no local copy yet.
        if not os.path.exists(file_name):
            # Imported lazily: only needed when a download is required.
            import requests

            response = requests.get(url, timeout=60)
            # Fail loudly instead of saving an HTTP error page as the list.
            response.raise_for_status()
            with open(file_name, 'wb') as f:
                f.write(response.content)

        # 'utf-8-sig' decodes UTF-8 and strips a leading BOM when present, so
        # the first lemma is not polluted with '\ufeff'.
        lemmatizer_pt_dict = {}
        with open(file_name, 'r', encoding='utf-8-sig') as dic:
            for line in dic:
                txt = line.split()
                if len(txt) == 2:
                    lemmatizer_pt_dict[txt[1]] = txt[0]

        return lemmatizer_pt_dict
    except Exception as e:
        # Drop the possibly broken file so the next call downloads it again.
        if os.path.exists(file_name):
            os.remove(file_name)
        raise Exception('An error occurred on build_lemmatizer_pt_dict.\nError:\n' + str(e)) from e


In [None]:
def custom_lemmatizer(tokens, lemmatizer_pt_dict):
    """Lemmatize *tokens*, preferring the Portuguese lookup table.

    A token found in ``lemmatizer_pt_dict`` is replaced by its mapped lemma;
    any other token is passed through NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.
        lemmatizer_pt_dict: mapping ``{inflected_form: lemma}``.

    Returns:
        list of lemmatized tokens, in the input order.

    Raises:
        Exception: wraps any error raised while importing NLTK or lemmatizing.
    """
    try:
        from nltk.stem.wordnet import WordNetLemmatizer

        fallback = WordNetLemmatizer()
        return [
            lemmatizer_pt_dict.get(word)
            if word in lemmatizer_pt_dict
            else fallback.lemmatize(word)
            for word in tokens
        ]
    except Exception as e:
        raise Exception('An error occurred on custom_lemmatizer.\nError:\n' + str(e))

In [None]:
def get_html_body(html_str):
    """Return the text content of the ``<body>`` of an HTML document.

    The parsers 'html.parser', 'html5lib' and 'lxml' are tried in that order;
    the first one that parses the input wins.

    Args:
        html_str: HTML markup.

    Returns:
        The body text, or '' when the document has no ``<body>`` or no parser
        could handle the input. Always a string, never ``None``.
    """
    from bs4 import BeautifulSoup

    for parser in ('html.parser', 'html5lib', 'lxml'):
        try:
            soup = BeautifulSoup(html_str, parser)
            return soup.body.get_text() if soup.body else ''
        except Exception:
            # Parser missing or choked on this input: try the next one.
            continue

    # Every parser failed. Return '' rather than falling off the end with an
    # implicit None, which would break callers that expect a string.
    return ''
    
def process_html_for_vectorizer(html_text, lemmatizer_pt_dict):
    """Turn an HTML document into a list of normalized, lemmatized tokens.

    Pipeline: extract the body text, strip accents, replace every non-letter
    with a space, split camelCase/UPPERCASE runs into separate words,
    lowercase, tokenize, drop Portuguese/English stopwords and tokens of
    length <= 2, then lemmatize.

    Args:
        html_text: HTML markup of the page.
        lemmatizer_pt_dict: mapping ``{inflected_form: lemma}`` passed on to
            ``custom_lemmatizer``.

    Returns:
        list of token strings (empty when no body text could be extracted).

    Raises:
        Exception: wraps any error raised during processing.
    """
    import nltk
    from nltk.corpus import stopwords
    import unicodedata
    import re

    # Make sure the NLTK resources used below are available.
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    try:
        STOP_WORDS = (set(stopwords.words('portuguese'))).union(set(stopwords.words('english')))

        # Keep only the <body> text. `or ''` guards against a None result so
        # an unparsable page yields no tokens instead of crashing normalize().
        text = get_html_body(html_text) or ''

        # Strip accents: decompose, then drop everything that is not ASCII.
        preprocessed_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

        # Collapse whitespace runs and line breaks.
        preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text).strip()

        # Replace everything that is neither a letter nor whitespace with a space.
        preprocessed_text = re.sub(r"[^a-zA-Z\s]", " ", preprocessed_text)

        # Word pattern: an uppercase run, a Capitalized word, or a lowercase run.
        pattern = re.compile(r'([A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+)')

        # Prefix each match with a space so glued words (camelCase) are split.
        preprocessed_text = pattern.sub(r' \1', preprocessed_text)

        # Lowercase.
        preprocessed_text = preprocessed_text.lower()

        # Collapse the extra spaces introduced above.
        preprocessed_text = re.sub(r"\s+", " ", preprocessed_text).strip()

        # Tokenize.
        tokens = nltk.word_tokenize(preprocessed_text)

        # Drop stopwords and very short tokens.
        tokens = [
            token for token in tokens if token not in STOP_WORDS and len(token) > 2
        ]

        # Lemmatize.
        tokens = custom_lemmatizer(tokens, lemmatizer_pt_dict)

        return tokens
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for vectorizer.\nError:\n' + str(e)) from e

In [None]:
import re

def process_html_for_how_many_prices(text):
    """Count currency markers in *text*.

    Every ``$`` is counted once; an ``R$`` counts as a single marker.

    Args:
        text: string to scan.

    Returns:
        Number of currency markers found.

    Raises:
        Exception: wraps any error raised while scanning.
    """
    try:
        currency_markers = re.findall(r'\$|R\$', text)
        return len(currency_markers)
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for prices.\nError:\n' + str(e))

def process_html_for_how_many_values(text):
    """Count numeric values in *text*.

    Matches digit runs with optional ``.``-separated thousands and an
    optional ``,dd`` decimal part (or the ``,``/``.`` mirrored form).

    Args:
        text: string to scan.

    Returns:
        Number of non-overlapping matches.

    Raises:
        Exception: wraps any error raised while scanning.
    """
    try:
        value_pattern = r'\d+(?:\.\d{3})*(?:,\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?'
        return sum(1 for _ in re.finditer(value_pattern, text))
    except Exception as e:
        raise Exception('An error occurred while processing HTMLs for values.\nError:\n' + str(e))

In [None]:
def get_html_links(row):
    """Extract every ``href`` link of ``<a>``/``<area>`` tags from a row's HTML.

    Args:
        row: mapping/Series with the keys 'html' (markup) and 'domain'
            (used as the response URL).

    Returns:
        list of link URLs as reported by scrapy's ``LinkExtractor``;
        duplicates are kept (``unique=False``).

    Raises:
        Exception: wraps any error raised while importing scrapy or extracting.
    """
    try:
        from scrapy.http import HtmlResponse
        from scrapy.linkextractors import LinkExtractor

        # LinkExtractor settings: no domain restriction, anchors and areas only.
        extractor = LinkExtractor(
            allow_domains=[],
            tags=['a', 'area'],
            attrs=['href',],
            unique=False,
        )

        markup = row['html']
        page_url = row['domain']
        page = HtmlResponse(url=page_url, body=markup, encoding='utf-8')

        return [found.url for found in extractor.extract_links(page)]
    except Exception as e:
        raise Exception('An error occurred while searching for links in HTML.\nError:\n' + str(e))

In [None]:
def get_autoreference_links_from_html(row):
    """Return the links in a row's HTML that point back to the row's own site.

    A link is kept when it has no network location or when its network
    location equals that of ``row['domain']``; kept links are joined against
    ``row['domain']`` before being returned.

    Args:
        row: mapping/Series with the keys 'html' and 'domain'.

    Returns:
        list of self-referencing link URLs (duplicates kept).
    """
    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor
    from urllib.parse import urlparse, urljoin

    markup = row['html']
    domain = row['domain']
    # NOTE(review): urlparse() only fills `netloc` when the value carries a
    # scheme ("http://..."); for a bare host it is '' - confirm the format of
    # the 'domain' column.
    own_netloc = urlparse(domain).netloc

    page = HtmlResponse(url=domain, body=markup, encoding='utf-8')
    extractor = LinkExtractor(tags=['a', 'area'], attrs=['href'], unique=False)

    # Keep links with no network location or the same one as the page, and
    # resolve each against the domain.
    return [
        urljoin(domain, found.url)
        for found in extractor.extract_links(page)
        if urlparse(found.url).netloc in ('', own_netloc)
    ]

In [None]:
def only_number(text):
    """Return *text* with every non-digit character removed."""
    non_digit = re.compile(r'[^\d]')
    return non_digit.sub('', text)

def remove_invalid_company(company_id):
    """Return *company_id* if it still has 14 characters after filtering.

    Any run of 13 identical digits is removed first, so ids such as
    '00000000000000' end up shorter than 14 characters and are rejected.

    Returns:
        The filtered id when it is exactly 14 characters long, else ``None``.
    """
    stripped = re.sub(r'(\d)\1{12}', '', company_id)
    return stripped if len(stripped) == 14 else None

def order_by_common(data):
    """Return the distinct items of *data*, most frequent first."""
    from collections import Counter

    ranked = Counter(data).most_common()
    return [item for item, _count in ranked]

def extract_and_process_cnpjs(text):
    """Find formatted CNPJ-like numbers in *text* and return them as digits.

    Matches the layout ``dd.ddd.ddd/dddd-dd`` (a space is also accepted in
    place of the ``/`` and of the ``-``). Each match is reduced to its digits
    with ``only_number`` and kept only when ``remove_invalid_company``
    accepts it.

    Returns:
        list of digit-only strings, in order of appearance (duplicates kept).
    """
    cnpj_pattern = re.compile(r'\d{2}\.\d{3}\.\d{3}[\/ ]\d{4}[- ]\d{2}')
    candidates = (
        remove_invalid_company(only_number(raw))
        for raw in cnpj_pattern.findall(text)
    )
    return [cnpj for cnpj in candidates if cnpj]

In [None]:
def get_features_dataframe(dataframe, aditional_columns):
    """Select the feature columns from *dataframe*.

    Args:
        dataframe: source DataFrame.
        aditional_columns: iterable of column-name groups (e.g. a list of
            lists); the groups are flattened, in order, into one selection.

    Returns:
        DataFrame holding only the selected columns.

    Raises:
        Exception: wraps any error raised while selecting the columns.
    """
    try:
        # Flatten the groups into the list of columns required by the model.
        selected = [name for group in aditional_columns for name in group]
        return dataframe.loc[:, selected]
    except Exception as e:
        raise Exception('An error occurred while trying to build features DataFrame.\nError:\n' + str(e))

In [None]:
def generate_features(dataframe):
    """Clean *dataframe* and add the feature columns used for labeling.

    Runs ``check_integrity`` and then adds:
      - 'tokens': output of ``process_html_for_vectorizer`` for each HTML;
      - 'processed_cnpjs': output of ``extract_and_process_cnpjs`` on the raw HTML;
      - 'has_cnpj': True when 'processed_cnpjs' is non-empty;
      - 'count_prices': currency markers counted in the HTML body text;
      - 'has_prices': True when 'count_prices' is greater than 1.

    Args:
        dataframe: DataFrame with at least the 'domain' and 'html' columns.

    Returns:
        The cleaned DataFrame with the extra columns.

    Raises:
        Exception: wraps any error raised by the steps above.
    """
    try:
        dataframe = check_integrity(dataframe)

        lem_dict = build_lemmatizer_pt_dict()    
        # Body text, used below for the price count. The tokenizer extracts
        # the body again on its own, so each page is parsed twice.
        html_body = dataframe.loc[:,'html'].apply(get_html_body)    
        dataframe.loc[:, 'tokens'] = dataframe.loc[:, 'html'].apply(lambda x: process_html_for_vectorizer(x, lem_dict))
        # Disabled features (size / token counts):
        # dataframe.loc[:, 'html_size'] = dataframe.loc[:, 'html'].apply(len)
        # dataframe.loc[:, 'qntd_tokens'] = dataframe.loc[:, 'tokens'].apply(len)
        # dataframe.loc[:, 'qntd_tokens_unicos'] = dataframe.loc[:, 'tokens'].apply(lambda x: len(set(x)))

        # Disabled features (self-referencing links):
        # dataframe.loc[:, 'autoreference_links'] = dataframe.apply(get_autoreference_links_from_html, axis=1)
        # dataframe.loc[:, 'qntd_autoreference_links'] = dataframe.loc[:, 'autoreference_links'].apply(len)

        # Disabled features (all links):
        # dataframe.loc[:, 'links'] = dataframe.apply(get_html_links, axis=1)
        # dataframe.loc[:, 'qntd_links'] = dataframe.loc[:, 'links'].apply(len)

        # CNPJs are searched in the raw HTML, not only in the body text.
        dataframe.loc[:, 'processed_cnpjs'] = dataframe.loc[:, 'html'].apply(extract_and_process_cnpjs)
        dataframe.loc[:, 'has_cnpj'] = dataframe.loc[:, 'processed_cnpjs'].apply(bool)

        dataframe.loc[:, 'count_prices'] = html_body.apply(process_html_for_how_many_prices)
        # #dataframe.loc[:, 'count_prices'] = dataframe.loc[:, 'prices'].apply(len)
        # NOTE(review): strictly more than one marker is required here - confirm
        # that a single price on the page should not count.
        dataframe['has_prices'] = dataframe['count_prices'] > 1

        # Disabled feature (numeric value count):
        # dataframe.loc[:, 'count_values'] = html_body.apply(process_html_for_how_many_values)
        # #dataframe.loc[:, 'count_values'] = dataframe.loc[:, 'values'].apply(len)
        
        # aditional_columns = [
        #     ['true_ecommerce']
        # ]
        # df_features = get_features_dataframe(dataframe, aditional_columns)

        return dataframe#, df_features
    except Exception as e:
        raise Exception('An error occured while trying to generate features.\nError:\n' + str(e))

# Labeling

In [None]:
# Load the raw sample and expose the 'host' column under the name 'domain',
# which is the name the feature pipeline expects.
df = pd.read_parquet('../data/filtered_data_samples_25k.parquet')
df = df.rename(columns={'host': 'domain'})
# Drop rows without HTML BEFORE the string cast: astype(str) would turn
# None/NaN into the literal strings 'None'/'nan', which no later null check
# can recognize as missing.
df = df.dropna(subset=['html'])
df['html'] = df['html'].astype(str)
df.shape

In [None]:
# Build the features for the whole dataset, then keep a reproducible
# 5000-row sample with a fresh 0..n-1 index.
df = generate_features(df).sample(n=5000, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Number of distinct domains in the sampled frame.
df['domain'].nunique()

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Rows per classification batch; passed to annotate() as `batch_size`. All
# requests of one batch are awaited together.
BATCH_SIZE = 1000
# Not referenced in the visible part of this notebook.
TEST_LEN = 250
# CSV the labeled batches are appended to; also used to resume a previous run.
BACKUP_PATH = "../data/sample_classified.csv"

class Classifier:
    """Labels sites as e-commerce / not e-commerce with an OpenAI chat model.

    The model receives the tokens of a page and must answer with a JSON object
    holding 'domain' and 'is_ecommerce' (1, 0 or -1 for "undefined").
    """

    # Request settings shared by the synchronous and the asynchronous path.
    _COMPLETION_PARAMS = {
        "model": "gpt-4o-mini",
        "temperature": 0,
        "max_tokens": 100,
    }
    # Keys kept from the model's JSON answer.
    _RESULT_KEYS = ['domain', 'is_ecommerce']
    # Label used by the prompt for "cannot tell".
    _UNDEFINED = -1

    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _safe_get(value):
        """Return *value*, or a placeholder when it is ``None`` or ''."""
        # Explicit checks instead of `value not in [None, ""]`, whose `==`
        # comparison is ambiguous for array-like values.
        if value is None or (isinstance(value, str) and value == ""):
            return "Não informado"
        return value

    @staticmethod
    def _output_parser(content):
        """Extract ``{'domain', 'is_ecommerce'}`` from the model's answer.

        Returns:
            dict with exactly those two keys (missing keys map to ``None``),
            or ``None`` when the answer holds no JSON object or the JSON is
            invalid.
        """
        # `content` may be None (e.g. an empty completion).
        match = re.search(r'\{.*\}', content or "", re.DOTALL)

        if not match:
            print("No JSON found")
            return None

        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError:
            # Truncated or malformed answer: report it like a missing one
            # instead of raising and losing the whole batch.
            print("Invalid JSON found")
            return None

        return {k: data.get(k) for k in Classifier._RESULT_KEYS}

    def prompt(self, row: pd.Series):
        """Build the classification prompt for one row ('domain', 'tokens')."""
        return f""" 
        Objetivo:
        Classificar um site como e-commerce ou não e-commerce, utilizando apenas os tokens extraídos do seu conteúdo HTML.

        Definição de E-commerce:
        Um site é considerado e-commerce somente se permitir a realização completa da compra de produtos ou serviços pela internet, incluindo todas as etapas abaixo:
            1- Seleção de produtos ou serviços diretamente no site.
            2- Adição dos itens a um carrinho de compras.
            3- Escolha de forma de pagamento.
            4- Definição de endereço de entrega ou retirada.
            5- Finalização da compra com um processo de checkout online.

        Importante:
        Sites que apenas exibem produtos ou serviços, como catálogos, cardápios ou listas, sem permitir a compra direta no site, não devem ser considerados e-commerce, mesmo que exibam preços ou informações de contato.
        Para que um site seja considerado e-commerce, ele deve conter todos os itens citados na definição do e-commerce.

        Não caracterizam e-commerce:
            1- Sites que apenas recebem pedidos por telefone, WhatsApp, formulário ou e-mail.
            2- Sites com listas de produtos sem carrinho, sem checkout ou sem formas de pagamento online.
            3- Cardápios de lanchonetes, padarias ou mercearias sem sistema de compra online.
            4- Sites que apenas realizam o orçamento dos produtos e sem sistema de compra online.
            5- Sites que não possuem carrinho ou que não permitam a compra dos produtos diretamente pelo site.

        Entrada:
        Uma lista de tokens extraídos do HTML do site. O URL/domain do site não deve ser usado como entrada para realizar a classificação, apenas na identificação do site na resposta.

        Saída esperada:
        Apenas um dos seguintes valores inteiros:
            1: É um e-commerce (atende a todos os critérios listados).
            0: Não é um e-commerce (não permite a compra diretamente no site).
            -1: Indefinido (os tokens não são suficientes para concluir com confiança).
        
        Dados:
        URL: {self._safe_get(row.get('domain'))}
        Tokens: {self._safe_get(row.get('tokens'))}

        Responda o 'domain' (URL) do site e 'is_ecommerce' (com o valor se ele é e-commerce ou não), em JSON.

        Resposta:
        """

    def classify(self, row: pd.Series):
        """Classify a single row synchronously.

        Returns:
            dict with 'domain' and 'is_ecommerce', or ``None`` when the answer
            could not be parsed.
        """
        chat_completion = self.client.chat.completions.create(
            messages=[
                {"role": "user", "content": self.prompt(row)}
            ],
            **self._COMPLETION_PARAMS,
        )

        return self._output_parser(chat_completion.choices[0].message.content)

    async def parallel_classify(self, X: pd.DataFrame, batch_size: int=10, sleep=0.0):
        """Classify *X* in batches and yield one result DataFrame per batch.

        The requests of a batch are awaited together. Each yielded frame is
        indexed like the corresponding rows of *X* and has the columns
        'domain' and 'is_ecommerce'.

        - A request that raises is reported and its row is left out of the
          batch, so the other answers of the batch are not lost.
        - An answer that cannot be parsed is labeled -1 (undefined).
        - 'domain' is taken from the input row when available, not from the
          model's echo, so it stays usable as a join key.

        Args:
            X: rows to classify; each row is passed to ``prompt``.
            batch_size: number of rows requested together.
            sleep: seconds to wait after each batch.
        """
        async_client = AsyncOpenAI(api_key=self.client.api_key)

        for batch_start in range(0, len(X), batch_size):
            batch = X.iloc[batch_start:batch_start + batch_size]

            tasks = []
            batch_rows = []  # (index label, input domain), aligned with `tasks`

            for idx, row in batch.iterrows():
                tasks.append(
                    async_client.chat.completions.create(
                        messages=[{'role': 'user', 'content': self.prompt(row)}],
                        **self._COMPLETION_PARAMS,
                    )
                )
                batch_rows.append((idx, row.get('domain')))

            # return_exceptions=True: one failed request must not discard the
            # answers of every other request in the batch.
            batch_responses = await asyncio.gather(*tasks, return_exceptions=True)

            batch_results = {}
            for (idx, input_domain), response in zip(batch_rows, batch_responses):
                if isinstance(response, Exception):
                    print(f"WARNING: request for row {idx} failed ({response!r}). Row left out of this batch.")
                    continue

                data = self._output_parser(response.choices[0].message.content)
                if data is None:
                    data = {'domain': input_domain, 'is_ecommerce': self._UNDEFINED}
                elif input_domain is not None:
                    data['domain'] = input_domain

                batch_results[idx] = data

            # Explicit columns keep the frame well-formed even when every
            # request of the batch failed.
            yield pd.DataFrame(
                list(batch_results.values()),
                index=list(batch_results.keys()),
                columns=self._RESULT_KEYS,
            )

            # Delay between batches
            await asyncio.sleep(sleep)

async def annotate(df, classifier, batch_size=50):
    """Yield the classifier's result batches, indexed like the rows of *df*.

    Args:
        df: DataFrame to classify.
        classifier: object exposing ``parallel_classify(df, batch_size=...)``
            as an async generator of DataFrames.
        batch_size: number of rows per batch.

    Yields:
        One DataFrame per batch, built from the classifier output and indexed
        by the matching labels of *df*.
    """
    results = classifier.parallel_classify(df, batch_size=batch_size)
    async for batch in results:
        # Look the batch labels up in df (fails if one is unknown) and
        # rebuild the batch on that index.
        source_index = df.loc[batch.index].index
        yield pd.DataFrame(batch, index=source_index)

def append_if_exists(df: pd.DataFrame, path: str):
    """Append *df* to the file at *path*, creating the file when missing.

    The existing content (if any) is loaded, concatenated with *df* and the
    whole result is written back. CSV files use ';' as separator.

    Args:
        df (pd.DataFrame): rows to append.
        path (str): target path; must end in .csv or .parquet.

    Returns:
        The combined DataFrame that was written.

    Raises:
        ValueError: if *path* has an unsupported extension.
    """
    previous = load_file_if_exists(path)
    combined = df if previous is None else pd.concat([previous, df], ignore_index=True)

    is_csv = path.endswith(".csv")
    if not is_csv and not path.endswith(".parquet"):
        raise ValueError("Unsupported file format. Use .csv or .parquet")

    # Write in the format given by the extension.
    if is_csv:
        print('saved at', path)
        combined.to_csv(path, index=False, sep=';')
    else:
        combined.to_parquet(path, index=False)

    return combined

def load_file_if_exists(path: str):
    """Load the CSV or Parquet file at *path* if it exists.

    CSV files are read with ';' as separator. A directory path is accepted
    too and is handed to ``pd.read_parquet`` when it ends in .parquet.

    Args:
        path (str): file path; should end in .csv or .parquet.

    Returns:
        The loaded DataFrame, or ``None`` when nothing exists at *path*.

    Raises:
        ValueError: if *path* exists but has an unsupported extension.
    """
    # os.path.exists() is true for directories as well as for files.
    if not os.path.exists(path):
        return None

    if path.endswith(".csv"):
        return pd.read_csv(path, sep=';')
    if path.endswith(".parquet"):
        return pd.read_parquet(path)

    raise ValueError("Unsupported file format. Use .csv or .parquet")

In [None]:
classifier = Classifier(api_key=OPENAI_KEY)
samples = []
df_copy = df.copy()
existing_samples = load_file_if_exists(BACKUP_PATH)

if not existing_samples is None:
    # excluding the samples that already were processed
    unique_domains = existing_samples["domain"].unique().tolist()
    df = df[~df["domain"].isin(unique_domains)].reset_index(drop=True)

async for batch in annotate(df, classifier, batch_size=BATCH_SIZE):
    samples.append(batch)
    append_if_exists(batch, BACKUP_PATH)  # Append in batches
    print(f'{min(BATCH_SIZE, len(batch))} classified samples were added to {BACKUP_PATH}')

labeled_df = pd.concat(samples, axis=0)
labeled_df = labeled_df[labeled_df["is_ecommerce"] != -1].reset_index(drop=True)

if df.shape[0] != df_copy.shape[0]:
    labeled_df = pd.merge(left=df_copy, right=labeled_df, how="inner")
else:
    labeled_df = pd.merge(left=df, right=labeled_df, how="inner")

labeled_df

In [None]:
# labeled_df.to_parquet("../data/noisy_training_data.parquet", engine="pyarrow")