In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import string
import unicodedata
import spacy
import re
import json
from openai import OpenAI
from typing import List
import os
from dotenv import load_dotenv, find_dotenv

# Locate the dotenv file that holds the API credentials.
env_path = find_dotenv('keys.env')
# Default value kept when keys.env is not found.
API_KEY = ''

if not env_path:
    print("O arquivo keys.env não foi encontrado. Certifique-se de que ele exista no diretório raiz do projeto.")
else:
    load_dotenv(env_path)

    # os.getenv returns None when the variable is not defined.
    API_KEY = os.getenv('API_KEY')

    if API_KEY is None:
        print("A variável API_KEY não foi encontrada no arquivo keys.env.")
    else:
        # Never echo the secret itself: cell outputs are stored inside the
        # notebook file and are easily shared or committed by accident.
        print("API Key carregada com sucesso.")

        def use_api(API_KEY):
            """Report that an API key is in use without revealing its value."""
            print("Usando a chave API: ********")

        use_api(API_KEY)

# Shared OpenAI client (configured with max_retries=5) and the embedding
# model name used by the embedding cells.
client = OpenAI(max_retries=5, api_key=API_KEY)
embedding_model = "text-embedding-3-small"

# spaCy pipeline used by preprocessing_text for lemmatization.
nlp = spacy.load('pt_core_news_md')

In [None]:
# Load the preprocessed dataset and build the "SENDER | SUBJECT" text that the
# preprocessing and embedding cells consume.
df = pd.read_csv('./data/data_preprocessed.csv')
# An empty SENDER or SUBJECT is read as NaN, and NaN + str stays NaN; the
# string-only functions applied to COMBINED_TEXT later would then raise.
# Blank out missing values (and coerce to str) before concatenating.
df['COMBINED_TEXT'] = (
    df['SENDER'].fillna('').astype(str)
    + ' | '
    + df['SUBJECT'].fillna('').astype(str)
)
df

In [None]:
def remove_acentos(text):
    """Return ``text`` with its accents removed.

    The string is decomposed with Unicode NFD normalization and every
    character whose Unicode category is ``Mn`` is dropped.
    """
    kept_chars = []
    for char in unicodedata.normalize('NFD', text):
        if unicodedata.category(char) == 'Mn':
            # Accent split off from its base letter by NFD; skip it.
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

def remove_numeros(sentence):
    """Delete every word that begins with a digit (``123``, ``2x`` ...) from ``sentence``.

    Only the matched words are removed; the whitespace around them is kept.
    """
    digit_led_word = re.compile(r'\b\d+\w*\b')
    return digit_led_word.sub('', sentence)

def preprocessing_text(text, language='portuguese'):
    """Normalize a raw text and return its lemmas joined by single spaces.

    Steps, in order: strip accents, lowercase, drop the ``r$`` marker, delete
    ASCII punctuation, remove words that start with a digit, collapse
    whitespace, then lemmatize with the module-level spaCy pipeline ``nlp``.

    Args:
        text: String to normalize.
        language: Currently unused; kept so existing callers keep working.

    Returns:
        The lemmatized tokens separated by exactly one space.
    """
    text = remove_acentos(text)
    text = text.lower()
    text = text.replace('r$', '')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = remove_numeros(text)
    # Deleting punctuation and numbers leaves runs of blanks behind (e.g. the
    # ' | ' separator becomes two spaces). spaCy turns such runs into
    # whitespace tokens, which would end up in the joined output, so collapse
    # all whitespace to single spaces (this also trims both ends).
    text = ' '.join(text.split())

    doc = nlp(text)

    lemmatized_tokens = [
        # Fall back to the surface form when the lemma is the '-PRON-'
        # placeholder (presumably emitted by older spaCy releases).
        token.lemma_ if token.lemma_ != '-PRON-' else token.text
        for token in doc
        # Safety net: never let a whitespace token reach the output.
        if not token.is_space
    ]

    return ' '.join(lemmatized_tokens)

# Normalized, lemmatized version of COMBINED_TEXT, computed row by row.
df['SUBJECT_PREPROCESSED'] = (
    df.loc[:, 'COMBINED_TEXT']
    .apply(preprocessing_text)
)
df

## Embeddings

In [None]:
def get_embeddings(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    """Request the embedding of ``text`` through the module-level ``client``.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    Any extra keyword arguments are forwarded to ``client.embeddings.create``.

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=[single_line], **kwargs)
    first_item = result.data[0]
    return first_item.embedding

def apply_tfidf(data):
    """Fit a default ``TfidfVectorizer`` on ``data`` and return its rows.

    Args:
        data: Iterable of text documents.

    Returns:
        A list with one row of the densified TF-IDF matrix per document.
    """
    dense_matrix = TfidfVectorizer().fit_transform(data).toarray()
    # Iterating the 2-D array yields its rows one at a time.
    return [row for row in dense_matrix]

def apply_word2vec(data, vector_size=200, window=2, min_count=2, negative=5, workers=1, seed=42):
    """Train a Word2Vec model on ``data`` and return one averaged vector per sentence.

    Each sentence is split on whitespace. Its embedding is the mean of the
    vectors of its words that are present in the trained model; a sentence
    with no such word gets an all-zero vector.

    Args:
        data: Iterable of whitespace-separated sentences.
        vector_size, window, min_count, negative, workers, seed: Forwarded to
            ``Word2Vec``. The defaults reproduce the configuration that used
            to be hard-coded here.

    Returns:
        A list with one 1-D NumPy array of length ``vector_size`` per sentence.
    """
    sentences = [sentence.split() for sentence in data]

    model = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        negative=negative,
        workers=workers,
        seed=seed,
    )

    # Build the fallback in the same dtype as the trained vectors so that every
    # returned embedding shares one dtype (np.zeros alone would give float64).
    zero_vector = np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)

    embeddings = []

    for sentence in sentences:
        word_vecs = [model.wv[word] for word in sentence if word in model.wv]
        if word_vecs:
            sentence_embedding = np.mean(word_vecs, axis=0)
        else:
            # Copy so the returned rows never share one underlying array.
            sentence_embedding = zero_vector.copy()
        embeddings.append(sentence_embedding)

    return embeddings

# TF-IDF and Word2Vec representations of the preprocessed text, one vector per row.
df['EMBEDDING_TFIDF'] = apply_tfidf(df.loc[:, 'SUBJECT_PREPROCESSED'])
df['EMBEDDING_WORD2VEC'] = apply_word2vec(df.loc[:, 'SUBJECT_PREPROCESSED'])

In [None]:
# One get_embeddings call per row, made on the raw combined text
# (COMBINED_TEXT), not on the lemmatized column.
df['EMBEDDING_OPENAI'] = df['COMBINED_TEXT'].apply(
    lambda combined_text: get_embeddings(combined_text, model=embedding_model)
)
df

In [None]:
# Turn every embedding into a plain Python list (presumably so the values are
# written out in full when the frame is saved to CSV).
# Going through np.asarray keeps this cell re-runnable: after the first run the
# cells already hold lists, and a bare list has no .tolist() method.
df['EMBEDDING_TFIDF'] = df['EMBEDDING_TFIDF'].apply(lambda x: np.asarray(x).tolist())
df['EMBEDDING_WORD2VEC'] = df['EMBEDDING_WORD2VEC'].apply(lambda x: np.asarray(x).tolist())

In [None]:
# Persist the frame, embedding columns included; the row index is not written.
df.to_csv(
    path_or_buf='./data/data_preprocessed_embeddings.csv',
    index=False,
)