In [None]:
import re
import unicodedata

import contractions
import emoji
import numpy as np
import spacy
import torch
from autocorrect import Speller
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder

# TODO: check how accented characters are handled during preprocessing.

In [None]:
class SyntacticEmbeddingPipeline:
    """Build one feature vector per word: POS one-hot + Word2Vec + GloVe."""

    # Links, bare "name.tld" domains, hashtags and XML/HTML tags that are
    # stripped during preprocessing.
    # NOTE(review): the `\w+\.\w{2,3}` alternative also matches decimals such
    # as "3.14" and sentences typed without a space after the period
    # ("end.The"); confirm this is acceptable for the target corpus.
    _NOISE_PATTERN = re.compile(r'http\S+|www\S+|\w+\.\w{2,3}|#[\w]+|<[^>]+>')

    # Unicode "Combining Diacritical Marks" block.
    _COMBINING_MARKS = re.compile(r'[\u0300-\u036f]')

    def __init__(self, word2vec_model, glove_embeddings, pos_labels):
        """
        Set up the encoders and NLP tools used by the pipeline.

        Args:
            word2vec_model (gensim.models.Word2Vec): Trained Word2Vec model.
            glove_embeddings (dict): Loaded GloVe embeddings (word -> vector).
            pos_labels (list): POS tags used to fit the one-hot encoder.
        """
        self.word2vec = word2vec_model
        self.glove = glove_embeddings
        # Tags that were not in `pos_labels` encode as an all-zero row
        # instead of raising.
        self.pos_encoder = OneHotEncoder(handle_unknown='ignore')
        self.pos_encoder.fit(np.array(pos_labels).reshape(-1, 1))
        self.nlp = spacy.load("en_core_web_lg")
        self.spell = Speller(lang='en')
        self.slang_map = {
            "asap": "as soon as possible",
            "idk": "i do not know",
            "lol": "laughing out loud"
        }

    @staticmethod
    def _strip_accents(text):
        """Return `text` with combining diacritical marks removed.

        The text is decomposed (NFD) first so that precomposed characters
        such as "é" expose their combining mark; without that step the regex
        only affects input that already happens to be decomposed. The result
        is recomposed (NFC) so that anything that was not stripped is left in
        composed form.
        """
        decomposed = unicodedata.normalize('NFD', text)
        stripped = SyntacticEmbeddingPipeline._COMBINING_MARKS.sub('', decomposed)
        return unicodedata.normalize('NFC', stripped)

    def _glove_dim(self):
        """Return the GloVe vector length, or 0 when the table is empty."""
        sample = next(iter(self.glove.values()), None)
        return 0 if sample is None else len(sample)

    def _pos_dim(self):
        """Return the width of the POS one-hot encoding."""
        return len(self.pos_encoder.categories_[0])

    def _embedding_dim(self):
        """Return the width of one concatenated word vector."""
        return self._pos_dim() + self.word2vec.vector_size + self._glove_dim()

    def _encode_pos(self, pos_tags):
        """One-hot encode a list of POS tags in a single encoder call.

        Returns an array of shape (len(pos_tags), n_pos); an empty list
        yields a (0, n_pos) array because the encoder is not called for it.
        """
        if not pos_tags:
            return np.zeros((0, self._pos_dim()))
        column = np.array(pos_tags).reshape(-1, 1)
        return self.pos_encoder.transform(column).toarray()

    def preprocess_text(self, text):
        """Clean raw text before tagging and embedding lookup.

        Steps, in order: spell correction, emoji removal, newline/tab
        replacement, accent stripping, contraction expansion, removal of
        links/hashtags/markup, and slang expansion via `self.slang_map`.

        Args:
            text (str): Raw input text.

        Returns:
            str: Cleaned text with whitespace runs collapsed to single spaces.
        """
        # NOTE(review): spell correction runs before slang expansion and URL
        # removal, so the speller also sees slang and links; confirm that
        # this order is intended.
        text = self.spell(text)
        text = emoji.replace_emoji(text, replace="")
        text = re.sub(r'[\n\t]', ' ', text)
        text = self._strip_accents(text)
        text = contractions.fix(text)
        text = self._NOISE_PATTERN.sub('', text)
        # Slang lookup is case-insensitive; words not in the map are kept.
        text = ' '.join(self.slang_map.get(word.lower(), word) for word in text.split())
        return text.strip()

    def get_pos_one_hot(self, text):
        """One-hot encode the POS tag of every token in preprocessed text.

        Args:
            text (str): Preprocessed text.

        Returns:
            np.ndarray: Array of shape (n_tokens, n_pos). Every token is
            included, punctuation too.
        """
        doc = self.nlp(text)
        return self._encode_pos([token.pos_ for token in doc])

    def preprocess_for_embeddings(self, text):
        """Lemmatise, lowercase and drop punctuation/whitespace tokens.

        Args:
            text (str): Preprocessed text.

        Returns:
            str: Space-joined lowercased lemmas for Word2Vec/GloVe lookup.
        """
        doc = self.nlp(text)
        lemmas = [
            token.lemma_.lower()
            for token in doc
            if not token.is_punct and not token.is_space
        ]
        return ' '.join(lemmas)

    def get_word2vec_embedding(self, word):
        """Return the Word2Vec vector for `word`, or zeros if it is unknown."""
        vectors = self.word2vec.wv
        if word in vectors:
            return vectors[word]
        return np.zeros(self.word2vec.vector_size)

    def get_glove_embedding(self, word):
        """Return the GloVe vector for `word`, or zeros if it is unknown.

        The zero fallback is built only on a miss, and an empty GloVe table
        yields a zero-length vector instead of raising.
        """
        vector = self.glove.get(word)
        if vector is None:
            return np.zeros(self._glove_dim())
        return vector

    def process_sentence(self, sentence):
        """Build the concatenated representation of each word in a sentence.

        The sentence is parsed once, and the POS tag and lemma are read from
        the same token, so each word is paired with its own tag even when
        punctuation tokens are dropped.

        Args:
            sentence (str): Raw sentence text.

        Returns:
            np.ndarray: Array of shape (n_words, n_pos + w2v_dim + glove_dim);
            it has zero rows when no word survives preprocessing.
        """
        preprocessed_text = self.preprocess_text(sentence)
        doc = self.nlp(preprocessed_text)

        # Keep the lemma and the tag of each retained token together.
        kept = [
            (token.lemma_.lower(), token.pos_)
            for token in doc
            if not token.is_punct and not token.is_space
        ]
        if not kept:
            return np.zeros((0, self._embedding_dim()))

        pos_rows = self._encode_pos([pos for _, pos in kept])

        # Concatenate (POS + Word2Vec + GloVe) for every word.
        rows = [
            np.concatenate([
                pos_row,
                self.get_word2vec_embedding(word),
                self.get_glove_embedding(word),
            ])
            for (word, _), pos_row in zip(kept, pos_rows)
        ]
        return np.array(rows)

    def process_text(self, text):
        """Process a full text that may contain several sentences.

        Args:
            text (str): Raw text.

        Returns:
            np.ndarray: Word vectors of all sentences stacked row-wise; an
            array with zero rows when the text contains no sentences.
        """
        blocks = [self.process_sentence(sent.text) for sent in self.nlp(text).sents]
        if not blocks:
            return np.zeros((0, self._embedding_dim()))
        return np.vstack(blocks)

In [None]:
# Example data.
corpus = ["I love natural language processing.", "Word2Vec is great for syntactic embeddings."]

# Train a toy Word2Vec model on the example corpus. Tokens are lowercased and
# stripped of punctuation because the pipeline looks words up by lowercased
# lemma; training on raw `text.split()` tokens ("I", "processing.") would
# leave most lookups falling back to zero vectors.
sentences = [re.findall(r"\w+", text.lower()) for text in corpus]
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

# Stand-in for real GloVe vectors: random 100-d vectors drawn from a seeded
# generator so repeated runs of this cell produce the same table.
rng = np.random.default_rng(0)
glove_vocab = [
    "i",
    "love",
    "natural",
    "language",
    "processing",
    "asap",
    "word2vec",
    "syntactic",
    "embeddings",
]
glove_embeddings = {word: rng.random(100) for word in glove_vocab}

# POS tags used for the one-hot encoding.
# NOTE(review): tags absent from this list (e.g. "AUX", "PROPN") encode as
# all-zero rows because the encoder ignores unknown categories; the list is
# left unchanged here since it determines the output width.
pos_labels = ["NOUN", "VERB", "ADJ", "ADV", "PRON", "DET", "ADP", "CCONJ", "NUM", "PART", "INTJ"]

# Initialise the pipeline.
pipeline = SyntacticEmbeddingPipeline(word2vec_model, glove_embeddings, pos_labels)

# Process each text and show its word-level embeddings.
for text in corpus:
    embeddings = pipeline.process_text(text)
    print(f"Embeddings para: '{text}'\n{embeddings}\n")