In [1]:
import numpy as np

# Calcular similitud de palabras
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

# Leer el conjunto de datos desde un archivo de texto plano (reemplaza 'dataset.txt' con tu archivo)
with open('Reglamento_transito.txt', 'r', encoding='utf-8') as file:
    sentences = [line.strip().split() for line in file]

# Crear un vocabulario único de palabras
vocab = list(set(word for sentence in sentences for word in sentence))
vocab.sort()

# Crear una matriz de co-ocurrencia
window_size = 2
co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))

for sentence in sentences:
    for i, target_word in enumerate(sentence):
        target_index = vocab.index(target_word)
        context_window = sentence[max(0, i - window_size):i] + sentence[i + 1:i + window_size + 1]

        for context_word in context_window:
            context_index = vocab.index(context_word)
            co_occurrence_matrix[target_index][context_index] += 1

# Aplicar SVD para obtener vectores de palabras
U, S, Vt = np.linalg.svd(co_occurrence_matrix)

# Reducir la dimensionalidad (opcional)
vector_size = 100
word_vectors = U[:, :vector_size]

# Consultar el vector de una palabra específica
target_word = "peatón"
target_index = vocab.index(target_word)
vector = word_vectors[target_index]
print(f"Vector de '{target_word}':", vector)


similar_words = {}
for word in vocab:
    word_index = vocab.index(word)
    similarity = cosine_similarity(word_vectors[target_index], word_vectors[word_index])
    similar_words[word] = similarity

sorted_similar_words = sorted(similar_words.items(), key=lambda x: x[1], reverse=True)
top_similar_words = sorted_similar_words[:3]
print(f"Palabras similares a '{target_word}': {top_similar_words}")


Vector de 'peatón': [-0.00079804  0.00026729  0.00283094  0.00512318 -0.00243684  0.00332727
  0.00076656 -0.00782469  0.00105726 -0.00263226  0.000217   -0.00305116
 -0.00186844 -0.00284281 -0.00581306 -0.00341293  0.00046467 -0.0063665
  0.00072817 -0.00285218 -0.00592317 -0.00066303  0.00819733 -0.00112108
  0.01057125 -0.01025103  0.00588627  0.00786563 -0.00183012  0.00454674
  0.0048258   0.00361952  0.00106194  0.00541633 -0.0012307   0.00246343
  0.00268157 -0.00765443  0.00314659  0.0042031   0.00549844 -0.00346648
 -0.00828313  0.00716317 -0.00845947 -0.00309628  0.00497665  0.00245242
  0.00364914  0.00092612  0.00385434  0.01145598 -0.0116412  -0.00776407
 -0.0026741  -0.00374533  0.005307   -0.004415   -0.00409443  0.02256503
 -0.0066718   0.01047302  0.0028373  -0.00632983  0.00768044 -0.00363937
  0.00093727 -0.00594666 -0.00758751  0.00386622 -0.01032235  0.00078467
  0.00122991  0.01080539 -0.00176178 -0.00445706  0.00969965  0.00144843
  0.00911687  0.00770672 -0.0121

  similarity = dot_product / (norm1 * norm2)
