In [6]:
import sys
import os
import numpy as np
import torch
import pandas as pd
import math

import docx

In [7]:
doc_names = os.listdir('docs')

In [8]:
docs = [
	docx.Document(f'docs/{d}')
	for d
	in os.listdir('docs')
]
print(f"{len(docs)} documents loaded")

12 documents loaded


In [9]:
texts = ['\n'.join(p.text.lower() for p in d.paragraphs if p.text)
    for d
    in docs ]
print(texts)

['la realidad aumentada transforma la experiencia móvil\ncon la tecnología de realidad aumentada (ar) cada vez más avanzada, los dispositivos móviles ofrecen experiencias nunca antes vistas. la app arworld, lanzada recientemente, permite a los usuarios visualizar muebles en sus propios hogares antes de comprarlos, o incluso probarse ropa virtualmente. las posibilidades son infinitas y los desarrolladores de apps están trabajando arduamente para aprovechar el potencial de ar en el mundo móvil.', 'ecodrive: el coche eléctrico económico para todos\nla revolución eléctrica no se detiene. la compañía automovilística ecomovers ha lanzado su modelo ecodrive, un vehículo eléctrico pensado para ser asequible para el gran público. con una autonomía respetable de 350 kilómetros y un precio competitivo, ecodrive busca ser la opción ideal para aquellos que desean dar el salto al mundo eléctrico sin vaciar sus bolsillos. las reservas ya están disponibles y se espera una alta demanda.', 'integración 

In [10]:
def remove_punctuation(txt):
    txt = txt.replace(',', '')
    txt = txt.replace('.', '')
    txt = txt.replace(';', '')
    txt = txt.replace(':', '')
    txt = txt.replace('?', '')
    txt = txt.replace('¿', '')
    txt = txt.replace('!', '')
    txt = txt.replace('¡', '')
    for i in range(10):
        txt = txt.replace(f'{i}', '')
    return txt

In [11]:
texts_2 = [remove_punctuation(t) for t in texts]

In [12]:
print(texts_2[0])

la realidad aumentada transforma la experiencia móvil
con la tecnología de realidad aumentada (ar) cada vez más avanzada los dispositivos móviles ofrecen experiencias nunca antes vistas la app arworld lanzada recientemente permite a los usuarios visualizar muebles en sus propios hogares antes de comprarlos o incluso probarse ropa virtualmente las posibilidades son infinitas y los desarrolladores de apps están trabajando arduamente para aprovechar el potencial de ar en el mundo móvil


In [13]:
def remove_stop_words(txt):
    stop_words = {"de", "la", "que", "el", "en", "y", "a", "los", "se", "del", "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre", "también", "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu", "tus", "ellas", "nosotras", "vosotros", "vosotras", "os", "mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya", "suyos", "suyas", "nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", "vuestros", "vuestras", "esos", "esas", "estoy", "estás", "está", "estamos", "estáis", "están", "esté", "estés", "estemos", "estéis", "estén", "estaré", "estarás", "estará", "estaremos", "estaréis", "estarán", "estaría", "estarías", "estaríamos", "estaríais", "estarían", "estaba", "estabas", "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo", "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras", "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado", "estada", "estados", "estadas", "estad", "he", "has", "ha", "hemos", "habéis", "han", "haya", "hayas", "hayamos", "hayáis", "hayan", "habré", "habrás", "habrá", "habremos", "habréis", "habrán", "habría", "habrías", "habríamos", "habríais", "habrían", "había", "habías", "habíamos", "habíais", "habían", "hube", "hubiste", "hubo", "hubimos", "hubisteis", "hubieron", "hubiera", "hubieras", "hubiéramos", "hubierais", "hubieran", "hubiese", "hubieses", "hubiésemos", "hubieseis", "hubiesen", "habiendo", "habido", "habida", "habidos", "habidas", "soy", "eres", "es"}
    return [t for t in txt if t not in stop_words]

In [14]:
texts_3 = [remove_stop_words(t.split()) for t in texts_2]
texts_3[0]

['realidad',
 'aumentada',
 'transforma',
 'experiencia',
 'móvil',
 'tecnología',
 'realidad',
 'aumentada',
 '(ar)',
 'cada',
 'vez',
 'avanzada',
 'dispositivos',
 'móviles',
 'ofrecen',
 'experiencias',
 'nunca',
 'vistas',
 'app',
 'arworld',
 'lanzada',
 'recientemente',
 'permite',
 'usuarios',
 'visualizar',
 'muebles',
 'propios',
 'hogares',
 'comprarlos',
 'incluso',
 'probarse',
 'ropa',
 'virtualmente',
 'posibilidades',
 'son',
 'infinitas',
 'desarrolladores',
 'apps',
 'trabajando',
 'arduamente',
 'aprovechar',
 'potencial',
 'ar',
 'mundo',
 'móvil']

In [15]:
vocabulary = {word for text in texts_3 for word in text}
print(len(vocabulary))
print(vocabulary)

419
{'gobierno', 'presentó', 'trimestres', 'garantizar', 'ofrecerá', 'además', 'diversas', 'momentos', 'distribución', 'esperan', 'evidente', 'compacto', 'autovolt', 'nuevo', 'realidad', 'energía', 'arworld', 'desarrolladores', 'coche', 'alta', 'descuento', 'específico', 'solo', 'puntos', 'subvención', 'empresas', 'permite', 'tendencia', 'asequible', 'aumentando', 'comerciales', 'virtualmente', 'año"', 'cotidiana', 'muestra', 'red', 'busca', 'grado', 'paquete', 'procesador', 'constante', 'expertos', 'visualizar', 'protegidos', 'mira', 'dispositivos', 'celebran', 'cerca', 'aplicaciones', 'trabajando', 'próximo', 'revolución', 'carga', 'vehículos', 'auge', 'creciente', 'bajo', 'vistas', 'meses', 'reducir', 'tres', 'si', 'precio', 'incorpora', 'empresa', 'apps', 'permitiendo', 'gran', 'lanzamiento', 'compradores', 'agigantados', 'capaces', 'anuncio', 'fluida', 'ram', 'experimentando', 'demanda', 'áreas', 'encriptación', 'transforma', 'causarán', 'mayores', 'fabricantes', 'moderado', 'crec

In [16]:
idx_to_text = list(vocabulary)
text_to_idx = {word: id for id, word in enumerate(idx_to_text)}
N = len(vocabulary)

In [17]:
idx_to_text[text_to_idx['modelo']]

'modelo'

In [18]:
def create_ohe_vector(idx, n):
    v = [0 for _ in range(n)]
    v[idx] = 1
    return np.array(v)

In [19]:
tokenized_texts = [
    sum(create_ohe_vector(text_to_idx[word], N) for word in text)
    for text in texts_3
]
tokenized_texts

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
tokenized_texts_torch = [
    torch.from_numpy(text).type(torch.float32)
    for text in tokenized_texts
]
tokenized_texts_torch[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [21]:
cos = torch.nn.CosineSimilarity(dim=0)

cos(tokenized_texts_torch[0], tokenized_texts_torch[1])
num_docs = len(tokenized_texts_torch)
similarity_matrix = np.ones((num_docs, num_docs))

for i in range(num_docs):
    for j in range(i):
        similarity = cos(tokenized_texts_torch[i], tokenized_texts_torch[j])
        similarity_matrix[i, j] = similarity
        similarity_matrix[j, i] = similarity

In [22]:
pd.DataFrame(similarity_matrix, columns=doc_names, index=doc_names)

Unnamed: 0,doc07.docx,doc08.docx,doc11.docx,doc04.docx,doc09.docx,doc02.docx,doc03.docx,doc01.docx,doc10.docx,doc05.docx,doc06.docx,doc12.docx
doc07.docx,1.0,0.018547,0.192232,0.057703,0.0,0.125501,0.10421,0.032556,0.060634,0.0,0.130984,0.160899
doc08.docx,0.018547,1.0,0.090917,0.127357,0.085498,0.033918,0.016429,0.015397,0.019118,0.189219,0.0354,0.035122
doc11.docx,0.192232,0.090917,1.0,0.134693,0.16457,0.188326,0.0,0.102591,0.070767,0.112066,0.131036,0.303352
doc04.docx,0.057703,0.127357,0.134693,1.0,0.212798,0.0,0.0,0.079839,0.0,0.098115,0.073422,0.23068
doc09.docx,0.0,0.085498,0.16457,0.212798,1.0,0.0,0.0,0.0,0.037268,0.202871,0.017252,0.308094
doc02.docx,0.125501,0.033918,0.188326,0.0,0.0,1.0,0.095286,0.029768,0.110883,0.0,0.102658,0.11317
doc03.docx,0.10421,0.016429,0.0,0.0,0.0,0.095286,1.0,0.0,0.089514,0.0,0.03315,0.054816
doc01.docx,0.032556,0.015397,0.102591,0.079839,0.0,0.029768,0.0,1.0,0.033558,0.099641,0.124274,0.06165
doc10.docx,0.060634,0.019118,0.070767,0.0,0.037268,0.110883,0.089514,0.033558,1.0,0.041239,0.077152,0.089304
doc05.docx,0.0,0.189219,0.112066,0.098115,0.202871,0.0,0.0,0.099641,0.041239,1.0,0.05727,0.063135


In [23]:
similar_pairs = [(doc_names[j], doc_names[i], similarity_matrix[i, j]) for i in range(12) for j in range(i) if similarity_matrix[i, j] > 0.15]
pd.DataFrame(similar_pairs, columns=["left document", "right document", "similarity"]).sort_values("similarity", ascending=False)

Unnamed: 0,left document,right document,similarity
9,doc09.docx,doc12.docx,0.308094
7,doc11.docx,doc12.docx,0.303352
8,doc04.docx,doc12.docx,0.23068
2,doc04.docx,doc09.docx,0.212798
5,doc09.docx,doc05.docx,0.202871
0,doc07.docx,doc11.docx,0.192232
4,doc08.docx,doc05.docx,0.189219
3,doc11.docx,doc02.docx,0.188326
1,doc11.docx,doc09.docx,0.16457
6,doc07.docx,doc12.docx,0.160899


In [24]:
def tf_idf(term, document, documents):
    # TF
    words = document  # Dividir el documento en palabras
    tf = words.count(term) / len(words)  # Calcular TF
    # print("TF:", tf)
    
    # IDF
    n_documents_with_term = 0  # Contador de documentos con el término
    for doc in documents:
        if term in doc:  # Si el término está en el documento
            n_documents_with_term += 1  # Incrementar el contador
    if n_documents_with_term != 0:
        idf = math.log(len(documents) / n_documents_with_term)  # Calcular IDF
    else:
        idf = 0
    # print("IDF:", idf)

    # TF-IDF
    tf_idf = tf * idf  # Calcular TF-IDF

    return tf_idf

In [25]:
tf_idf

<function __main__.tf_idf(term, document, documents)>