In [1]:
import string
from nltk.stem import *
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import joblib
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Preprocesamiento
Se limpian los datos retirando las stopwords que se indica en el archivo ademas de aplicar la tecnica de stimming a todo el corpus

In [2]:
CORPUS_DIR = "reuters/training"
documents = {}
with open('reuters/stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(word.strip() for word in file.readlines())

In [3]:
def clean_text(*, text, stopwords):
    text = re.sub(r'\d+', '', text)
    tokens = text.lower().translate(str.maketrans('', '', string.punctuation)).split(" ")
    stemmer = SnowballStemmer("spanish")
    no_stw = [token for token in tokens if token not in stopwords]
    stemmed_tokens = [stemmer.stem(token) for token in no_stw]
    text_cleaned = " ".join(stemmed_tokens)
    return text_cleaned

Obtencion de un diccionario de textos limpios y libres de stopwords

In [4]:
for filename in os.listdir(CORPUS_DIR):
    if filename.endswith(".txt"):
        filepath = os.path.join(CORPUS_DIR, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = clean_text(text=text,stopwords=stop_words)
            documents[filename] = cleaned_text

Creacion de directorios

In [5]:
folders = ['API_resources', 'API_resources/bow', 'API_resources/tfidf']
for folder in folders:
    if not os.path.exists(folder):
        os.makedirs(folder)

## Aplicacion de Tecnicas de Vectorizacion
Se empleara la libreria scikit-learn para la aplicacion de BoW y Tf-Idf

In [5]:
document_names = list(documents.keys())
joblib.dump(document_names, 'API_resources/document_names.joblib')

['API_resources/document_names.joblib']

### Aplicacion de BoW

In [6]:
vectorizer_bow = CountVectorizer()
bow_counts = vectorizer_bow.fit_transform(documents.values())
onehot = Binarizer()
bow_counts = onehot.fit_transform(bow_counts.toarray())
print(vectorizer_bow.get_feature_names_out())
print(bow_counts)

['aa' 'aaa' 'aachen' ... 'zuyu' 'zverev' 'zzzz']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Guardamos los objetos para usarlos proximamente en la API

In [27]:
joblib.dump(vectorizer_bow, 'API_resources/bow/vectorizer_bow.joblib')
joblib.dump(bow_counts,'API_resources/bow/bow_counts.joblib')

['API_resources/bow/onehot.joblib']

### Aplicacion de tf-Idf

In [7]:
vectorizer_tfidf = TfidfVectorizer()
tfidf_counts = vectorizer_tfidf.fit_transform(documents.values())
print(vectorizer_tfidf.get_feature_names_out())
print(tfidf_counts.toarray())

['aa' 'aaa' 'aachen' ... 'zuyu' 'zverev' 'zzzz']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Guardamos los objetos para usarlos proximamente en la API

In [30]:
joblib.dump(vectorizer_tfidf, 'API_resources/tfidf/vectorizer_tfidf.joblib')
joblib.dump(tfidf_counts,'API_resources/tfidf/tfidf_counts.joblib')

['API_resources/tfidf/tfidf_counts.joblib']

# Prueba de funcionamiento

In [8]:
# Carga los objetos guardados
vectorizer_tfidf = joblib.load('API_resources/tfidf/vectorizer_tfidf.joblib')
tfidf_counts = joblib.load('API_resources/tfidf/tfidf_counts.joblib')

In [9]:
print(f"Número de documentos: {len(documents)}")
print(f"Tamaño de tfidf_counts: {tfidf_counts.shape[0]}")

Número de documentos: 7769
Tamaño de tfidf_counts: 7769


In [10]:
def process_query(query, vectorizer, tfidf_matrix):
    # Limpia y vectoriza la consulta
    cleaned_query = clean_text(text=query, stopwords=stop_words)
    query_vec = vectorizer.transform([cleaned_query])
    
    # Calcula la similitud coseno
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Obtiene los índices de los documentos más similares
    related_docs_indices = cosine_similarities.argsort()[:-6:-1]
    
    return related_docs_indices, cosine_similarities

In [14]:
# Realiza una consulta
query = "rye"
related_docs_indices, cosine_similarities = process_query(query, vectorizer_tfidf, tfidf_counts)

# Muestra los resultados
document_names = list(documents.keys())
for idx in related_docs_indices:
    if idx < len(document_names) and cosine_similarities[idx] > 0:
        print(f"Documento: {document_names[idx]}")
        print(f"Similitud coseno: {cosine_similarities[idx]}")
        print("---")

Documento: 3132.txt
Similitud coseno: 0.3247133757319705
---
Documento: 97.txt
Similitud coseno: 0.09721889984305128
---
Índice 7768 fuera de rango. Tamaño de document_names: 7769
Índice 2592 fuera de rango. Tamaño de document_names: 7769
Índice 2581 fuera de rango. Tamaño de document_names: 7769
