In [22]:
import string
from nltk.stem import *
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import joblib

# Preprocesamiento
Se limpian los datos retirando las stopwords que se indica en el archivo ademas de aplicar la tecnica de stimming a todo el corpus

In [23]:
CORPUS_DIR = "reuters/training"
documents = {}
with open('reuters/stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(word.strip() for word in file.readlines())

In [24]:
def clean_text(*, text, stopwords):
    text = re.sub(r'\d+', '', text)
    tokens = text.lower().translate(str.maketrans('', '', string.punctuation)).split(" ")
    stemmer = SnowballStemmer("spanish")
    no_stw = [token for token in tokens if token not in stopwords]
    stemmed_tokens = [stemmer.stem(token) for token in no_stw]
    text_cleaned = " ".join(stemmed_tokens)
    return text_cleaned

Obtencion de un diccionario de textos limpios y libres de stopwords

In [25]:
for filename in os.listdir(CORPUS_DIR):
    if filename.endswith(".txt"):
        filepath = os.path.join(CORPUS_DIR, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = clean_text(text=text,stopwords=stop_words)
            documents[filename] = cleaned_text

Creacion de directorios

In [None]:
folders = ['API_resources', 'API_resources/bow', 'API_resources/tfidf']
for folder in folders:
    if not os.path.exists(folder):
        os.makedirs(folder)

## Aplicacion de Tecnicas de Vectorizacion
Se empleara la libreria scikit-learn para la aplicacion de BoW y Tf-Idf

### Aplicacion de BoW

In [26]:
vectorizer_bow = CountVectorizer()
bow_counts = vectorizer_bow.fit_transform(documents.values())
onehot = Binarizer()
bow_counts = onehot.fit_transform(bow_counts.toarray())
print(vectorizer_bow.get_feature_names_out())
print(bow_counts)

['aa' 'aaa' 'aachen' ... 'zuyu' 'zverev' 'zzzz']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Guardamos los objetos para usarlos proximamente en la API

In [27]:
joblib.dump(vectorizer_bow, 'API_resources/bow/vectorizer_bow.joblib')
joblib.dump(bow_counts,'API_resources/bow/bow_counts.joblib')
joblib.dump(onehot, 'API_resources/bow/onehot.joblib')

['API_resources/bow/onehot.joblib']

### Aplicacion de tf-Idf

In [28]:
vectorizer_tfidf = TfidfVectorizer()
tfidf_counts = vectorizer_tfidf.fit_transform(documents.values())
print(vectorizer_tfidf.get_feature_names_out())
print(tfidf_counts.toarray())

['aa' 'aaa' 'aachen' ... 'zuyu' 'zverev' 'zzzz']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Guardamos los objetos para usarlos proximamente en la API

In [30]:
joblib.dump(vectorizer_tfidf, 'API_resources/tfidf/vectorizer_tfidf.joblib')
joblib.dump(tfidf_counts,'API_resources/tfidf/tfidf_counts.joblib')

['API_resources/tfidf/tfidf_counts.joblib']