In [1]:
# Carga de datos
import glob
import os


def load_data(input_directory):
    """Read every regular file directly inside ``input_directory`` as UTF-8 text.

    Args:
        input_directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``(path, raw_text)`` tuples, sorted by path so the order is
        the same on every run and platform.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
        OSError: If a file cannot be opened or read.
    """
    # glob.glob returns matches in arbitrary order; sort them so downstream
    # steps and the printed previews are reproducible.
    paths = sorted(glob.glob(f"{input_directory}/*"))

    sequence = []
    for path in paths:
        # The "/*" pattern also matches sub-directories, which open() cannot
        # read as text; skip anything that is not a regular file.
        if not os.path.isfile(path):
            continue
        with open(path, "rt", encoding="utf-8") as f:
            sequence.append((path, f.read()))
    return sequence


# Load every input file and preview the first 70 characters of each document.
sequence = load_data(input_directory="../files/input")
for file, text in sequence:
    # Two spaces separate the file path from the text preview.
    print(f"{file}  {text[:70]}")

../files/input\file1.txt  It is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  Electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  Global solar irradiation is an important variable that can be used to 


In [2]:
import re


def clean_text(sequence):
    """Normalize whitespace and case for each document.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, leading/trailing whitespace is removed, and the text is
    lower-cased.

    Args:
        sequence: Iterable of ``(file, text)`` pairs.

    Returns:
        A new list of ``(file, cleaned_text)`` tuples in the same order; the
        input is not modified.
    """
    cleaned_sequence = []
    for file, text in sequence:
        # r"\s+" already matches newlines, so a separate "\n" -> " " pass is
        # unnecessary: one substitution collapses every whitespace run.
        cleaned_text = re.sub(r"\s+", " ", text).strip().lower()
        cleaned_sequence.append((file, cleaned_text))
    return cleaned_sequence


# Reload the raw files, then clean them, so this cell rebuilds its own inputs.
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
# Preview the first 70 characters of each cleaned document.
for file, text in cleaned_sequence:
    print(f"{file}  {text[:70]}")

../files/input\file1.txt  it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [8]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

#nltk.download("punkt_tab")


def tokenize(sequence):
    """Split each document's text into tokens with NLTK's ``word_tokenize``.

    Args:
        sequence: Iterable of ``(file, text)`` pairs.

    Returns:
        A list of ``(file, tokens)`` tuples in the same order, where
        ``tokens`` is whatever ``word_tokenize`` returns for that text.
    """
    # NOTE(review): word_tokenize presumably needs the "punkt_tab" resource
    # (see the commented-out nltk.download call in this cell) -- confirm.
    return [(path, word_tokenize(document)) for path, document in sequence]


# Rebuild the pipeline up to tokenization: load -> clean -> tokenize.
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
# Here `text` is the token list of each document; the tokens are re-joined
# with single spaces only to preview the first 70 characters.
for file, text in tokenized_sequence:
    print(f"{file}  {' '.join(text)[:70]}")

../files/input\file1.txt  it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately , and along wit
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [None]:
# Remoción de datos ruidosos (Opción A)
def filter_tokens_a(sequence):
    """Keep only the tokens made up entirely of alphabetic characters.

    Caveat: this approach can lose tokens that contain non-alphabetic
    characters (anything for which ``str.isalpha()`` is false is dropped).

    Args:
        sequence: Iterable of ``(file, tokens)`` pairs.

    Returns:
        A list of ``(file, filtered_tokens)`` tuples in the same order.
    """
    return [
        (path, [word for word in words if word.isalpha()])
        for path, words in sequence
    ]


# Rebuild the full pipeline: load -> clean -> tokenize -> filter (option A).
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
# Keeps only purely alphabetic tokens; the result is stored, not displayed here.
filtered_sequence = filter_tokens_a(tokenized_sequence)