In [1]:
# Carga de datos
import glob


def load_data(input_directory):

    sequence = []
    files = glob.glob(f"{input_directory}/*")
    for file in files:
        with open(file, "rt", encoding="utf-8") as f:
            raw_text = f.read()
            sequence.append((file, raw_text))
    return sequence


sequence = load_data(input_directory="../files/input")
for file, text in sequence:
    print(f"{file}  {text[:70]}")
    
    

../files/input/file2.txt  Electric vehicles are gaining global popularity lately, and along with
../files/input/file3.txt  Global solar irradiation is an important variable that can be used to 
../files/input/file1.txt  It is essential to develop non-precious metal-based alternatives used 


In [5]:
# Clean text
import re


def clean_text(sequence):
    cleaned_sequence = []
    for file, text in sequence:
        cleaned_text = re.sub(r"\n", " ", text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        cleaned_text = cleaned_text.strip()
        cleaned_text = cleaned_text.lower()
        cleaned_sequence.append((file, cleaned_text))
    return cleaned_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
for file, text in cleaned_sequence:
    print(f"{file}  {text[:70]}")
    
    
    
    

../files/input/file2.txt  electric vehicles are gaining global popularity lately, and along with
../files/input/file3.txt  global solar irradiation is an important variable that can be used to 
../files/input/file1.txt  it is essential to develop non-precious metal-based alternatives used 


In [6]:
# Tokenization
import nltk

import nltk
from nltk.tokenize import word_tokenize


### Download punkt tokenizer bypass because ssl is asking for verification
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

## comentario de arriba hasta aca
nltk.download('punkt_tab')


def tokenize(sequence):
    tokenized_sequence = []
    for file, text in sequence:
        tokens = word_tokenize(text)
        tokenized_sequence.append((file, tokens))
    return tokenized_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
for file, text in tokenized_sequence:
    print(f"{file}  {' '.join(text)[:70]}")
    
    
    

../files/input/file2.txt  electric vehicles are gaining global popularity lately , and along wit
../files/input/file3.txt  global solar irradiation is an important variable that can be used to 
../files/input/file1.txt  it is essential to develop non-precious metal-based alternatives used 


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ingmanuelf/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# Remoción de datos ruidosos (Opcion A)
def filter_tokens_a(sequence):
    """Esta solucion puede perder tokens que contienen caracteres no alfabeticos"""
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token.isalpha()]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_a(tokenized_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:70]}")
    

../files/input/file2.txt  electric vehicles are gaining global popularity lately and along with 
../files/input/file3.txt  global solar irradiation is an important variable that can be used to 
../files/input/file1.txt  it is essential to develop alternatives used in hydrogen evolution rea


In [8]:
# Remoción de datos ruidosos (Opcion B)
def filter_tokens_b(sequence):
    """Esta solucion puede perder tokens que contienen caracteres no alfabeticos"""
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [re.sub(r"[^a-zA-Z\s]", " ", token) for token in tokens]
        filtered_tokens = [re.sub(r"\s+", " ", token) for token in filtered_tokens]
        filtered_tokens = [token.strip() for token in filtered_tokens]
        filtered_tokens = [token for token in filtered_tokens if token != ""]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:70]}")


../files/input/file2.txt  electric vehicles are gaining global popularity lately and along with 
../files/input/file3.txt  global solar irradiation is an important variable that can be used to 
../files/input/file1.txt  it is essential to develop non precious metal based alternatives used 


In [9]:
# Remove the stopwords
nltk.download("stopwords")


def remove_stopwords(sequence):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token not in stop_words]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:70]}")
    nltk.download("stopwords")


../files/input/file2.txt  electric vehicles gaining global popularity lately along efficient bat
../files/input/file3.txt  global solar irradiation important variable used determine suitability
../files/input/file1.txt  essential develop non precious metal based alternatives used hydrogen 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ingmanuelf/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ingmanuelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ingmanuelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ingmanuelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import os
import textwrap


def save_data(output_directory, sequence):

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for file, tokens in sequence:
        file = file.replace("\\", "/")
        with open(
            f"{output_directory}/{file.split('/')[-1]}",
            "wt",
            encoding="utf-8",
        ) as f:
            f.write(textwrap.fill(" ".join(tokens), width=50))


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
save_data(output_directory="../files/output", sequence=filtered_sequence)


# 
    