In [7]:
# %%
# Carga de datos
import glob


def load_data(input_directory):

    sequence = []
    files = glob.glob(f"{input_directory}/*")
    for file in files:
        with open(file, "rt", encoding="utf-8") as f:
            raw_text = f.read()
            sequence.append((file, raw_text))
    return sequence


sequence = load_data(input_directory="../files/input")
for file, text in sequence:
    print(f"{file}  {text[:70]}")
    

../files/input\file1.txt  It is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  Electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  Global solar irradiation is an important variable that can be used to 


In [8]:
# %%
# Clean text
import re


def clean_text(sequence):
    cleaned_sequence = []
    for file, text in sequence:
        cleaned_text = re.sub(r"\n", " ", text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        cleaned_text = cleaned_text.strip()
        cleaned_text = cleaned_text.lower()
        cleaned_sequence.append((file, cleaned_text))
    return cleaned_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
for file, text in cleaned_sequence:
    print(f"{file}  {text[:70]}")

../files/input\file1.txt  it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [None]:
# %%
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

# nltk.download("punkt_tab")


def tokenize(sequence):
    tokenized_sequence = []
    for file, text in sequence:
        tokens = word_tokenize(text)
        tokenized_sequence.append((file, tokens))
    return tokenized_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
nltk.download("punkt_tab")
tokenized_sequence = tokenize(cleaned_sequence)
for file, text in tokenized_sequence:
    print(f"{file}  {' '.join(text)[:70]}")
    
    

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\pipe/nltk_data'
    - 'c:\\Users\\pipe\\Desktop\\Unal\\Materia\\Fundamentos de analisis\\PRE-10-tokenizacion-de-texto-Lemus2901\\.venv\\nltk_data'
    - 'c:\\Users\\pipe\\Desktop\\Unal\\Materia\\Fundamentos de analisis\\PRE-10-tokenizacion-de-texto-Lemus2901\\.venv\\share\\nltk_data'
    - 'c:\\Users\\pipe\\Desktop\\Unal\\Materia\\Fundamentos de analisis\\PRE-10-tokenizacion-de-texto-Lemus2901\\.venv\\lib\\nltk_data'
    - 'C:\\Users\\pipe\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
import textwrap

for file, text in tokenized_sequence:
  print(textwrap.fill(' '.join(text)))
  print()
  print()

it is essential to develop non-precious metal-based alternatives used
in hydrogen evolution reaction ( her ) due to high cost and scarcity
of pt-based catalysts . herein , through density functional theory (
dft ) calculations , the her activity over 26 single-atom anchored
phosphorus carbide ( pc3 ) monolayer ( tm @ pc3 ) has been
systematically investigated . results indicate that δg * h of v , fe ,
nb , mo , and pd @ pc3 are lower than that of pt ( 1 1 1 ) catalyst ,
with 0.03 , −0.03 , −0.07 , −0.04 , and − 0.02 ev , respectively . by
imposing the criterion window ( −0.2 ≤ δg * h ≤ 0.2 ev ) , the d band
centre ( εd ) for catalysts with excellent her ability is in the range
of − 0.68–0.41 ev . besides , the five promising her catalysts follow
volmer-tafel mechanism . fe , nb , and mo @ pc3 show activation
barriers of 0.75 , 0.74 , and 0.55 ev , lower than that of pt .
machine learning ( ml ) was employed to explore the intrinsic
relationship between catalytic performance and feature

In [None]:
# Remoción de datos ruidosos (Opcion A)
def filter_tokens_a(sequence):
    """Esta solucion puede perder tokens que contienen caracteres no alfabeticos"""
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token.isalpha()]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence


In [None]:
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_a(tokenized_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:75]}")

../files/input\file1.txt  it is essential to develop alternatives used in hydrogen evolution reaction
../files/input\file2.txt  electric vehicles are gaining global popularity lately and along with it ef
../files/input\file3.txt  global solar irradiation is an important variable that can be used to deter


In [None]:

for file, text in tokenized_sequence:
  print(textwrap.fill(' '.join(text)))
  print()
  print()

it is essential to develop non-precious metal-based alternatives used
in hydrogen evolution reaction ( her ) due to high cost and scarcity
of pt-based catalysts . herein , through density functional theory (
dft ) calculations , the her activity over 26 single-atom anchored
phosphorus carbide ( pc3 ) monolayer ( tm @ pc3 ) has been
systematically investigated . results indicate that δg * h of v , fe ,
nb , mo , and pd @ pc3 are lower than that of pt ( 1 1 1 ) catalyst ,
with 0.03 , −0.03 , −0.07 , −0.04 , and − 0.02 ev , respectively . by
imposing the criterion window ( −0.2 ≤ δg * h ≤ 0.2 ev ) , the d band
centre ( εd ) for catalysts with excellent her ability is in the range
of − 0.68–0.41 ev . besides , the five promising her catalysts follow
volmer-tafel mechanism . fe , nb , and mo @ pc3 show activation
barriers of 0.75 , 0.74 , and 0.55 ev , lower than that of pt .
machine learning ( ml ) was employed to explore the intrinsic
relationship between catalytic performance and feature

In [None]:
# Remoción de datos ruidosos (Opcion A)
def filter_tokens_a(sequence):
    """Esta solucion puede perder tokens que contienen caracteres no alfabeticos"""
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token.isalpha()]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

In [None]:
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_a(tokenized_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:75]}")

../files/input\file1.txt  it is essential to develop alternatives used in hydrogen evolution reaction
../files/input\file2.txt  electric vehicles are gaining global popularity lately and along with it ef
../files/input\file3.txt  global solar irradiation is an important variable that can be used to deter


In [None]:
for file, text in filtered_sequence:
    print(textwrap.fill(' '.join(text)))
    print()
    print()  

it is essential to develop alternatives used in hydrogen evolution
reaction her due to high cost and scarcity of catalysts herein through
density functional theory dft calculations the her activity over
anchored phosphorus carbide monolayer tm has been systematically
investigated results indicate that δg h of v fe nb mo and pd are lower
than that of pt catalyst with and ev respectively by imposing the
criterion window δg h ev the d band centre εd for catalysts with
excellent her ability is in the range of ev besides the five promising
her catalysts follow mechanism fe nb and mo show activation barriers
of and ev lower than that of pt machine learning ml was employed to
explore the intrinsic relationship between catalytic performance and
feature parameters we demonstrated that the first ionization energy
bond length of tm h and d band center are more correlated with
hydrogen adsorption behaviour our work not only predicts that fe nb
and mo can be substitutes for pt metal in her but also

In [None]:
# Remoción de datos ruidosos (Opcion B)
def filter_tokens_b(sequence):
    """Esta solucion puede perder tokens que contienen caracteres no alfabeticos"""
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [re.sub(r"[^a-zA-Z\s]", " ", token) for token in tokens]
        filtered_tokens = [re.sub(r"\s+", " ", token) for token in filtered_tokens]
        filtered_tokens = [token.strip() for token in filtered_tokens]
        filtered_tokens = [token for token in filtered_tokens if token != ""]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

In [None]:
sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:70]}")

../files/input\file1.txt  it is essential to develop non precious metal based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately and along with 
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [None]:
for file, text in filtered_sequence:
    print(textwrap.fill(' '.join(text)))
    print()
    print()

it is essential to develop non precious metal based alternatives used
in hydrogen evolution reaction her due to high cost and scarcity of pt
based catalysts herein through density functional theory dft
calculations the her activity over single atom anchored phosphorus
carbide pc monolayer tm pc has been systematically investigated
results indicate that g h of v fe nb mo and pd pc are lower than that
of pt catalyst with and ev respectively by imposing the criterion
window g h ev the d band centre d for catalysts with excellent her
ability is in the range of ev besides the five promising her catalysts
follow volmer tafel mechanism fe nb and mo pc show activation barriers
of and ev lower than that of pt machine learning ml was employed to
explore the intrinsic relationship between catalytic performance and
feature parameters we demonstrated that the first ionization energy
bond length of tm h and d band center are more correlated with
hydrogen adsorption behaviour our work not only predic

In [None]:
# Remove the stopwords
nltk.download("stopwords")


def remove_stopwords(sequence):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token not in stop_words]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
for file, text in filtered_sequence:
    print(f"{file}  {' '.join(text)[:70]}")   

../files/input\file1.txt  essential develop non precious metal based alternatives used hydrogen 
../files/input\file2.txt  electric vehicles gaining global popularity lately along efficient bat
../files/input\file3.txt  global solar irradiation important variable used determine suitability


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Save to disk
import os
import textwrap


def save_data(output_directory, sequence):

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for file, tokens in sequence:
        file = file.replace("\\", "/")
        with open(
            f"{output_directory}/{file.split('/')[-1]}",
            "wt",
            encoding="utf-8",
        ) as f:
            f.write(textwrap.fill(" ".join(tokens), width=70))


sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
save_data(output_directory="../files/output", sequence=filtered_sequence)