In [1]:
import glob

In [2]:
def load_data(input_directory):
    sequence = []
    files = glob.glob(f"{input_directory}/*")
    for file in files:
        with open(file, "rt", encoding="utf-8") as f:
            raw_text = f.read()
            sequence.append((file, raw_text))
    return sequence

In [3]:
sequence = load_data(input_directory="../files/input")
for file, text in sequence:
    print(f"{file} {text[:70]}")

../files/input\file1.txt It is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt Electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt Global solar irradiation is an important variable that can be used to 


In [5]:
import re

def clean_text(sequence):
    cleaned_sequence = []
    for file, text in sequence:
        cleaned_text = re.sub(r"\n", " ", text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        cleaned_text = cleaned_text.strip()
        cleaned_text = cleaned_text.lower()
        cleaned_sequence.append((file, cleaned_text))
    return cleaned_sequence

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
for file, text in cleaned_sequence:
    print(f"{file} {text[:70]}")

../files/input\file1.txt it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt global solar irradiation is an important variable that can be used to 


In [7]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt_tab")

def tokenize(sequence):
    tokenized_sequence = []
    for file, text in sequence:
        tokens = word_tokenize(text)
        tokenized_sequence.append((file, tokens))
    return tokenized_sequence

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
for file, text in tokenized_sequence:
    print(f"{file} {' '.join(text)[:70]}")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Crimson\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


../files/input\file1.txt it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt electric vehicles are gaining global popularity lately , and along wit
../files/input\file3.txt global solar irradiation is an important variable that can be used to 


In [8]:
tokenized_sequence[0][1]

['it',
 'is',
 'essential',
 'to',
 'develop',
 'non-precious',
 'metal-based',
 'alternatives',
 'used',
 'in',
 'hydrogen',
 'evolution',
 'reaction',
 '(',
 'her',
 ')',
 'due',
 'to',
 'high',
 'cost',
 'and',
 'scarcity',
 'of',
 'pt-based',
 'catalysts',
 '.',
 'herein',
 ',',
 'through',
 'density',
 'functional',
 'theory',
 '(',
 'dft',
 ')',
 'calculations',
 ',',
 'the',
 'her',
 'activity',
 'over',
 '26',
 'single-atom',
 'anchored',
 'phosphorus',
 'carbide',
 '(',
 'pc3',
 ')',
 'monolayer',
 '(',
 'tm',
 '@',
 'pc3',
 ')',
 'has',
 'been',
 'systematically',
 'investigated',
 '.',
 'results',
 'indicate',
 'that',
 'δg',
 '*',
 'h',
 'of',
 'v',
 ',',
 'fe',
 ',',
 'nb',
 ',',
 'mo',
 ',',
 'and',
 'pd',
 '@',
 'pc3',
 'are',
 'lower',
 'than',
 'that',
 'of',
 'pt',
 '(',
 '1',
 '1',
 '1',
 ')',
 'catalyst',
 ',',
 'with',
 '0.03',
 ',',
 '−0.03',
 ',',
 '−0.07',
 ',',
 '−0.04',
 ',',
 'and',
 '−',
 '0.02',
 'ev',
 ',',
 'respectively',
 '.',
 'by',
 'imposing',
 'the'

In [9]:
def filter_tokens_a(sequence):
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token.isalpha()]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_a(tokenized_sequence)
for file, text in tokenized_sequence:
    print(f"{file} {' '.join(text)[:70]}")

../files/input\file1.txt it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt electric vehicles are gaining global popularity lately , and along wit
../files/input\file3.txt global solar irradiation is an important variable that can be used to 


In [10]:
filtered_sequence[0][1]

['it',
 'is',
 'essential',
 'to',
 'develop',
 'alternatives',
 'used',
 'in',
 'hydrogen',
 'evolution',
 'reaction',
 'her',
 'due',
 'to',
 'high',
 'cost',
 'and',
 'scarcity',
 'of',
 'catalysts',
 'herein',
 'through',
 'density',
 'functional',
 'theory',
 'dft',
 'calculations',
 'the',
 'her',
 'activity',
 'over',
 'anchored',
 'phosphorus',
 'carbide',
 'monolayer',
 'tm',
 'has',
 'been',
 'systematically',
 'investigated',
 'results',
 'indicate',
 'that',
 'δg',
 'h',
 'of',
 'v',
 'fe',
 'nb',
 'mo',
 'and',
 'pd',
 'are',
 'lower',
 'than',
 'that',
 'of',
 'pt',
 'catalyst',
 'with',
 'and',
 'ev',
 'respectively',
 'by',
 'imposing',
 'the',
 'criterion',
 'window',
 'δg',
 'h',
 'ev',
 'the',
 'd',
 'band',
 'centre',
 'εd',
 'for',
 'catalysts',
 'with',
 'excellent',
 'her',
 'ability',
 'is',
 'in',
 'the',
 'range',
 'of',
 'ev',
 'besides',
 'the',
 'five',
 'promising',
 'her',
 'catalysts',
 'follow',
 'mechanism',
 'fe',
 'nb',
 'and',
 'mo',
 'show',
 'acti

In [11]:
def filter_tokens_b(sequence):
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [re.sub(r"[^a-zA-Z\s]", " ", token) for token in tokens]
        filtered_tokens = [re.sub(r"\s+", " ", token) for token in filtered_tokens]
        filtered_tokens = [token.strip() for token in filtered_tokens]
        filtered_tokens = [token for token in filtered_tokens if token != ""]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
for file, text in tokenized_sequence:
    print(f"{file} {' '.join(text)[:70]}")

../files/input\file1.txt it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt electric vehicles are gaining global popularity lately , and along wit
../files/input\file3.txt global solar irradiation is an important variable that can be used to 


In [12]:
nltk.download("stopwords")

def remove_stopwords(sequence):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = [token for token in tokens if token not in stop_words]
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
for file, text in filtered_sequence:
    print(f"{file} {' '.join(text)[:70]}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Crimson\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


../files/input\file1.txt essential develop non precious metal based alternatives used hydrogen 
../files/input\file2.txt electric vehicles gaining global popularity lately along efficient bat
../files/input\file3.txt global solar irradiation important variable used determine suitability


In [13]:
print(set(nltk.corpus.stopwords.words("english")))

{'him', "it's", 'herself', 'on', 'few', "didn't", 'her', 'whom', 'be', "they've", 'more', 'wasn', "mustn't", 'any', "she'd", 'ma', "needn't", "i'll", 'between', "shan't", "we'll", 'y', 'haven', 'not', 'is', 'most', "hasn't", 'into', 'to', 'yourselves', "couldn't", 'them', "shouldn't", 'through', 'under', 'very', 'aren', 'ourselves', "weren't", 'other', "we've", 'during', 't', 'd', 'now', 'own', 'above', 'this', 'me', 'wouldn', 'did', 'up', 'about', 'should', 're', "i'm", 'weren', 'because', 'and', 'the', "it'll", 'themselves', "you've", 'itself', 'll', 'of', 'myself', 'we', 'they', 'won', 'until', 'all', 'shouldn', 'that', "he'd", 'from', "won't", 'am', 'who', 'in', 'were', 'isn', 'then', 'out', 'only', 'further', "don't", "hadn't", "haven't", 'how', 'o', "i've", "they're", "they'd", 'too', 'didn', 'himself', 'once', 'he', 'if', 'my', "isn't", 'by', 'no', 'again', 'can', 'needn', "wasn't", 'down', 'after', 'shan', 'it', "it'd", 'do', 'such', 'same', 'being', 'she', 'have', 'those', "wo

In [14]:
import os
import textwrap

def save_data(output_directory, sequence):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    for file, tokens in sequence:
        file = file.replace("\\", "/")
        with open(
            f"{output_directory}/{file.split("/")[-1]}",
            "wt",
            encoding="utf-8",
        ) as f:
            f.write(textwrap.fill(" ".join(tokens), width=70))

sequence = load_data(input_directory="../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
save_data(output_directory="../files/output", sequence=filtered_sequence)
for file, text in filtered_sequence:
    print(f"{file} {' '.join(text)[:70]}")

../files/input\file1.txt essential develop non precious metal based alternatives used hydrogen 
../files/input\file2.txt electric vehicles gaining global popularity lately along efficient bat
../files/input\file3.txt global solar irradiation important variable used determine suitability
