In [1]:
# Load data
import glob


def load_data(input_directory):
    """Read every regular file directly inside ``input_directory`` as UTF-8 text.

    Args:
        input_directory: Path of the directory whose entries are read. The
            lookup is not recursive.

    Returns:
        A list of ``(path, raw_text)`` tuples, one per regular file, ordered
        by path. The list is empty when the pattern matches nothing.
    """
    import os  # local import: this cell only imports `glob`

    sequence = []
    # glob.glob() yields entries in an OS-dependent order; sorting makes the
    # document order reproducible across runs and platforms.
    for file in sorted(glob.glob(f"{input_directory}/*")):
        # The "*" pattern also matches sub-directories, which open() cannot read.
        if not os.path.isfile(file):
            continue
        with open(file, "rt", encoding="utf-8") as f:
            sequence.append((file, f.read()))
    return sequence


sequence = load_data("../files/input")
for file, text in sequence:
    # Show the path and the first 70 characters of each raw document.
    print(file, text[:70], sep="  ")
    

../files/input\file1.txt  It is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  Electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  Global solar irradiation is an important variable that can be used to 


In [None]:
# Clean text
import re


def clean_text(sequence):
    """Normalize whitespace and letter case of every document in ``sequence``.

    For each ``(file, text)`` pair the text has its newlines replaced by
    spaces, runs of whitespace collapsed to a single space, leading and
    trailing whitespace removed, and is converted to lowercase.

    Args:
        sequence: Iterable of ``(file, text)`` pairs.

    Returns:
        A new list of ``(file, cleaned_text)`` pairs in the same order.
    """

    def normalize(raw):
        flattened = re.sub(r"\n", " ", raw)
        collapsed = re.sub(r"\s+", " ", flattened)
        return collapsed.strip().lower()

    return [(path, normalize(raw)) for path, raw in sequence]


sequence = load_data("../files/input")
cleaned_sequence = clean_text(sequence)
for file, text in cleaned_sequence:
    # Show the path and the first 70 characters of each cleaned document.
    print(file, text[:70], sep="  ")
    

../files/input\file1.txt  it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately, and along with
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [7]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

# Fetch the "punkt_tab" tokenizer data through NLTK's downloader
# (presumably the resource word_tokenize relies on — TODO confirm).
nltk.download("punkt_tab")


def tokenize(sequence):
    """Split each document's text into tokens with ``word_tokenize``.

    Args:
        sequence: Iterable of ``(file, text)`` pairs.

    Returns:
        A new list of ``(file, tokens)`` pairs in the same order, where
        ``tokens`` is whatever ``word_tokenize(text)`` returns.
    """
    return [(path, word_tokenize(document)) for path, document in sequence]


sequence = load_data("../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
for file, text in tokenized_sequence:
    # Re-join the tokens with spaces and show the first 70 characters.
    print(file, " ".join(text)[:70], sep="  ")
    


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...


../files/input\file1.txt  it is essential to develop non-precious metal-based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately , and along wit
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [8]:
import textwrap

for file, text in tokenized_sequence:
    # One wrapped paragraph per document, followed by two blank lines.
    print(textwrap.fill(" ".join(text)), end="\n\n\n")


it is essential to develop non-precious metal-based alternatives used
in hydrogen evolution reaction ( her ) due to high cost and scarcity
of pt-based catalysts . herein , through density functional theory (
dft ) calculations , the her activity over 26 single-atom anchored
phosphorus carbide ( pc3 ) monolayer ( tm @ pc3 ) has been
systematically investigated . results indicate that δg * h of v , fe ,
nb , mo , and pd @ pc3 are lower than that of pt ( 1 1 1 ) catalyst ,
with 0.03 , −0.03 , −0.07 , −0.04 , and − 0.02 ev , respectively . by
imposing the criterion window ( −0.2 ≤ δg * h ≤ 0.2 ev ) , the d band
centre ( εd ) for catalysts with excellent her ability is in the range
of − 0.68–0.41 ev . besides , the five promising her catalysts follow
volmer-tafel mechanism . fe , nb , and mo @ pc3 show activation
barriers of 0.75 , 0.74 , and 0.55 ev , lower than that of pt .
machine learning ( ml ) was employed to explore the intrinsic
relationship between catalytic performance and feature

In [9]:
# Noisy data removal (Option B)
def filter_tokens_b(sequence):
    """Keep only the alphabetic parts of every token.

    Every character that is neither an ASCII letter nor whitespace is
    replaced by a space, and the result is split on whitespace. A token such
    as ``"non-precious"`` therefore yields the two tokens ``"non"`` and
    ``"precious"``, and a token with no letters (``"0.03"``, ``"@"``) is
    dropped. Note that digits and symbols inside a token are lost
    (``"pc3"`` becomes ``"pc"``).

    Args:
        sequence: Iterable of ``(file, tokens)`` pairs, ``tokens`` being a
            list of strings.

    Returns:
        A new list of ``(file, filtered_tokens)`` pairs in the same order.
        No returned token contains whitespace or is empty.
    """
    filtered_sequence = []
    for file, tokens in sequence:
        filtered_tokens = []
        for token in tokens:
            alphabetic_only = re.sub(r"[^a-zA-Z\s]", " ", token)
            # split() collapses runs of spaces, trims the ends and discards
            # empty pieces, so each surviving word becomes its own token
            # instead of one "token" with embedded spaces.
            filtered_tokens.extend(alphabetic_only.split())
        filtered_sequence.append((file, filtered_tokens))
    return filtered_sequence

sequence = load_data("../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
for file, text in filtered_sequence:
    # Re-join the filtered tokens and show the first 70 characters.
    print(file, " ".join(text)[:70], sep="  ")

../files/input\file1.txt  it is essential to develop non precious metal based alternatives used 
../files/input\file2.txt  electric vehicles are gaining global popularity lately and along with 
../files/input\file3.txt  global solar irradiation is an important variable that can be used to 


In [None]:
# Remove the stopwords
# Fetch the "stopwords" corpus through NLTK's downloader; remove_stopwords
# below reads it via nltk.corpus.stopwords.
nltk.download("stopwords")


def remove_stopwords(sequence):
    """Drop English stopwords from every token list.

    The stopword list is read once from ``nltk.corpus.stopwords`` and each
    token is compared against it exactly as given (no case folding).

    Args:
        sequence: Iterable of ``(file, tokens)`` pairs.

    Returns:
        A new list of ``(file, kept_tokens)`` pairs in the same order.
    """
    english_stopwords = frozenset(nltk.corpus.stopwords.words("english"))
    return [
        (path, [word for word in words if word not in english_stopwords])
        for path, words in sequence
    ]


sequence = load_data("../files/input")
cleaned_sequence = clean_text(sequence)
tokenized_sequence = tokenize(cleaned_sequence)
filtered_sequence = filter_tokens_b(tokenized_sequence)
filtered_sequence = remove_stopwords(filtered_sequence)
for file, text in filtered_sequence:
    # Re-join the remaining tokens and show the first 70 characters.
    print(file, " ".join(text)[:70], sep="  ")

In [None]:
## country_scientific_production.py


"""Taller Presencial Evaluable"""

import os

import folium  # type: ignore
import pandas as pd  # type: ignore


def load_affiliations():
    """Download scopus-papers.csv and keep only its 'Affiliations' column.

    Returns:
        A dataframe with the single column 'Affiliations', read from the
        remote CSV file.
    """

    url = (
        "https://raw.githubusercontent.com/jdvelasq/datalabs/"
        "master/datasets/scopus-papers.csv"
    )
    papers = pd.read_csv(url, sep=",", index_col=None)
    return papers[["Affiliations"]]


def remove_na_rows(affiliations):
    """Return the rows whose 'Affiliations' value is not null.

    The input dataframe is left untouched; ``dropna`` produces a new frame
    that keeps the original index labels of the surviving rows.
    """

    return affiliations.dropna(subset=["Affiliations"])


def add_countries_column(affiliations):
    """Add a 'countries' column derived from the 'Affiliations' column.

    Each 'Affiliations' value is treated as a ';'-separated list of
    affiliations. The country of an affiliation is taken to be the text after
    its last comma, stripped of surrounding whitespace. The unique countries
    of a row are joined with ", " in alphabetical order.

    Args:
        affiliations: Dataframe with a string column 'Affiliations'.

    Returns:
        A copy of ``affiliations`` with the extra string column 'countries'.
        Null 'Affiliations' values yield a null 'countries' value.
    """

    def extract_countries(affiliation_text):
        unique_countries = {
            entry.split(",")[-1].strip() for entry in affiliation_text.split(";")
        }
        # A trailing or doubled ";" produces an empty segment; it is not a country.
        unique_countries.discard("")
        # Sets have no stable iteration order across interpreter runs, so
        # sort to make the column reproducible.
        return ", ".join(sorted(unique_countries))

    affiliations = affiliations.copy()
    affiliations["countries"] = affiliations["Affiliations"].map(
        extract_countries, na_action="ignore"
    )

    return affiliations


def clean_countries(affiliations):
    """Rename countries in the 'countries' column to their canonical names.

    The 'countries' column holds country names joined with ", ". Each entry
    that is exactly "United States" is renamed to "United States of America"
    (presumably to match the names used by the map's GeoJSON — verify against
    plot_world_map). Matching whole entries keeps the function idempotent and
    leaves longer names that merely contain "United States" untouched.

    Args:
        affiliations: Dataframe with a string column 'countries'.

    Returns:
        A copy of ``affiliations`` with the normalized 'countries' column.
        Null values are passed through unchanged.
    """

    replacements = {"United States": "United States of America"}

    def normalize(country_list):
        return ", ".join(
            replacements.get(country, country) for country in country_list.split(", ")
        )

    affiliations = affiliations.copy()
    affiliations["countries"] = affiliations["countries"].map(
        normalize, na_action="ignore"
    )
    return affiliations


def count_country_frequency(affiliations):
    """Count the occurrences of each country in the 'countries' column.

    Every 'countries' value is split on ", ", the resulting lists are
    flattened into one entry per country, and the entries are tallied.

    Returns:
        The Series produced by ``value_counts``: indexed by country name,
        most frequent first.
    """

    per_row_countries = affiliations["countries"].str.split(", ")
    return per_row_countries.explode().value_counts()


def plot_world_map(countries):
    """Draw a world choropleth of country frequencies and save it as HTML.

    Args:
        countries: Series of counts indexed by country name. Index values are
            matched against ``feature.properties.name`` of the GeoJSON file.

    Side effects:
        Writes the map to "files/map.html"; the "files" directory must
        already exist.
    """

    # Name both columns explicitly instead of relying on the names carried by
    # the incoming Series: value_counts() only labels its result
    # "countries"/"count" on pandas >= 2.0, and other callers may pass an
    # unnamed Series.
    frequency_table = countries.rename_axis("countries").reset_index(name="count")

    m = folium.Map(location=[0, 0], zoom_start=2)

    folium.Choropleth(
        geo_data=(
            "https://raw.githubusercontent.com/python-visualization/"
            "folium/master/examples/data/world-countries.json"
        ),
        data=frequency_table,
        columns=["countries", "count"],
        key_on="feature.properties.name",
        fill_color="Greens",
    ).add_to(m)

    m.save("files/map.html")


def make_worldmap():
    """Run the whole pipeline: load, clean, count, export, and plot.

    Side effects:
        Creates the "files" directory if needed, writes the country
        frequencies to "files/countries.csv", and calls ``plot_world_map``
        with those frequencies.
    """

    # exist_ok=True avoids the check-then-create race of
    # os.path.exists() followed by os.makedirs().
    os.makedirs("files", exist_ok=True)

    affiliations = load_affiliations()
    affiliations = remove_na_rows(affiliations)
    affiliations = add_countries_column(affiliations)
    affiliations = clean_countries(affiliations)
    countries = count_country_frequency(affiliations)
    countries.to_csv("files/countries.csv")
    plot_world_map(countries)


# Build the map only when this code runs as the main program, not when it
# is imported as a module.
if __name__ == "__main__":
    make_worldmap()





