### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import requests
from bs4 import BeautifulSoup
import escritoras_latinas.data.load as load
import pandas as pd

### Load data

In [3]:
# Pull both project locations from the loader module in one step:
# the raw CSV that is read below and the folder the results are written to
data_raw, data_processed = load.data_raw, load.data_processed

### Read data

In [4]:
# Load the raw 'csv' file into a dataframe
df = pd.read_csv(f'{data_raw}')

# Collect every value of the 'Url' column into a plain Python list
urls = df.loc[:, 'Url'].to_list()

# Preview one randomly chosen row of the dataframe
df.sample(n=1)

Unnamed: 0,Nombre,Url,País
486,Stella Corvalán Vega,https://es.wikipedia.org/wiki/Stella_Corval%C3...,Chile


### Web scraping

In [5]:
# Seconds to wait for a single response before giving up on that url
REQUEST_TIMEOUT = 10
# Wikipedia reference markers of any length, e.g. [1] or [12]
REFERENCE_PATTERN = re.compile(r'\[\d+\]')

# One entry per url, in the same order as `urls`, so the list can be attached
# to the dataframe as a column; None marks a page with no usable text
bios = []
# Reuse one HTTP session so connections are kept alive between requests
with requests.Session() as session:
    for url in urls:
        text = None
        try:
            # Without a timeout a single stalled request would hang the loop
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            # Treat HTTP errors (404, 403, ...) as failures instead of
            # scraping the error page as if it were a biography
            response.raise_for_status()
        except requests.RequestException as error:
            # Keep going: one bad url must not discard the work already done
            print(f'Could not download {url}: {error}')
        else:
            # Parse html
            soup = BeautifulSoup(response.text, 'html.parser')
            # Walk the <p> tags in document order and keep the first one that
            # still has text after cleaning; the first <p> of a page can be an
            # empty placeholder, and a page may contain no <p> at all
            for paragraph in soup.find_all('p'):
                candidate = paragraph.get_text()
                candidate = candidate.replace('\n', '')  # new lines
                candidate = candidate.replace('\u200b', '')  # zero-width space
                candidate = REFERENCE_PATTERN.sub('', candidate)  # references on wikipedia
                if candidate.strip():
                    text = candidate
                    break
        # Always append exactly once per url to stay aligned with the dataframe
        bios.append(text)

### Process data

In [6]:
# Attach the scraped texts to the dataframe as the 'Biografía' column
df = df.assign(**{'Biografía': bios})

# Preview one randomly chosen row of the dataframe
df.sample(n=1)

Unnamed: 0,Nombre,Url,País,Biografía
824,Jane Durán,https://es.wikipedia.org/wiki/Jane_Dur%C3%A1n,Cuba,Jane Durán (1944) es una poeta anglocubana.


### Save data

In [7]:
# Build the destination path inside the processed-data location
output_path = f'{data_processed}/escritoras_wiki.csv'
# Write the dataframe as a 'csv' file, leaving out the row index
df.to_csv(output_path, index=False)