# Adjectival and demonymic forms from Wikipedia (Spanish)

Extraction of demonyms (gentilics) from the tables of the Spanish Wikipedia annex pages. Permalinks are used so that the scraped page revision stays fixed.



## Global demonyms

In [None]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Pinned revision (oldid) of the Spanish Wikipedia demonym annex.
url = "https://es.wikipedia.org/w/index.php?title=Anexo:Gentilicios&oldid=144144171"
# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
req.raise_for_status()
# Turn explicit line breaks into newlines so that values stacked inside one
# cell stay separated once the markup is stripped.
req_text = req.text.replace('<br />', '\n')

# Parse the preprocessed text (not the URL) so the <br /> replacement applies.
# Passing literal HTML to read_html is deprecated since pandas 2.1, hence the
# file-like StringIO wrapper.
tables = pd.read_html(StringIO(req_text))
soup = bs(req_text, "lxml")
sections = soup.select("h3 span.mw-headline")

# NOTE(review): this pairs the i-th <h3> headline with the i-th parsed table
# and silently truncates to the shorter sequence; it assumes every section
# holds exactly one table, in document order -- confirm against the page.
tables = {section.text: table for section, table in zip(sections, tables)}

print('tables:')
print(tables.keys())

In [None]:
import re
import unicodedata
from more_itertools import stagger


def text_cleaner(value):
    """Clean one scraped table cell.

    Non-string values (such as the NaN pandas uses for missing cells) are
    returned unchanged. For strings the function:

    1. applies Unicode NFD normalization,
    2. removes bracketed reference markers that are followed by a
       zero-width space (e.g. ``[1]\\u200b``),
    3. removes ``[cita requerida]`` tags,
    4. collapses every whitespace run to a single space and strips leading
       and trailing whitespace.

    Returns the cleaned string (still in NFD form).
    """
    if not isinstance(value, str):
        return value

    text = unicodedata.normalize('NFD', value)
    text = re.sub(r'\[[\w\s,]+\]\u200b', ' ', text)
    # Collapse first so a tag broken across lines still matches below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[cita requerida\]', ' ', text)
    # The removals above substitute a space, which can leave doubled or
    # dangling spaces; collapse again and trim so cells compare cleanly.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()



def join_by_rules(prev: "str | None", current: "str | None") -> "str | None":
    """Expand an abbreviated feminine ending using the preceding term.

    Args:
        prev: The term that comes right before ``current`` in the cell, or
            ``None`` when there is no preceding term.
        current: The term to resolve. A term starting with ``-`` (or the
            bare string ``'a'``) is treated as a suffix to be merged with
            ``prev``; anything else is already a full word.

    Returns:
        ``current`` unchanged when it is a full word; the merged word when
        one of the suffix rules applies; ``None`` when ``current`` is not a
        string, when there is no usable ``prev`` to merge with, or when no
        rule matches.
    """
    if not isinstance(current, str):
        return None

    # FIXME: a bare 'a' is handled like '-a' as a workaround for a single
    # value in the source table.
    if not current.startswith('-') and current != 'a':
        return current

    # A suffix with nothing before it (first term of a cell, or an empty
    # fragment left by the splitter) cannot be expanded; previously this
    # crashed on prev[-1].
    if not isinstance(prev, str) or not prev:
        return None

    # Drop a single leading dash, if present.
    suffix = current[1:] if current.startswith('-') else current
    last = prev[-1]

    if suffix == 'a' and last == 'o':
        # e.g. "...o" + "-a": swap the final vowel.
        return prev[:-1] + 'a'
    if suffix == 'a' and last == 's':
        # e.g. "...s" + "-a": append the vowel.
        return prev + 'a'
    if suffix in ('sa', 'la') and last in ('s', 'l'):
        # The suffix repeats the final consonant, so replace it.
        return prev[:-1] + suffix
    return None


def preprocess_demonyms(value):
    """Turn a raw demonym cell into a list of resolved demonym strings.

    Non-string input is handed back untouched. A string is cut at ``;``,
    ``,``, `` o `` and `` u ``, each piece is trimmed, and every piece is
    passed to ``join_by_rules`` together with the piece preceding it.
    Falsy results (``None`` or empty strings) are discarded.
    """
    if not isinstance(value, str):
        return value

    delimited = re.sub(r'(;|,| o | u )', '|', value)
    pieces = [piece.strip() for piece in delimited.split('|')]

    resolved = []
    # Each tuple is (previous piece, piece); the fill value is None at the
    # edges of the sequence.
    for previous, piece in stagger(pieces, offsets=(-1, 0), longest=True):
        demonym = join_by_rules(previous, piece)
        if demonym:
            resolved.append(demonym)

    return resolved

    


In [None]:

# Pick the section's table and clean every cell.
table = tables["Países con 2 gentilicios"]
# Column-wise Series.map is the version-portable spelling of the element-wise
# DataFrame.applymap, which is deprecated since pandas 2.1.
table = table.apply(lambda column: column.map(text_cleaner))

# The table holds two (place, demonym) column pairs side by side; both pairs
# are renamed to the same (origin, demonym) schema and stacked vertically.
columns_rename = {
    "País, dependencia, o territorio": "origin",
    "Gentilicio": "demonym",
    "Capital(es)": "origin",
    "Gentilicio.1": "demonym",
}
df = pd.concat(
    [
        table[["País, dependencia, o territorio", "Gentilicio"]].rename(
            columns=columns_rename
        ),
        table[["Capital(es)", "Gentilicio.1"]].rename(columns=columns_rename),
    ],
    ignore_index=True,
)

# Split each demonym cell into a list of individual demonyms.
df['demonym'] = df['demonym'].apply(preprocess_demonyms)
# Rows whose demonym is missing fall back to a one-element list holding the
# origin value itself.
df['demonym'] = df['demonym'].fillna(df['origin'].apply(lambda x: [x]))

# One output row per (origin, demonym) pair.
df = df.explode('demonym')
df


## Export

In [None]:
# Write the origin/demonym table to a CSV file in the working directory,
# leaving out the DataFrame index column.
df.to_csv('es-demonyms-global.csv', index=False)

# Argenti