In [20]:
# Pipeline inicial para Requerimiento 5
# - instalar dependencias si faltan (solo si lo deseas)
# - parsear archivo .bib grande a un DataFrame
# - extraer primer autor y campos clave
# - enriquecer por DOI usando Crossref (con cache) para obtener afiliación/pais
# NOTA: Este bloque está diseñado para ejecutarse por partes; por defecto procesa un subconjunto de registros para pruebas.
import importlib, subprocess, sys, os, time, json, re
from pathlib import Path

def ensure_packages(packages):
    """Instala paquetes pip que no estén presentes.
    No instala si ya están disponibles.
    """
    for pkg in packages:
        try:
            importlib.import_module(pkg)
        except Exception:
            print(f"Instalando {pkg}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])

# Paquetes recomendados
REQUIRED = ['bibtexparser','pandas','requests','pycountry','tqdm','rapidfuzz']
# Descomenta la siguiente línea si quieres que el notebook instale dependencias automáticamente
# ensure_packages(REQUIRED)

# Imports principales (después de instalar si es necesario)
import bibtexparser
import pandas as pd
import requests
import pycountry
from tqdm import tqdm
from rapidfuzz import fuzz, process

# ---------- Utilidades de parseo y normalización ----------
def parse_bib_to_df(bib_path, max_entries=None):
    """Parsea un archivo .bib a un DataFrame con campos clave.
    Devuelve pandas.DataFrame con columnas: id, title, authors, first_author, doi, year, venue, abstract, keywords, raw_entry
    """
    bib_text = Path(bib_path).read_text(encoding='utf-8', errors='ignore')
    bib_db = bibtexparser.loads(bib_text)
    rows = []
    for i, entry in enumerate(bib_db.entries):
        if max_entries is not None and i >= max_entries:
            break
        eid = entry.get('ID') or entry.get('key') or f'row{i}'
        title = entry.get('title','').strip()
        authors = entry.get('author','').strip()
        doi = entry.get('doi','').strip()
        year = entry.get('year','').strip()
        venue = entry.get('journal', entry.get('booktitle','')).strip()
        abstract = entry.get('abstract','').strip()
        keywords = entry.get('keywords', entry.get('keyword','')).strip()
        raw = str(entry)
        first_author = extract_first_author(authors)
        rows.append({
            'id': eid,
            'title': title,
            'authors': authors,
            'first_author': first_author,
            'doi': doi,
            'year': year,
            'venue': venue,
            'abstract': abstract,
            'keywords': keywords,
            'raw_entry': raw,
        })
    df = pd.DataFrame(rows)
    return df


def extract_first_author(authors_str):
    if not authors_str:
        return ''
    # BibTeX authors are usually separated by ' and '
    parts = [p.strip() for p in authors_str.split(' and ')]
    first = parts[0] if parts else ''
    # Normalize formats like 'Last, First' -> 'Last' or 'First Last' -> 'Last'
    if ',' in first:
        last = first.split(',')[0].strip()
    else:
        toks = first.split()
        last = toks[-1] if toks else first
    return last

# ---------- Enriquecimiento: Crossref + heurísticas ----------
COUNTRIES = [c.name.lower() for c in pycountry.countries]
# Add some common aliases
ALIASES = {'usa':'united states','us':'united states','u.s.a.':'united states','uk':'united kingdom','england':'united kingdom'}

def find_country_in_text(text):
    if not text:
        return None
    t = text.lower()
    # direct match country names
    for cname in COUNTRIES:
        if cname in t:
            return cname.title()
    # aliases
    for a,v in ALIASES.items():
        if a in t:
            return v.title()
    return None


def enrich_by_doi(doi, email=None, sleep=1.0):
    """Consulta Crossref por DOI y trata de extraer afiliación/pais del primer autor.
    Devuelve dict con keys: affiliation_raw, country, country_iso2, source, confidence
    """
    if not doi:
        return {'affiliation_raw':'','country':'','country_iso2':'','source':'','confidence':0.0}
    # normalize doi for URL (remove leading DOI: if present)
    doi_clean = doi.strip()
    doi_clean = doi_clean.replace('doi:','').replace('DOI:','').strip()
    url = f'https://api.crossref.org/works/{requests.utils.requote_uri(doi_clean)}'
    headers = {'User-Agent': f'proyecto-analisis-algoritmos (mailto:{email})' if email else 'proyecto-analisis-algoritmos'}
    try:
        r = requests.get(url, headers=headers, timeout=20)
        if r.status_code != 200:
            return {'affiliation_raw':'','country':'','country_iso2':'','source':'crossref','confidence':0.0}
        data = r.json()
        msg = data.get('message', {})
        authors = msg.get('author', [])
        if not authors:
            return {'affiliation_raw':'','country':'','country_iso2':'','source':'crossref','confidence':0.0}
        first = authors[0]
        affs = first.get('affiliation', [])
        aff_text = ''
        if affs:
            # affiliation is often list of dicts with 'name'
            if isinstance(affs, list):
                aff_text = ' '.join([a.get('name','') for a in affs if isinstance(a, dict)])
            else:
                aff_text = str(affs)
        # try to find country in affiliation text
        country = find_country_in_text(aff_text)
        country_iso = ''
        if country:
            try:
                c = pycountry.countries.get(name=country) or pycountry.countries.get(common_name=country)
                if c:
                    country_iso = c.alpha_2
            except Exception:
                country_iso = ''
        # sleep to respect rate limits
        time.sleep(sleep)
        return {'affiliation_raw':aff_text, 'country': country.title() if country else '', 'country_iso2': country_iso, 'source':'crossref', 'confidence': 0.9 if country else 0.5}
    except Exception as e:
        # error contacting crossref
        return {'affiliation_raw':'','country':'','country_iso2':'','source':'crossref_error','confidence':0.0, 'error': str(e)}


def load_cache(path):
    if not os.path.exists(path):
        return {}
    try:
        return pd.read_csv(path, dtype=str).set_index('id').to_dict(orient='index')
    except Exception:
        return {}


def save_cache(dct, path):
    df = pd.DataFrame.from_dict(dct, orient='index')
    df.index.name = 'id'
    df.reset_index(inplace=True)
    df.to_csv(path, index=False, encoding='utf-8')


def batch_enrich(df, cache_path='country_lookup.csv', email=None, sleep=1.0, max_rows=None):
    cache = load_cache(cache_path)
    updated = False
    total = len(df) if max_rows is None else min(len(df), max_rows)
    for idx in tqdm(range(total)):
        row = df.iloc[idx]
        rid = row['id']
        if rid in cache:
            continue
        doi = row.get('doi','')
        if not doi:
            # try heuristics on raw_entry
            aff = find_country_in_text(row.get('raw_entry',''))
            cache[rid] = {'doi': doi, 'affiliation_raw': row.get('raw_entry',''), 'country': aff.title() if aff else '', 'country_iso2': '', 'source':'heuristic' if aff else '', 'confidence': 0.3 if aff else 0.0}
            updated = True
            continue
        res = enrich_by_doi(doi, email=email, sleep=sleep)
        rowd = {'doi': doi, 'affiliation_raw': res.get('affiliation_raw',''), 'country': res.get('country',''), 'country_iso2': res.get('country_iso2',''), 'source': res.get('source',''), 'confidence': res.get('confidence',0.0)}
        cache[rid] = rowd
        updated = True
    if updated:
        save_cache(cache, cache_path)
    return cache

# ---------- Guardar registros y preparar estado para la nube ----------
def prepare_state_files(df, out_dir='proyecto/requerimiento5/data'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    records_path = Path(out_dir)/'records.csv'
    df.to_csv(records_path, index=False, encoding='utf-8')
    # frequencies.json placeholder
    freq_path = Path(out_dir)/'frequencies.json'
    if not freq_path.exists():
        with open(freq_path,'w',encoding='utf-8') as f:
            json.dump({'total_terms':0,'terms':{}}, f, ensure_ascii=False, indent=2)
    return records_path, Path(out_dir)/'country_lookup.csv', freq_path

print('Módulo cargado. Define rutas y ejecuta las funciones de prueba con un subconjunto de registros.')


Módulo cargado. Define rutas y ejecuta las funciones de prueba con un subconjunto de registros.


In [21]:
# Ejemplo de uso: procesar TODO el .bib (sin límite)
# Ajusta rutas según estructura del repositorio
BIB_PATH = '../primeros_' \
'500.bib'  # ruta relativa al notebook (ajusta si es necesario)
OUT_DIR = 'data'

# Parsear TODO el .bib (sin max_entries)
print('Parsing completo del archivo .bib — esto puede tardar dependiendo del tamaño (~10k registros).')
df = parse_bib_to_df(BIB_PATH, max_entries=None)
print(f'Parsed {len(df)} records (total)')

# Guardar registros y preparar archivos de estado
records_csv, country_cache_path, freq_path = prepare_state_files(df, out_dir=OUT_DIR)
print('Records guardados en', records_csv)

# Enriquecer por DOI (ejecútalo si quieres probar crossref; suministra tu correo en email)
# Para evitar bloqueos, procesa con sleep>=1.0 y considera ejecutar por la noche para muchos registros.
email = ''  # opcional: tu correo para User-Agent en Crossref
# Nota: batch_enrich ahora procesará todos los registros pendientes en cache (sin max_rows)
cache = batch_enrich(df, cache_path=str(country_cache_path), email=email, sleep=1.0, max_rows=None)
print('Enriquecimiento finalizado. Caché guardada en', country_cache_path)

# Mostrar estadísticas de cobertura
covered = sum(1 for v in cache.values() if v.get('country'))
print(f'Paises asignados: {covered} / {len(df)} = {covered/len(df):.2%}')

# Frecuencias (placeholder)
print('Fichero de frecuencias (placeholder):', freq_path)


Parsing completo del archivo .bib — esto puede tardar dependiendo del tamaño (~10k registros).
Parsed 500 records (total)
Records guardados en data\records.csv
Parsed 500 records (total)
Records guardados en data\records.csv


100%|██████████| 500/500 [10:37<00:00,  1.27s/it]



Enriquecimiento finalizado. Caché guardada en data\country_lookup.csv
Paises asignados: 377 / 500 = 75.40%
Fichero de frecuencias (placeholder): data\frequencies.json


In [None]:
import pandas as pd
import pycountry
import plotly.express as px
from pathlib import Path

OUT_DIR = Path('outputs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

records_path = Path('data/records.csv')
cache_path = Path('data/country_lookup.csv')

if not records_path.exists():
    raise FileNotFoundError(f"No se encontró {records_path}. Ejecuta la celda de parseo antes.")

records = pd.read_csv(records_path, dtype=str).fillna('')
cache = pd.read_csv(cache_path, dtype=str).fillna('') if cache_path.exists() else pd.DataFrame(columns=['id','country','country_iso2'])

if 'id' not in records.columns:
    raise ValueError('La tabla de records no contiene la columna `id`.')

if cache.empty:
    print('Advertencia: cache de países vacía.')

if 'country_iso2' in cache.columns:
    cache['country_iso2'] = cache['country_iso2'].str.upper().replace({'NAN':''})

merged = records.merge(cache[['id','country','country_iso2']], on='id', how='left')

agg = merged.groupby(['country','country_iso2'], dropna=False).size().reset_index(name='count')
agg = agg[(agg['country'].notna()) & (agg['country'] != '')]

def iso2_to_iso3(a2):
    try:
        if not a2 or pd.isna(a2):
            return None
        c = pycountry.countries.get(alpha_2=str(a2).upper())
        return c.alpha_3 if c else None
    except Exception:
        return None

agg['iso3'] = agg['country_iso2'].apply(iso2_to_iso3)

if agg['iso3'].isna().any():
    def name_to_iso3(name):
        try:
            if not name or pd.isna(name):
                return None
            c = pycountry.countries.lookup(name)
            return c.alpha_3
        except Exception:
            return None
    agg['iso3'] = agg.apply(lambda r: r['iso3'] if pd.notna(r['iso3']) else name_to_iso3(r['country']), axis=1)

df_plot = agg.dropna(subset=['iso3'])

if df_plot.empty:
    print('No hay países con ISO3 válido para plotear.')
    display(agg.sort_values('count', ascending=False).head(20))
else:
    fig = px.choropleth(
        df_plot, locations='iso3', color='count', hover_name='country',
        color_continuous_scale='Viridis', projection='natural earth',
        title='Publicaciones por país (primer autor)'
    )
    fig.update_layout(coloraxis_colorbar=dict(title='Número de publicaciones'))
    fig.show()

    out_png = OUT_DIR / 'mapa_paises.png'
    out_pdf = OUT_DIR / 'mapa_paises.pdf'

    try:
        # Guardar PNG (requiere kaleido)
        fig.write_image(str(out_png), format='png', engine='kaleido')
        print('Mapa guardado en', out_png)
        # Convertir PNG -> PDF usando Pillow
        try:
            from PIL import Image
            im = Image.open(out_png).convert('RGB')
            im.save(out_pdf, 'PDF', resolution=300)
            print('PDF generado en', out_pdf)
        except Exception as e:
            print('No se pudo convertir PNG a PDF con Pillow:', e)
    except Exception as e:
        print('No se pudo exportar PNG. Error:', e)


No se pudo guardar como PDF directamente con kaleido. Error: 
Image export using the "kaleido" engine requires the Kaleido package,
which can be installed using pip:

    $ pip install --upgrade kaleido

No se pudo exportar imagen con `kaleido`. Intenta `pip install -U kaleido pillow`. Error: 
Image export using the "kaleido" engine requires the Kaleido package,
which can be installed using pip:

    $ pip install --upgrade kaleido

Fallo guardado: revisa la instalación de `kaleido` y `Pillow` y reinicia el kernel si acabas de instalarlas.




Support for the 'engine' argument is deprecated and will be removed after September 2025.
Kaleido will be the only supported engine at that time.




Support for the 'engine' argument is deprecated and will be removed after September 2025.
Kaleido will be the only supported engine at that time.


