In [1]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import pandas as pd
import re
import matplotlib.pyplot as plt
from urllib.parse import urljoin

In [2]:
def get_text_from_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error al acceder a {url}")
        return "", []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', {'id': 'mw-content-text'})
    if not content:
        return "", []
    
    for element in content(['script', 'style', 'table', 'sup']):
        element.decompose()
    
    text = ' '.join(p.get_text() for p in content.find_all('p'))
    text = text.lower()
    
    links = [a['href'] for a in content.find_all('a', href=True) if a['href'].startswith("/wiki/") and ":" not in a['href']]
    full_links = [urljoin(url, link) for link in set(links)]
    
    return text, full_links

In [3]:
def analyze_word_frequency(text, words_to_count):
    words = re.findall(r'\b\w{4,}\b', text)
    word_counts = {word: words.count(word) for word in words_to_count}
    return word_counts

In [6]:
url = "https://es.wikipedia.org/wiki/Python"
words_to_count = ["python", "lenguaje", "programación", "software"]

text, links = get_text_from_url(url)
main_freq = analyze_word_frequency(text, words_to_count)

selected_links = links[:3]  # Tomamos los primeros 3 enlaces internos
data = {"Página Principal": main_freq}

for link in selected_links:
    sub_text, _ = get_text_from_url(link)
    sub_freq = analyze_word_frequency(sub_text, words_to_count)
    data[link] = sub_freq

df = pd.DataFrame.from_dict(data, orient='index').fillna(0)
df.transpose()

Unnamed: 0,Página Principal,https://es.wikipedia.org/wiki/Manejo_de_excepciones,https://es.wikipedia.org/wiki/Encapsulamiento_(inform%C3%A1tica),https://es.wikipedia.org/wiki/Pip_(administrador_de_paquetes)
python,73,0,0,9
lenguaje,16,4,2,0
programación,20,4,4,0
software,11,0,0,2
