In [2]:
# pip install pandas feedparser nltk beautifulsoup4 requests numpy

In [3]:
# Sistema Inteligente de Búsqueda y Clasificación de Noticias
# Módulos: Crawling, Procesamiento e Integración de Dataset

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import feedparser
import json
import re
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
import string
from collections import Counter
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Descargar recursos de NLTK necesarios
def download_nltk_resources():
    """Make sure every NLTK resource this notebook relies on is installed.

    Each entry pairs the path looked up with ``nltk.data.find`` with the
    package name handed to ``nltk.download``. Resources that are already
    present are only reported; missing ones are downloaded quietly.

    Progress and failures are printed, never raised, so the notebook keeps
    running when the network is unavailable.
    """
    resources = [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        # The import cell downloads this tagger as well; listing it here keeps
        # the full set of required resources in one place.
        ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng')
    ]

    for resource_path, resource_name in resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            print(f"📥 Descargando {resource_name}...")
            try:
                downloaded = nltk.download(resource_name, quiet=True)
            except Exception as e:
                print(f"⚠️ Error descargando {resource_name}: {e}")
                continue
            # nltk.download signals failure through its return value rather
            # than an exception, so success must be checked explicitly.
            if downloaded:
                print(f"✅ {resource_name} descargado correctamente")
            else:
                print(f"⚠️ Error descargando {resource_name}: la descarga no se completó")
        else:
            print(f"✅ {resource_name} ya está disponible")

# Download the resources as soon as the cell runs
download_nltk_resources()
print("✅ Librerías y recursos NLTK configurados")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


✅ punkt ya está disponible
✅ stopwords ya está disponible
📥 Descargando wordnet...
✅ wordnet descargado correctamente
✅ averaged_perceptron_tagger ya está disponible
✅ punkt_tab ya está disponible
✅ Librerías y recursos NLTK configurados


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [4]:
# =============================================================================
# MÓDULO 1: CRAWLING Y RECOPILACIÓN DE DOCUMENTOS
# =============================================================================

class NewsCrawler:
    """Collects news articles from RSS feeds and from news-site pages.

    Every article is a plain dict with the keys ``title``, ``description``,
    ``link``, ``published``, ``source``, ``content_type`` and
    ``full_content``.
    """

    def __init__(self):
        # One shared session so the browser-like User-Agent is sent on every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.crawled_articles = []

    @staticmethod
    def _resolve_link(base_url, href):
        """Turn an ``href`` found on ``base_url`` into an absolute http(s) URL.

        Handles root-relative (``/a``), page-relative (``a?id=1``),
        protocol-relative (``//host/a``) and absolute links.

        Returns:
            The absolute URL, or ``None`` when ``href`` is missing or does not
            point to an http(s) resource (``mailto:``, ``javascript:`` ...).
        """
        # Local import keeps this class self-contained without touching the import cell.
        from urllib.parse import urljoin, urlparse

        if not isinstance(href, str) or not href.strip():
            return None
        absolute = urljoin(base_url, href.strip())
        if urlparse(absolute).scheme not in ('http', 'https'):
            return None
        return absolute

    def crawl_rss_feed(self, feed_url, max_articles=20):
        """Read up to ``max_articles`` entries from an RSS feed.

        For each entry the linked page is fetched to obtain the full text;
        when that fails or yields nothing, the feed summary is used instead.

        Returns:
            A list of article dicts; an empty list if the feed cannot be processed.
        """
        try:
            print(f"🔍 Crawling RSS: {feed_url}")
            feed = feedparser.parse(feed_url)
            articles = []

            for entry in feed.entries[:max_articles]:
                article = {
                    'title': getattr(entry, 'title', ''),
                    'description': getattr(entry, 'summary', ''),
                    'link': getattr(entry, 'link', ''),
                    'published': getattr(entry, 'published', ''),
                    'source': feed_url,
                    'content_type': 'rss'
                }

                # Try to get the full article text; entries without a link
                # cannot be fetched, so no request is attempted for them.
                content = None
                if article['link']:
                    try:
                        content = self.extract_article_content(article['link'])
                    except Exception:
                        content = None
                    time.sleep(0.5)  # Be polite to the server between page fetches

                article['full_content'] = content or article['description']
                articles.append(article)

            print(f"✅ Obtenidos {len(articles)} artículos de RSS")
            return articles

        except Exception as e:
            print(f"❌ Error crawling RSS {feed_url}: {e}")
            return []

    def extract_article_content(self, url):
        """Download ``url`` and return its main text, truncated to 5000 characters.

        The first CSS selector in ``content_selectors`` that matches anything
        wins; if none match, the text of the whole page is used.

        Returns:
            The extracted text, or ``None`` if the request or parsing fails.
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop scripts and styles so their source does not end up in the text
            for script in soup(["script", "style"]):
                script.decompose()

            # Candidate containers for the main content, in priority order
            content_selectors = [
                'article', '.article-content', '.post-content',
                '.entry-content', '.content', 'main', '.story-body'
            ]

            content = ""
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    # A space separator keeps words from adjacent tags apart;
                    # without it "<p>a</p><p>b</p>" would collapse into "ab".
                    content = ' '.join(
                        elem.get_text(separator=' ', strip=True) for elem in elements
                    )
                    break

            if not content:
                # Fallback: take the text of the whole page
                content = soup.get_text(separator=' ', strip=True)

            return content[:5000]  # Cap the length

        except Exception as e:
            print(f"⚠️  Error extrayendo contenido de {url}: {e}")
            return None

    def crawl_news_websites(self, urls, max_articles_per_site=10):
        """Scrape article pages linked from each page in ``urls``.

        Links whose ``href`` contains ``article``, ``news`` or ``/20`` are
        treated as article candidates. At most ``max_articles_per_site``
        distinct links per site are fetched, in document order, and only
        pages yielding more than 100 characters of text are kept.

        Returns:
            A list of article dicts collected across all sites.
        """
        all_articles = []

        for url in urls:
            try:
                print(f"🔍 Crawling website: {url}")
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Collect candidate article links
                article_links = []
                link_selectors = ['a[href*="article"]', 'a[href*="news"]', 'a[href*="/20"]']

                for selector in link_selectors:
                    for anchor in soup.select(selector):
                        resolved = self._resolve_link(url, anchor.get('href'))
                        if resolved:
                            article_links.append(resolved)

                # De-duplicate while preserving order so every run visits the same links
                unique_links = list(dict.fromkeys(article_links))[:max_articles_per_site]

                site_count = 0
                for link in unique_links:
                    try:
                        content = self.extract_article_content(link)
                        if content and len(content) > 100:
                            article = {
                                'title': self.extract_title_from_content(content),
                                'description': content[:300] + '...',
                                'link': link,
                                'published': datetime.now().isoformat(),
                                'source': url,
                                'content_type': 'web',
                                'full_content': content
                            }
                            all_articles.append(article)
                            site_count += 1
                    except Exception as e:
                        # Report and move on: one bad page must not abort the site
                        print(f"⚠️  Error procesando {link}: {e}")

                    time.sleep(0.5)  # Throttle even after a failure

                print(f"✅ Obtenidos {site_count} artículos de {url}")

            except Exception as e:
                print(f"❌ Error crawling {url}: {e}")

        return all_articles

    def extract_title_from_content(self, content):
        """Use the first sentence of ``content`` (max 100 characters) as a title."""
        sentences = sent_tokenize(content)
        if sentences:
            return sentences[0][:100]
        return "Sin título"

    def save_crawled_data(self, articles, filename='crawled_news.json'):
        """Write ``articles`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        print(f"💾 Datos guardados en {filename}")

In [5]:
# =============================================================================
# MÓDULO 2: PROCESAMIENTO Y PREPROCESAMIENTO
# =============================================================================

class TextPreprocessor:
    """Text-cleaning pipeline: clean, tokenize, drop stopwords, POS-filter, stem/lemmatize."""

    # Penn Treebank tags kept by default: nouns, adjectives and verbs.
    # Stored as an immutable tuple so it is safe to share as a default.
    DEFAULT_ALLOWED_POS = (
        'NN', 'NNS', 'NNP', 'NNPS', 'JJ',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
    )

    def __init__(self, language='english'):
        """Load the NLTK stopword list for ``language`` and set up the stemmer/lemmatizer."""
        self.language = language
        self.stop_words = set(stopwords.words(language))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        # Domain-specific stopwords: reporting verbs and news-agency names
        custom_stopwords = {
            'said', 'says', 'say', 'told', 'news', 'report', 'reports',
            'according', 'reuters', 'cnn', 'bbc', 'associated', 'press'
        }
        self.stop_words.update(custom_stopwords)

    def clean_text(self, text):
        """Lower-case ``text`` and strip URLs, e-mails and every non-letter character.

        Returns:
            The cleaned text with single spaces, or ``""`` when ``text`` is
            empty or not a string (e.g. ``None`` or a pandas NaN).
        """
        if not isinstance(text, str) or not text:
            return ""

        # Lower-case
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove e-mail addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Replace digits and special characters with spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)

        # Collapse runs of whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def tokenize_text(self, text):
        """Tokenize with NLTK, falling back to whitespace splitting if the tokenizer fails."""
        try:
            tokens = word_tokenize(text)
            return tokens
        except Exception:
            # Best-effort fallback (e.g. tokenizer data not installed)
            return text.split()

    def remove_stopwords(self, tokens):
        """Drop stopwords and tokens of two characters or fewer."""
        return [token for token in tokens if token not in self.stop_words and len(token) > 2]

    def stem_tokens(self, tokens):
        """Apply the Porter stemmer to every token."""
        return [self.stemmer.stem(token) for token in tokens]

    def lemmatize_tokens(self, tokens):
        """Apply the WordNet lemmatizer to every token."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def get_pos_tags(self, tokens):
        """Return ``(token, tag)`` pairs from NLTK's part-of-speech tagger."""
        return pos_tag(tokens)

    def filter_by_pos(self, pos_tags, allowed_pos=None):
        """Keep the words whose POS tag is in ``allowed_pos``.

        Args:
            pos_tags: iterable of ``(word, tag)`` pairs.
            allowed_pos: container of tags to keep; defaults to
                ``DEFAULT_ALLOWED_POS`` (nouns, adjectives, verbs).
        """
        allowed = self.DEFAULT_ALLOWED_POS if allowed_pos is None else allowed_pos
        return [word for word, pos in pos_tags if pos in allowed]

    def preprocess_document(self, text, use_stemming=True, use_pos_filtering=True):
        """Run the full pipeline on one document and return its tokens.

        Args:
            text: raw document text.
            use_stemming: stem when true, lemmatize otherwise.
            use_pos_filtering: keep only tokens tagged with an allowed POS.
        """
        # Cleaning
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            # Nothing left to tokenize or tag
            return []

        # Tokenization
        tokens = self.tokenize_text(cleaned_text)

        # Stopword removal
        tokens = self.remove_stopwords(tokens)

        # POS filtering (skipped when there is nothing to tag)
        if use_pos_filtering and tokens:
            pos_tags = self.get_pos_tags(tokens)
            tokens = self.filter_by_pos(pos_tags)

        # Stemming or lemmatization
        if use_stemming:
            tokens = self.stem_tokens(tokens)
        else:
            tokens = self.lemmatize_tokens(tokens)

        return tokens

    def extract_features(self, text):
        """Compute simple statistics for ``text``.

        ``word_count``, ``char_count`` and ``sentence_count`` are measured on
        the raw text; ``avg_word_length``, ``vocabulary_richness`` and
        ``most_common_words`` on the preprocessed (stemmed) tokens.
        Non-string input is treated as empty text.

        Returns:
            A dict with the six keys named above.
        """
        if not isinstance(text, str):
            text = ""

        tokens = self.preprocess_document(text)

        features = {
            'word_count': len(text.split()),
            'char_count': len(text),
            'sentence_count': len(sent_tokenize(text)) if text.strip() else 0,
            # float() turns the numpy scalar into a plain Python float
            'avg_word_length': float(np.mean([len(word) for word in tokens])) if tokens else 0,
            'vocabulary_richness': len(set(tokens)) / len(tokens) if tokens else 0,
            'most_common_words': Counter(tokens).most_common(10)
        }

        return features

In [6]:
# =============================================================================
# MÓDULO 3: INTEGRACIÓN DEL DATASET
# =============================================================================

class DatasetIntegrator:
    """Merges crawled articles with a tabular news dataset and preprocesses the result."""

    # Keys every article dict is expected to carry; used to give the
    # integrated frame a stable schema even when there are no articles.
    ARTICLE_COLUMNS = (
        'title', 'description', 'link', 'published',
        'source', 'content_type', 'full_content'
    )

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.integrated_dataset = None

    def load_kaggle_dataset(self, filepath):
        """Read a delimited text file, trying ``,``, tab and ``;`` as separators.

        The first separator that yields more than one column wins; otherwise
        the last successful parse is used.

        Returns:
            The loaded DataFrame, or ``None`` if the file could not be read.
        """
        try:
            df = None
            # Try the separators in order
            for sep in [',', '\t', ';']:
                try:
                    candidate = pd.read_csv(filepath, sep=sep)
                except Exception:
                    continue
                df = candidate
                if len(candidate.columns) > 1:
                    break

            if df is None:
                raise ValueError(f"no se pudo leer {filepath} con ningún separador")

            print(f"✅ Dataset de Kaggle cargado: {df.shape[0]} filas, {df.shape[1]} columnas")
            print(f"Columnas: {list(df.columns)}")
            return df

        except Exception as e:
            print(f"❌ Error cargando dataset de Kaggle: {e}")
            return None

    def process_kaggle_dataset(self, df):
        """Convert dataset rows into article dicts.

        Text columns are those whose name contains ``headline``, ``title``,
        ``description``, ``text`` or ``content``. Among the remaining columns,
        the last one whose name contains ``category``, ``class``, ``label`` or
        ``topic`` is used as the category. Rows with no text are skipped.

        Returns:
            A list of article dicts, one per row that has text.
        """
        processed_articles = []

        # Detect the relevant columns
        text_columns = []
        category_column = None

        for col in df.columns:
            col_lower = col.lower()
            if any(keyword in col_lower for keyword in ['headline', 'title', 'description', 'text', 'content']):
                text_columns.append(col)
            elif any(keyword in col_lower for keyword in ['category', 'class', 'label', 'topic']):
                category_column = col

        print(f"📝 Columnas de texto detectadas: {text_columns}")
        print(f"📂 Columna de categoría detectada: {category_column}")

        for idx, row in df.iterrows():
            # Concatenate the non-null text of every relevant column
            text_content = "".join(
                str(row[col]) + " " for col in text_columns if pd.notna(row[col])
            )

            if not text_content.strip():
                continue

            # The first text column acts as the title. A missing value must not
            # become the literal string "nan", which would make unrelated rows
            # look like duplicates when they are later de-duplicated by title.
            title_value = row[text_columns[0]]
            title = str(title_value) if pd.notna(title_value) else f"Article {idx}"

            if category_column and pd.notna(row[category_column]):
                category = str(row[category_column])
            else:
                category = 'Unknown'

            processed_articles.append({
                'title': title,
                'description': text_content[:300] + "...",
                'full_content': text_content,
                'category': category,
                'source': 'kaggle_dataset',
                'content_type': 'dataset',
                'published': datetime.now().isoformat(),
                'link': f"kaggle_article_{idx}"
            })

        print(f"✅ Procesados {len(processed_articles)} artículos del dataset de Kaggle")
        return processed_articles

    def integrate_datasets(self, crawled_articles=None, kaggle_articles=None):
        """Combine both article lists into one de-duplicated DataFrame.

        Rows sharing a title are collapsed (first occurrence kept) and rows
        without ``full_content`` are dropped. The result is also stored in
        ``self.integrated_dataset``.

        Returns:
            The integrated DataFrame; empty, but with the standard article
            columns, when no articles were supplied.
        """
        all_articles = []

        if crawled_articles:
            all_articles.extend(crawled_articles)
            print(f"➕ Añadidos {len(crawled_articles)} artículos crawleados")

        if kaggle_articles:
            all_articles.extend(kaggle_articles)
            print(f"➕ Añadidos {len(kaggle_articles)} artículos del dataset")

        # Build the integrated DataFrame
        df_integrated = pd.DataFrame(all_articles)

        # Guarantee the standard columns so the steps below (and later
        # statistics) do not raise KeyError when the input is empty or sparse.
        missing = [col for col in self.ARTICLE_COLUMNS if col not in df_integrated.columns]
        if missing:
            df_integrated = df_integrated.reindex(columns=[*df_integrated.columns, *missing])

        # Cleaning and normalization
        df_integrated = df_integrated.drop_duplicates(subset=['title'], keep='first')
        df_integrated = df_integrated.dropna(subset=['full_content'])
        df_integrated['processed_at'] = datetime.now().isoformat()

        print(f"🔄 Dataset integrado: {len(df_integrated)} artículos únicos")

        self.integrated_dataset = df_integrated
        return df_integrated

    def preprocess_integrated_dataset(self, df):
        """Add token and feature columns to ``df`` (modified in place).

        When preprocessing of an article fails, a crude whitespace-based
        fallback is stored instead so one bad row cannot abort the run.

        Returns:
            The same DataFrame with ``processed_tokens``, ``text_features``,
            ``word_count``, ``sentence_count`` and ``vocab_richness`` added
            and, if present, ``category`` normalized to lower case.
        """
        print("🔧 Iniciando preprocesamiento...")

        # Preprocess every text, tolerating per-article failures
        processed_tokens = []
        text_features = []

        for idx, content in enumerate(df['full_content']):
            try:
                tokens = self.preprocessor.preprocess_document(content)
                features = self.preprocessor.extract_features(content)
                processed_tokens.append(tokens)
                text_features.append(features)
            except Exception as e:
                print(f"⚠️ Error procesando artículo {idx}: {e}")
                # Basic fallback; coerce to str so it cannot fail on odd values
                raw_text = content if isinstance(content, str) else str(content)
                basic_tokens = raw_text.lower().split()[:50]  # First 50 words
                basic_features = {
                    'word_count': len(raw_text.split()),
                    'char_count': len(raw_text),
                    'sentence_count': 1,
                    'avg_word_length': 5,
                    'vocabulary_richness': 0.5,
                    'most_common_words': []
                }
                processed_tokens.append(basic_tokens)
                text_features.append(basic_features)

        df['processed_tokens'] = processed_tokens
        df['text_features'] = text_features

        # Flatten selected features into their own columns
        df['word_count'] = df['text_features'].apply(lambda x: x['word_count'])
        df['sentence_count'] = df['text_features'].apply(lambda x: x['sentence_count'])
        df['vocab_richness'] = df['text_features'].apply(lambda x: x['vocabulary_richness'])

        # Normalize categories
        if 'category' in df.columns:
            df['category'] = df['category'].str.lower().str.strip()
            df['category'] = df['category'].fillna('unknown')

        print("✅ Preprocesamiento completado")
        return df

    def get_dataset_statistics(self, df):
        """Summarize a preprocessed dataset.

        Returns:
            A dict with the article total, number of distinct sources,
            per-content-type counts, mean word and sentence counts, the ten
            most frequent categories and the min/max of ``published``.
        """
        stats = {
            'total_articles': len(df),
            'unique_sources': df['source'].nunique(),
            'content_types': df['content_type'].value_counts().to_dict(),
            'avg_word_count': df['word_count'].mean(),
            'avg_sentence_count': df['sentence_count'].mean(),
            'categories': df['category'].value_counts().head(10).to_dict() if 'category' in df.columns else {},
            # min()/max() run on the stored values as-is; no date parsing happens here
            'date_range': {
                'earliest': df['published'].min() if 'published' in df.columns else None,
                'latest': df['published'].max() if 'published' in df.columns else None
            }
        }

        return stats

    def save_integrated_dataset(self, df, filename='integrated_news_dataset.csv'):
        """Write ``df`` to ``filename`` as UTF-8 CSV.

        Token lists are saved space-joined and feature dicts as JSON strings.
        The input frame is not modified.
        """
        # Work on a copy and serialize the list/dict columns
        df_save = df.copy()
        df_save['processed_tokens'] = df_save['processed_tokens'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
        df_save['text_features'] = df_save['text_features'].apply(lambda x: json.dumps(x) if isinstance(x, dict) else str(x))

        df_save.to_csv(filename, index=False, encoding='utf-8')
        print(f"💾 Dataset integrado guardado en {filename}")

In [None]:
# =============================================================================
# EJECUCIÓN PRINCIPAL
# =============================================================================

def _crawl_rss_sources(crawler, rss_feeds, max_articles=10):
    """Crawl every RSS feed and return all collected articles; failures are reported and skipped."""
    collected = []
    print("🔍 Intentando crawlear RSS feeds...")
    for feed in rss_feeds:
        try:
            articles = crawler.crawl_rss_feed(feed, max_articles=max_articles)
            if articles:
                collected.extend(articles)
                print(f"✅ {len(articles)} artículos obtenidos de {feed}")
            else:
                print(f"⚠️ No se pudieron obtener artículos de {feed}")
        except Exception as e:
            print(f"❌ Error con {feed}: {e}")
    return collected


def _crawl_web_sources(crawler, web_sources, max_articles_per_site=2):
    """Scrape the given sites and return their articles, or an empty list on failure."""
    print("\n🌐 Intentando crawlear sitios web...")
    try:
        web_articles = crawler.crawl_news_websites(
            web_sources, max_articles_per_site=max_articles_per_site
        )
        if web_articles:
            print(f"✅ {len(web_articles)} artículos obtenidos de sitios web")
            return web_articles
        print("⚠️ No se pudieron obtener artículos de sitios web")
    except Exception as e:
        print(f"❌ Error crawleando sitios web: {e}")
    return []


def _build_example_dataset():
    """Return a small hard-coded DataFrame (headline, category, short_description) used as demo data."""
    example_data = {
        'headline': [
            'Breaking: New AI Technology Revolutionizes Healthcare Diagnosis',
            'Climate Change Effects Visible in Arctic Region Show Alarming Trends',
            'Stock Market Reaches New Record High Driven by Tech Sector',
            'Major Breakthrough in Renewable Energy Research Promises Cost Reduction',
            'International Trade Agreements Under Review by Multiple Nations',
            'Education Reform Bill Passes Senate Vote with Bipartisan Support',
            'Tech Giant Announces New Product Launch with Revolutionary Features',
            'Environmental Protection Measures Implemented Across National Parks',
            'Economic Recovery Shows Positive Signs in Manufacturing Sector',
            'Scientific Discovery Changes Understanding of Quantum Physics'
        ],
        'category': [
            'TECHNOLOGY', 'ENVIRONMENT', 'BUSINESS', 'SCIENCE', 'POLITICS',
            'EDUCATION', 'TECHNOLOGY', 'ENVIRONMENT', 'BUSINESS', 'SCIENCE'
        ],
        'short_description': [
            'Revolutionary AI system promises to transform patient care and medical diagnosis through advanced machine learning algorithms that can detect diseases earlier than traditional methods.',
            'Scientists document rapid changes in Arctic ice patterns and wildlife behavior due to global warming trends, with implications for sea level rise and climate stability.',
            'Markets surge to unprecedented levels driven by technology sector growth and investor confidence in artificial intelligence and cloud computing companies.',
            'Researchers develop highly efficient solar panel technology breakthrough that could revolutionize clean energy production and reduce costs by up to 40 percent.',
            'Government officials review existing trade partnerships and agreements to improve economic relationships and address supply chain disruptions affecting global commerce.',
            'New legislation aims to modernize educational standards nationwide and improve student outcomes through increased funding and curriculum updates.',
            'Company unveils innovative consumer technology product line featuring cutting-edge features like advanced AI integration and sustainable materials.',
            'New policies protect endangered species and natural habitats through comprehensive conservation efforts and increased enforcement of environmental regulations.',
            'Economic indicators suggest sustained growth and job creation across multiple industry sectors, with manufacturing showing particular strength.',
            'Quantum physics research reveals new fundamental particles that challenge existing scientific theories and could lead to breakthrough technologies.'
        ]
    }
    return pd.DataFrame(example_data)


def _print_results(df_processed, stats):
    """Print the statistics, three sample articles and the crawled-vs-dataset breakdown."""
    print("\n" + "=" * 60)
    print("RESULTADOS Y ESTADÍSTICAS")
    print("=" * 60)

    print(f"📊 Total de artículos procesados: {stats['total_articles']}")
    print(f"🌐 Fuentes únicas: {stats['unique_sources']}")
    print(f"📝 Promedio de palabras por artículo: {stats['avg_word_count']:.1f}")
    print(f"📄 Promedio de oraciones por artículo: {stats['avg_sentence_count']:.1f}")

    print("\n📂 Distribución por tipo de contenido:")
    for content_type, count in stats['content_types'].items():
        print(f"  - {content_type}: {count} artículos")

    print("\n🏷️  Top categorías:")
    for category, count in list(stats['categories'].items())[:5]:
        print(f"  - {category}: {count} artículos")

    # Show a few processed articles
    print("\n📰 Ejemplos de artículos procesados:")
    for i, (_, article) in enumerate(df_processed.head(3).iterrows()):
        print(f"\n--- Artículo Procesado {i+1} ---")
        print(f"Título: {article['title'][:80]}...")
        print(f"Categoría: {article.get('category', 'N/A')}")
        print(f"Fuente: {article['source']}")
        print(f"Tipo de contenido: {article['content_type']}")
        print(f"Palabras: {article['word_count']}")
        if isinstance(article['processed_tokens'], list):
            print(f"Tokens procesados: {' '.join(article['processed_tokens'][:10])}...")
        else:
            print(f"Tokens procesados: {str(article['processed_tokens'])[:50]}...")

    # Crawled vs. dataset articles, counted with one vectorised comparison
    # instead of two Python-level passes over iterrows()
    from_dataset = df_processed['source'] == 'kaggle_dataset'
    dataset_count = int(from_dataset.sum())
    crawled_count = int((~from_dataset).sum())

    print("\n🔍 Resumen de fuentes:")
    print(f"  - Artículos crawleados: {crawled_count}")
    print(f"  - Artículos del dataset: {dataset_count}")


def main():
    """Run the whole pipeline: crawl, build the demo dataset, integrate, preprocess, report, save.

    Returns:
        tuple: ``(df_processed, stats)`` - the preprocessed DataFrame and the
        statistics dict produced by ``DatasetIntegrator.get_dataset_statistics``.
    """
    print("🚀 Iniciando Sistema de Crawling y Procesamiento de Noticias\n")

    # Set up the components
    crawler = NewsCrawler()
    integrator = DatasetIntegrator()

    # 1. NEWS CRAWLING
    print("=" * 60)
    print("MÓDULO 1: CRAWLING Y RECOPILACIÓN DE DOCUMENTOS")
    print("=" * 60)

    # Popular RSS feeds. The Reddit URL is a feed, not an HTML page, so it
    # belongs here: the website scraper looks for <a> tags and finds none in it.
    rss_feeds = [
        'http://rss.cnn.com/rss/edition.rss',
        'http://feeds.bbci.co.uk/news/rss.xml',
        'https://rss.reuters.com/news',
        'https://feeds.npr.org/1001/rss.xml',
        'https://rss.dw.com/rdf/rss-en-all',
        'https://www.reddit.com/r/worldnews/.rss'
    ]

    # Alternative web sites (HTML pages scraped for article links)
    web_sources = [
        'https://news.ycombinator.com',
        'https://techcrunch.com'
    ]

    crawled_articles = _crawl_rss_sources(crawler, rss_feeds, max_articles=10)
    crawled_articles.extend(_crawl_web_sources(crawler, web_sources, max_articles_per_site=2))

    print(f"\n📊 Total de artículos crawleados: {len(crawled_articles)}")

    # Show a couple of crawled articles
    if crawled_articles:
        print("\n📰 Ejemplos de artículos crawleados:")
        for i, article in enumerate(crawled_articles[:2]):
            print(f"\n--- Artículo Crawleado {i+1} ---")
            print(f"Título: {article['title'][:70]}...")
            print(f"Fuente: {article['source']}")
            print(f"Tipo: {article['content_type']}")
            print(f"Descripción: {article['description'][:100]}...")

    # 2. DATASET LOADING (a built-in example stands in for the Kaggle file)
    print("\n" + "=" * 60)
    print("MÓDULO 2: CARGA DEL DATASET DE KAGGLE")
    print("=" * 60)

    print("📝 Creando dataset de ejemplo para demostración...")
    df_kaggle = _build_example_dataset()
    kaggle_articles = integrator.process_kaggle_dataset(df_kaggle)

    # 3. INTEGRATION AND PROCESSING
    print("\n" + "=" * 60)
    print("MÓDULO 3: INTEGRACIÓN DEL DATASET")
    print("=" * 60)

    df_integrated = integrator.integrate_datasets(crawled_articles, kaggle_articles)
    df_processed = integrator.preprocess_integrated_dataset(df_integrated)
    stats = integrator.get_dataset_statistics(df_processed)

    # 4. RESULTS
    _print_results(df_processed, stats)

    # Save the outputs; a failure here must not discard the in-memory results
    try:
        integrator.save_integrated_dataset(df_processed)
        crawler.save_crawled_data(crawled_articles)

        print("\n✅ ¡Pipeline completado exitosamente!")
        print("\nArchivos generados:")
        print("- integrated_news_dataset.csv (Dataset completo procesado)")
        print("- crawled_news.json (Datos crawleados en formato JSON)")
    except Exception as e:
        print(f"⚠️ Error guardando archivos: {e}")
        print("✅ Pipeline completado (sin guardar archivos)")

    return df_processed, stats

# Run the pipeline
if __name__ == "__main__":
    final_dataset, dataset_stats = main()

🚀 Iniciando Sistema de Crawling y Procesamiento de Noticias

MÓDULO 1: CRAWLING Y RECOPILACIÓN DE DOCUMENTOS
🔍 Intentando crawlear RSS feeds...
🔍 Crawling RSS: http://rss.cnn.com/rss/edition.rss
⚠️  Error extrayendo contenido de https://www.cnn.com/collections/intl-trump-040223/: 404 Client Error: Not Found for url: https://edition.cnn.com/collections/intl-trump-040223/
⚠️  Error extrayendo contenido de https://www.cnn.com/collections/intl-ukraine-030423/: 404 Client Error: Not Found for url: https://edition.cnn.com/collections/intl-ukraine-030423/
✅ Obtenidos 10 artículos de RSS
✅ 10 artículos obtenidos de http://rss.cnn.com/rss/edition.rss
🔍 Crawling RSS: http://feeds.bbci.co.uk/news/rss.xml
✅ Obtenidos 10 artículos de RSS
✅ 10 artículos obtenidos de http://feeds.bbci.co.uk/news/rss.xml
🔍 Crawling RSS: https://rss.reuters.com/news
✅ Obtenidos 0 artículos de RSS
⚠️ No se pudieron obtener artículos de https://rss.reuters.com/news
🔍 Crawling RSS: https://feeds.npr.org/1001/rss.xml
✅ Obt