In [2]:
import os
from datetime import datetime
from email.utils import parsedate_to_datetime

import feedparser
import pandas as pd
import spacy

In [None]:
def format_date(date):
    """Convert an RFC 822/2822 date string (the RSS ``pubDate`` style) to ``dd/mm/YYYY``.

    Parameters
    ----------
    date : str
        Date text such as ``"Tue, 10 Sep 2024 12:00:00 GMT"`` or
        ``"Tue, 10 Sep 2024 12:00:00 -0600"``.

    Returns
    -------
    str or None
        The calendar date exactly as written in the input (no timezone
        conversion is applied), formatted as ``dd/mm/YYYY``. ``None`` when the
        value is missing, not a string, or cannot be parsed.
    """
    if not isinstance(date, str) or not date.strip():
        return None
    try:
        # strptime's %Z only accepts a handful of zone names (UTC, GMT and the
        # local zone) and rejects numeric offsets such as "-0600"; it is also
        # locale-dependent for %a/%b. The e-mail date parser handles both
        # zone forms and always expects English day/month names.
        parsed = parsedate_to_datetime(date)
    except (TypeError, ValueError):
        # Depending on the Python version, malformed input raises either one.
        return None
    return parsed.strftime("%d/%m/%Y")
    
# RSS feed URLs for La Jornada: the deportes, economia, ciencias and cultura feeds.
la_jornada_urls = [
    "https://www.jornada.com.mx/rss/deportes.xml?v=1",
    "https://www.jornada.com.mx/rss/economia.xml?v=1",
    "https://www.jornada.com.mx/rss/ciencias.xml?v=1",
    "https://www.jornada.com.mx/rss/cultura.xml?v=1"
]
# RSS feed URLs for Expansión: the economia and tecnologia feeds.
expansion_urls = [
    "https://www.expansion.mx/rss/economia",
    "https://www.expansion.mx/rss/tecnologia"
]
def extract_section(entry, source):
    """Work out the section label for one feed entry.

    Parameters
    ----------
    entry : mapping
        Feed entry; only its ``'category'`` key is consulted.
    source : str
        Title of the feed the entry came from.

    Returns
    -------
    str
        For sources containing ``'La Jornada'``: the stripped text after the
        last ``":"`` in ``source``, or ``'Sin Sección'`` when there is no colon.
        For any other source: the entry's ``'category'``, defaulting to
        ``'Sin Sección'``.
    """
    # Every other outlet: rely on the category carried by the entry itself.
    if 'La Jornada' not in source:
        return entry.get('category', 'Sin Sección')

    # La Jornada: the section is read from the source title, after the last colon.
    if ":" not in source:
        return "Sin Sección"
    _, _, section = source.rpartition(":")
    return section.strip()


def get_news(url):
    """Download one RSS feed and flatten its entries into records.

    Parameters
    ----------
    url : str
        Address of the feed, handed to ``feedparser.parse``.

    Returns
    -------
    list of dict
        One dict per feed entry with the keys ``Source``, ``Title``,
        ``Content``, ``Section``, ``URL`` and ``Date``. The list is empty when
        the feed yields no entries.
    """
    feed = feedparser.parse(url)

    # feedparser generally records download/parse problems on the result
    # (the "bozo" flag) instead of raising, so a broken feed would otherwise be
    # indistinguishable from an empty one. Report it, but keep returning [].
    if feed.get('bozo') and not feed.entries:
        print(f"No se pudo procesar {url}: {feed.get('bozo_exception')}")

    fuente = feed.feed.get('title', 'Desconocido')
    return [
        {
            'Source': fuente,
            'Title': entry.get('title', 'Sin Título'),
            'Content': entry.get('description', ''),
            'Section': extract_section(entry, fuente),
            'URL': entry.get('link', ''),
            # None when the entry has no parseable publication date.
            'Date': format_date(entry.get('published', ''))
        }
        for entry in feed.entries
    ]

csv_file = 'noticias2.csv'
# Column layout of the corpus. Fixing it keeps every frame well-formed (in
# particular, it guarantees a 'URL' column) even when nothing was downloaded.
news_columns = ['Source', 'Title', 'Content', 'Section', 'URL', 'Date']

# Load the previously saved corpus; start empty if the file is missing or blank.
try:
    existing_df = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    existing_df = pd.DataFrame(columns=news_columns)

all_news = []

for url_list in [la_jornada_urls, expansion_urls]:
    for url in url_list:
        try:
            all_news.extend(get_news(url))
        except Exception as e:
            # feedparser does not define a `FeedParserError`; naming it in the
            # except clause would itself raise AttributeError while handling the
            # real failure. Report the problem and continue with the next feed.
            print(f"No se pudo procesar {url}: {str(e)}")

new_df = pd.DataFrame(all_news, columns=news_columns)

# Only concatenate non-empty frames: passing empty ones to concat is deprecated
# in recent pandas releases.
frames = [frame for frame in (existing_df, new_df) if not frame.empty]
if frames:
    combined_df = pd.concat(frames, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=news_columns)

# When an article was fetched again, keep the most recent copy.
combined_df = combined_df.drop_duplicates(subset=['URL'], keep='last')
combined_df.to_csv(csv_file, index=False)

# Count only rows that were not already in the saved corpus, rather than every
# row downloaded in this run.
added = max(len(combined_df) - len(existing_df), 0)
print(f"Se guardaron {added} nuevas noticias.")


In [4]:
# Load the spaCy pipeline named 'es_core_news_sm'; normalize_text below calls it.
nlp = spacy.load('es_core_news_sm')
# Read back the corpus CSV written by the scraping cell.
df = pd.read_csv('noticias2.csv')

def normalize_text(text):
    """Reduce a text to its lower-cased content lemmas.

    The text is run through the module-level ``nlp`` pipeline. Stop words,
    punctuation, non-alphabetic tokens and tokens tagged DET, CCONJ, SCONJ,
    PRON or ADP are dropped; the lemmas of the remaining tokens are
    lower-cased and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None``, NaN) and blank strings are accepted.

    Returns
    -------
    str
        The normalized text; ``""`` for missing or blank input.
    """
    # Empty CSV cells are read back by pandas as NaN (a float), which the
    # pipeline cannot process; blank strings would produce "" anyway.
    if not isinstance(text, str) or not text.strip():
        return ""

    excluded_pos = ('DET', 'CCONJ', 'SCONJ', 'PRON', 'ADP')
    doc = nlp(text)
    normalized_tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and token.is_alpha
        and token.pos_ not in excluded_pos
    ]
    return " ".join(normalized_tokens)

# Swap both text columns for their normalized versions; other columns are kept as is.
df = df.assign(
    Title=df['Title'].apply(normalize_text),
    Content=df['Content'].apply(normalize_text),
)

# Persist the normalized corpus to its own CSV file, without the index column.
normalized_data_corpus = 'normalized_data_corpus2.csv'
df.to_csv(normalized_data_corpus, index=False)
