# Practice 2 Text Normalization
***
Rodriguez Nuñez Diego Eduardo

## Collect information from different sources
• Get news using RSS feeds from La Jornada and Expansión platforms using the following URLs:

    • https://www.jornada.com.mx/v7.0/cgi/rss.php

    • https://expansion.mx/canales-rss
    
• Data collection should be done once a day for 5 days, at an agreed time

• News can be repeated from one day to the next, so you must avoid collecting it again

• For each news article extract:

    • Title (<title>)
    • Content summary (<description>)
    • Section
    • URL (<link>)
    • Date of publication (<pubDate>)

• Sections of interest are:

    • Sports
    • Economy
    • Science and technology
    • Culture

In [1]:
import feedparser
import pandas as pd
import spacy
from spacy import displacy
import re

In [2]:
# Lookup table from the three-letter English month abbreviation used in RSS
# publication dates to its zero-padded month number ("Jan" -> "01").
months = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}

In [3]:
# Column names of the raw corpus CSV, in output order.
columns = [
    'Source',
    'Title',
    'Content',
    'Section',
    'URL',
    'Date',
]

In [4]:
def format_date(date):
    """Convert an RSS publication date string into ``dd/mm/yyyy``.

    The input is expected in the RFC 822 style used by RSS ``pubDate``,
    e.g. ``"Mon, 05 Feb 2024 10:00:00 GMT"``. The weekday prefix is optional
    and the day may have one or two digits; everything after the year is
    ignored.

    Parameters
    ----------
    date : str
        Raw publication date as delivered by the feed.

    Returns
    -------
    str or None
        The date formatted as ``dd/mm/yyyy``. A month abbreviation that is
        not a key of ``months`` is rendered as ``"00"``. ``None`` is returned
        when ``date`` is not a string or does not start with a date in the
        expected layout.
    """
    # A missing publication date is not a string; re.match would raise on it.
    if not isinstance(date, str):
        return None

    # Optional "Www," weekday, 1-2 digit day, month abbreviation, 4-digit year.
    match = re.match(r"\s*(?:\w{3},\s*)?(\d{1,2})\s+(\w{3})\s+(\d{4})", date)
    if match is None:
        return None

    day, month_str, year = match.groups()

    # Translate the month abbreviation to its number; "00" marks an unknown one.
    month = months.get(month_str, "00")

    # Single-digit days are zero-padded so the output is always dd/mm/yyyy.
    return f"{day.zfill(2)}/{month}/{year}"

## Raw Data Corpus

In [5]:
# RSS feeds of La Jornada, one per section of interest.
la_jornada_urls = [
    "https://www.jornada.com.mx/rss/deportes.xml?v=1",
    "https://www.jornada.com.mx/rss/economia.xml?v=1",
    "https://www.jornada.com.mx/rss/ciencias.xml?v=1",
    "https://www.jornada.com.mx/rss/cultura.xml?v=1",
]

# RSS feeds of Expansión.
expansion_urls = [
    "https://www.expansion.mx/rss/economia",
    "https://www.expansion.mx/rss/tecnologia",
]

In [6]:
def get_news(url):
    """Download one RSS feed and return its entries as a list of row dicts.

    Parameters
    ----------
    url : str
        Address of the RSS feed, passed to ``feedparser.parse``.

    Returns
    -------
    list of dict
        One dict per feed entry with the keys ``'Source'``, ``'Title'``,
        ``'Content'``, ``'Section'``, ``'URL'`` and ``'Date'``. ``'Source'``
        is the feed title and ``'Section'`` is the first word that follows a
        ``-`` or ``:`` in that title. Any field the feed does not provide is
        ``None``. The list is empty when the feed has no entries.
    """
    feed = feedparser.parse(url)

    # Read fields with .get(): a feed that failed to load, or an entry missing
    # an element, then yields None instead of raising AttributeError.
    source = feed.feed.get("title")

    # The section name is taken from the feed title, after a '-' or ':'.
    section_match = re.search(r"[-:]\s*(\w+)", source) if source else None
    section = section_match.group(1) if section_match else None

    news = []
    for entry in feed.entries:
        published = entry.get("published")
        news.append({
            'Source': source,
            'Title': entry.get("title"),
            'Content': entry.get("description"),
            'Section': section,
            'URL': entry.get("link"),
            # Publication date as dd/mm/yyyy; None when the entry has no date.
            'Date': format_date(published) if published else None,
        })
    return news

In [14]:
csv_file = "raw_data_corpus2.csv"

# Collect today's articles from every feed into a single frame. Passing
# `columns` keeps the schema stable even when no feed returned any article.
todays_news = [
    article
    for url_list in (la_jornada_urls, expansion_urls)
    for url in url_list
    for article in get_news(url)
]
todays_df = pd.DataFrame(todays_news, columns=columns)

# Load what previous runs already stored, if anything.
try:
    stored_df = pd.read_csv(csv_file)
    # Earlier append-mode runs may have left header rows inside the data.
    stored_df = stored_df[stored_df["URL"] != "URL"]
except (FileNotFoundError, pd.errors.EmptyDataError):
    stored_df = pd.DataFrame(columns=columns)

# Merge old and new rows. An article seen on a previous day has the same URL,
# so the first (oldest) copy is kept and later repeats are discarded.
frames = [frame for frame in (stored_df, todays_df) if not frame.empty]
if frames:
    raw_corpus = pd.concat(frames, ignore_index=True).drop_duplicates(subset="URL", keep="first")
else:
    raw_corpus = pd.DataFrame(columns=columns)

# Rewrite the whole file so the header appears exactly once.
raw_corpus.to_csv(csv_file, index=False)

print(f"Data saved to {csv_file}")


Data saved to raw_data_corpus2.csv


## Normalized Data Corpus

In [8]:
# Spanish spaCy pipeline used to tokenize, tag and lemmatize the news text.
nlp = spacy.load("es_core_news_sm")

# Part-of-speech tags whose tokens are discarded during normalization:
# determiners, adpositions, conjunctions and pronouns.
stop_Pos = {
    'DET',
    'ADP',
    'SCONJ',
    'CCONJ',
    'PRON',
}

In [9]:
def normalize_text(text):
    """Return the lemmas of the tokens kept from ``text``, joined by spaces.

    The text is processed with ``nlp``. A token is dropped when it is flagged
    as a stop word or when its part-of-speech tag is listed in ``stop_Pos``;
    every remaining token contributes its lemma, in document order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.pos_ in stop_Pos)
    ]
    return " ".join(kept_lemmas)

In [10]:
# Build the normalized corpus: the raw columns are kept and a normalized
# version of the content and of the title is added next to them.
# NOTE(review): this reads "raw_data_corpus.csv" while the collection cell
# writes "raw_data_corpus2.csv" — confirm which file is intended.
df_normalized = pd.read_csv("raw_data_corpus.csv")

# Empty CSV cells are read back as NaN (a float), which the spaCy pipeline
# rejects; treat missing text as an empty string before normalizing.
for raw_col, normalized_col in (
    ("Content", "Normalized Content"),
    ("Title", "Normalized Title"),
):
    df_normalized[normalized_col] = (
        df_normalized[raw_col].fillna("").astype(str).apply(normalize_text)
    )

df_normalized.to_csv("normalized_data_corpus.csv", index=False)