# Practice 2 Text Normalization
***
Rodriguez Nuñez Diego Eduardo

## Collect information from different sources
• Get news using RSS feeds from La Jornada and Expansión platforms using the following URLs:

    • https://www.jornada.com.mx/v7.0/cgi/rss.php

    • https://expansion.mx/canales-rss
    
• The data collection should be done once a day for 5 days at an agreed time

• News items can repeat from one day to the next, so you must avoid collecting the same item again

• For each news article extract:

    • Title (<title>)
    • Content summary (<description>)
    • Section
    • URL (<link>)
    • Date of publication (<pubDate>)

• Sections of interest are:

    • Sports
    • Economy
    • Science and technology
    • Culture

In [1]:
import feedparser
import pandas as pd
import spacy
from spacy import displacy
import re

In [2]:
# Lookup from English three-letter month abbreviation to its zero-padded
# two-digit month number ("Jan" -> "01", ..., "Dec" -> "12").
months = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        (
            "Jan", "Feb", "Mar", "Apr", "May", "Jun",
            "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
        ),
        start=1,
    )
}

In [3]:
# Names of the six fields recorded for every news article.
columns = [
    'Source',
    'Title',
    'Content',
    'Section',
    'URL',
    'Date',
]

In [4]:
def format_date(date):
    """Convert an RFC 822 style date string to ``dd/mm/yyyy``.

    The input is expected to start like ``"Mon, 05 Feb 2024 ..."``; anything
    after the year is ignored.

    Args:
        date: Date string as published in the feed.

    Returns:
        The date formatted as ``dd/mm/yyyy``, or ``None`` when ``date`` is not
        a string or does not start with the expected pattern. An unrecognized
        month abbreviation is rendered as ``"00"``.
    """
    # A missing or non-text value cannot be parsed; treat it like a
    # malformed date instead of letting re.match raise TypeError.
    if not isinstance(date, str):
        return None

    # RFC 822 allows the day to have one or two digits ("Mon, 5 Feb 2024"),
    # so accept both and zero-pad below.
    match = re.match(r"\w{3}, (\d{1,2}) (\w{3}) (\d{4})", date)
    if not match:
        return None  # The date could not be extracted

    day, month_str, year = match.groups()

    # Translate the month abbreviation to its number; "00" marks an
    # abbreviation that is not in the lookup table.
    month = months.get(month_str, "00")

    return f"{day.zfill(2)}/{month}/{year}"

## Raw Data Corpus

In [5]:
# Per-section RSS feeds of La Jornada.
la_jornada_urls = [
    "https://www.jornada.com.mx/rss/deportes.xml?v=1",
    "https://www.jornada.com.mx/rss/economia.xml?v=1",
    "https://www.jornada.com.mx/rss/ciencias.xml?v=1",
    "https://www.jornada.com.mx/rss/cultura.xml?v=1",
]

# Per-section RSS feeds of Expansión.
expansion_urls = [
    "https://www.expansion.mx/rss/economia",
    "https://www.expansion.mx/rss/tecnologia",
]

In [6]:
def get_news(url):
    """Parse the RSS feed at ``url`` and return its entries as dictionaries.

    Args:
        url: Address of the RSS feed to parse.

    Returns:
        A list with one dict per feed entry, holding the keys ``Source``,
        ``Title``, ``Content``, ``Section``, ``URL`` and ``Date``. Missing
        entry fields are replaced by placeholder strings; ``Date`` is
        whatever ``format_date`` returns for the entry's published date.
    """
    parsed = feedparser.parse(url)

    # The feed title serves as the source name, with a placeholder when
    # the feed does not provide one.
    source_name = parsed.feed.get('title', 'Unknown Source')

    # The section is the first word that follows a '-' or ':' in the title.
    found = re.search(r"[-:]\s*(\w+)", source_name)
    if found:
        section = found.group(1)
    else:
        section = 'Unknown Section'

    return [
        {
            'Source': source_name,
            'Title': item.get('title', 'No Title'),
            'Content': item.get('description', 'No Description'),
            'Section': section,
            'URL': item.get('link', 'No URL'),
            # Stored in dd/mm/yyyy form
            'Date': format_date(item.get('published', 'No Date')),
        }
        for item in parsed.entries
    ]

In [7]:
from pathlib import Path

raw_corpus_path = Path("raw_data_corpus.csv")

# Fetch every configured feed and append its articles to the raw corpus CSV.
for url_list in [la_jornada_urls, expansion_urls]:
    for url in url_list:
        news = get_news(url)
        # Passing the column names fixes the column order of every append.
        df = pd.DataFrame(news, columns=columns)
        if df.empty:
            # Nothing was returned for this feed; leave the file untouched.
            continue
        # The following cells load this file with pd.read_csv and select
        # columns by name, so the header row must be present exactly once:
        # write it only when the file is being created (or is still empty).
        needs_header = (
            not raw_corpus_path.exists()
            or raw_corpus_path.stat().st_size == 0
        )
        df.to_csv(raw_corpus_path, mode='a', header=needs_header, index=False)

print("Data saved")

Data saved


In [8]:
# Load the accumulated raw corpus and report how many rows are exact
# repeats of an earlier row.
df = pd.read_csv("raw_data_corpus.csv")
duplicate_mask = df.duplicated()
print("duplicates:", duplicate_mask.sum())

duplicates: 77


In [9]:
# Rewrite the raw corpus keeping only the first row for each
# (Title, Content, URL) combination.
df_acumulado = pd.read_csv("raw_data_corpus.csv")
dedup_keys = ['Title', 'Content', 'URL']
df_final = df_acumulado.drop_duplicates(subset=dedup_keys)
df_final.to_csv("raw_data_corpus.csv", index=False)

## Normalized Data Corpus

In [2]:
# spaCy pipeline used to tokenize, tag and lemmatize the news text.
nlp = spacy.load("es_core_news_sm")

# Part-of-speech tags whose tokens are discarded during normalization.
stop = {
    "ADP",    # adpositions
    "AUX",    # auxiliary verbs
    "CCONJ",  # coordinating conjunctions
    "DET",    # determiners
    "NUM",    # numerals
    "PART",   # particles
    "PRON",   # pronouns
    "SCONJ",  # subordinating conjunctions
}

In [3]:
def normalize_text(text):
    """Return ``text`` reduced to the lemmas of its retained tokens.

    The text is run through the ``nlp`` pipeline; tokens whose
    part-of-speech tag is listed in ``stop`` are dropped, and the lemmas of
    the remaining tokens are joined with single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The space-separated lemmas of the kept tokens.
    """
    kept_lemmas = [
        tok.lemma_            # lemmatization
        for tok in nlp(text)  # tokenization
        if tok.pos_ not in stop  # filter by grammatical category
    ]
    return " ".join(kept_lemmas)

In [4]:
# Load the raw corpus and make sure both text columns hold plain strings.
df2 = pd.read_csv("raw_data_corpus.csv")
# read_csv loads empty cells as NaN, and astype(str) alone would turn those
# into the literal text "nan", which would then be normalized as if it were
# article content. Replace missing values with an empty string first.
df2['Content'] = df2['Content'].fillna("").astype(str)
df2['Title'] = df2['Title'].fillna("").astype(str)

In [5]:
# Replace each text column with its normalized form, then save the result.
for text_column in ('Content', 'Title'):
    df2[text_column] = df2[text_column].apply(normalize_text)

df2.to_csv("normalized_data_corpus_prueba.csv", index=False)

In [6]:
# Stack the saved normalized file with the in-memory frame, keep the first
# row for each (Title, Content, URL) combination, and write the final corpus.
df_acum = pd.read_csv("normalized_data_corpus_prueba.csv")
frames_to_merge = [df_acum, df2]
df_norm = pd.concat(frames_to_merge).drop_duplicates(
    subset=['Title', 'Content', 'URL']
)
df_norm.to_csv("normalized_data_corpus.csv", index=False)