In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
from collections import Counter
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from math import log
import re
from collections import Counter
import string
from nltk.stem.snowball import SpanishStemmer

# Leer las noticias

In [2]:
def read_news():
    """
    Read a TXT file and extract the full text from the news.
    :return: list of news
    :rtype: list
    """
    # Notimex
    input_file = 'news_notimex_exercises.txt'
    f = codecs.open(input_file, encoding='utf-8')
    lines_notimex = f.readlines()
    f.close()
    # Agencia EFE
    input_file = 'news_agencia_efe_exercises.txt'
    f = codecs.open(input_file, encoding='utf-8')
    lines_agencia_efe = f.readlines()
    f.close()
    lines_news = lines_notimex + lines_agencia_efe
    return lines_news


def get_top_10_keywords(words):
    """
    Return the top10 frequent words.
    :param list word: List containing all the words
    :return: ordered list with the top-10 most frequent words (in descending order)
    :rtype: list
    """
    counts = Counter(words)
    ordered_list = sorted(words, key=counts.get, reverse=True)
    unique_list = []
    for ol in ordered_list:
        if ol not in unique_list:
            unique_list.append(ol)
    return unique_list[:10]

def show_top_10_keywords(processed_articles):
    # Show the top-10 most frequent words
    for article_words in processed_articles:
        # Get te top-10 words
        top_10_words = get_top_10_keywords(article_words)
        #print(article_words)
        print("\nTOP 10 words:")
        for w in top_10_words:
            print(w)


# Preparo los datos

In [3]:
# Obtain the list with the news
all_news = read_news()
# Get 5 news articles
articles = all_news[0:5]

# Ejercicio 1
Extrae las 10 palabras más frecuentes del artículo

In [4]:
def exercise_1(articles):
    """
    EXERCISE 1:
        * Extract the top-10 most frequent words appearing in the articles
    :param list articles: List containing all the news we will process
    :return: list of news tokenized into words
    :rtype: list
    """
    print("EXERCISE 1...")
    # Show the top-10 most frequent words
    processed_articles = []
    for article in articles:
        # Get the list of words in the article
        article_words = word_tokenize(article)
        processed_articles.append(article_words)
    return processed_articles


# Obtengo las keywords
processed_articles = exercise_1(articles)
#print(processed_articles)
# Muestro las top-10 keywords
show_top_10_keywords(processed_articles)

EXERCISE 1...

TOP 10 words:
de
que
en
la
.
por
el
,
y
(

TOP 10 words:
,
de
la
y
en
del
el
que
.
los

TOP 10 words:
de
,
la
“
”
y
a
por
.
México

TOP 10 words:
de
,
que
y
en
el
.
del
la
para

TOP 10 words:
de
,
la
el
y
del
los
en
que
:


# Ejercicio 2
* Quita las stopwords
* Extrae las 10 palabras más frecuentes de los artículos

In [5]:
def remove_stopwords_from_all_news(news, custom_stopwords):
    """
    Method to remove the stop words from a sentence.
    :param str sentence: List of all news
    :param list stopwords: List containing the stopwords to remove from the sentences
    :return: list of news with the stopwords removed
    :rtype: list
    """
    important_words_in_news=[]
    for one_news in news:
        # Split the sentence into separate words
        # TODO
        # Only keep the important words
        # TODO
    return important_words_in_news



def exercise_2(articles):
    """
    EXERCISE 2: Most of the results are stopwords. Let's remove them from the articles first
        * Perform some normalization:
            * Remove the stopwords (we can also use custom stop words)
        * Extract the top-10 most frequent words appearing in the normalized articles
    :param list articles: List containing all the news we will process
    :return: list of news tokenized into words
    :rtype: list
    """
    print("EXERCISE 2...")
    custom_stopwords = ["Noticia", "Noticias"]
    custom_stopwords += stopwords.words('spanish')
    articles_no_stopwords = remove_stopwords_from_all_news(articles, custom_stopwords)
    return articles_no_stopwords

# Obtengo las keywords
processed_articles = exercise_2(articles)
# Muestro las top-10 keywords
show_top_10_keywords(processed_articles)

EXERCISE 2...

TOP 10 words:
.
,
(
)
La
solo
amonestados
jugadores
final
título

TOP 10 words:
,
.
“
”
comercio
(
)
Alianza
Mercosur
Pacífico

TOP 10 words:
,
“
”
.
México
Radio
UNAM
Universidad
Nacional
Autónoma

TOP 10 words:
,
.
México
”
TLCAN
“
Carstens
integración
beneficios
económica

TOP 10 words:
,
:
.
Tabasco
SITEM
2017
secretario
Educación
En
Diego


# Ejercicio 3
* Convierte el texto a minúscula
* Quita las stopwords
* Quita signos de puntuación
* Quita los acentos
* Extrae las 10 palabras más frecuentes de los artículos

In [6]:
def exercise_3(articles):
    """
    EXERCISE 3: There is further preprocessing we can perform
        * Perform some normalization:
            * Lowercase all words
            * Remove all the stopwords
            * Remove punctuation signs
            * Remove all accents from the words
        * Extract the top-10 most frequent words appearing in the normalized articles
    :param list articles: List containing all the news we will process
    :return: list of news tokenized into words
    :rtype: list
    """
    print("EXERCISE 3...")
    # Lowercase all the articles
    # TODO
    
    # Create a translation table
    table = dict(zip( #  Quitar tildes
        [ord(x) for x in u'áéíóúü'],
        [ord(x) for x in u'aeiouu']
    ))
    # Use the translation table to remove all the accents
    articles = [article.translate(table) for article in articles]
    # Prepare our custom list of stopwords
    custom_stopwords = ["Noticia", "Noticias"]
    custom_stopwords += stopwords.words('spanish')
    # Add all the punctuation signs to the list of stopwords
    custom_stopwords += list(string.punctuation)
    # lowercase all the stopwords
    # TODO
    articles_no_stopwords = remove_stopwords_from_all_news(articles, custom_stopwords)
    return articles_no_stopwords

# Obtengo las keywords
processed_articles = exercise_3(articles)
# Muestro las top-10 keywords
show_top_10_keywords(processed_articles)

EXERCISE 3...

TOP 10 words:
solo
amonestados
jugadores
final
titulo
mexico
7
abr
notimex
.-

TOP 10 words:
comercio
“
”
alianza
mercosur
ministros
pacifico
bloques
encuentro
argentina

TOP 10 words:
“
”
mexico
radio
unam
universidad
nacional
autonoma
violencia
equidad

TOP 10 words:
mexico
mas
”
tlcan
“
carstens
integracion
beneficios
economica
unidos

TOP 10 words:
educacion
sitem
tabasco
2017
secretario
gobierno
estatal
diego
animas
dijo


# Ejercicio 4
* Convierte el texto a minúscula
* Quita las stopwords
* Quita signos de puntuación
* Quita los acentos
* Obtén el "stem" de las palabras
* Extrae las 10 palabras más frecuentes de los artículos

In [7]:
def exercise_4(articles):
    """
    EXERCISE 4: There is further preprocessing we can perform
        * Perform some normalization:
            * Lowercase all words
            * Remove all the stopwords
            * Remove punctuation signs
            * Remove all accents from the words
            * Stem all the words (Stemming = obtaining something similar to the "root" form of the word)
        * Extract the top-10 most frequent words appearing in the normalized articles
    :param list articles: List containing all the news we will process
    :return: list of news tokenized into words
    :rtype: list
    """
    print("EXERCISE 4...")
    # Lowercase all the articles
    articles = [article.lower() for article in articles]
    # Create a translation table
    table = dict(zip( #  Quitar tildes
        [ord(x) for x in u'áéíóúü'],
        [ord(x) for x in u'aeiouu']
    ))
    # Use the translation table to remove all the accents
    articles = [article.translate(table) for article in articles]
    # Prepare our custom list of stopwords
    custom_stopwords = ["Noticia", "Noticias"]
    custom_stopwords += stopwords.words('spanish')
    # Add all the punctuation signs to the list of stopwords
    custom_stopwords += list(string.punctuation)
    # lowercase all the stopwords
    # TODO
    articles_no_stopwords = remove_stopwords_from_all_news(articles, custom_stopwords)
    # Get the spanish stemmer
    stemmer = SpanishStemmer()
    stemmed_articles = []
    for article in articles_no_stopwords:
        article_words = []
        # The article contains all the words. Separate them
        for word in article:
            # TODO
        # The next step expects a string. Let's concatenate the separate words using blanks.
        stemmed_articles.append(article_words)
    return stemmed_articles


# Obtengo las keywords
processed_articles = exercise_4(articles)
# Muestro las top-10 keywords
show_top_10_keywords(processed_articles)

EXERCISE 4...

TOP 10 words:
sol
amonest
jugador
final
titul
mexic
7
abr
notimex
.-

TOP 10 words:
comerci
“
”
ministr
alianz
mercosur
pacif
bloqu
encuentr
argentin

TOP 10 words:
“
”
mexic
radi
unam
univers
nacional
autonom
expresion
violenci

TOP 10 words:
mexic
benefici
mas
”
tlcan
econom
“
carstens
integracion
pued

TOP 10 words:
educacion
sitem
tabasc
trabaj
respons
sal
buen
2017
secretari
gobiern
