### Installing all necessary dependencies

In [None]:
!pip install sentence_transformers
!pip install feedparser

### Import the dependencies

In [None]:
import sys
import json
from sentence_transformers import SentenceTransformer, util
import feedparser

### Helper functions: feed parsing, cosine-similarity retrieval and result printing

In [3]:
def obtener_noticias_desde_fuentes(fuentes):
    """Download RSS feeds and collect the non-opinion entries with a usable description.

    Parameters
    ----------
    fuentes : iterable of (str, str)
        Pairs of ``(source name, feed URL)``.

    Returns
    -------
    list of tuple
        One ``(source name, title, published date, description)`` tuple per
        kept entry, in feed order. An entry is kept when its link does not
        contain "opinion" (case-insensitive) and its description has more
        than five whitespace-separated words. Title and date fall back to an
        empty string when the feed does not provide them.
    """
    noticias_descripciones = []
    for nombre_fuente, url_feed in fuentes:
        feed = feedparser.parse(url_feed)
        for entrada in feed.entries:
            # Read fields with .get() instead of attribute access: an entry
            # that lacks a field would otherwise raise AttributeError and
            # abort the whole crawl because of one incomplete item.
            enlace = entrada.get("link") or ""
            if "opinion" in enlace.lower():
                continue

            titulo = entrada.get("title") or ""
            descripcion = entrada.get("description") or ""
            fecha = entrada.get("published") or ""

            # A missing description counts as zero words and is skipped here.
            if len(descripcion.split()) > 5:
                noticias_descripciones.append((nombre_fuente, titulo, fecha, descripcion))
    return noticias_descripciones

In [21]:
def retrieve_articles(query, news_sources, articles=None, place=None, min_score=0.60):
    """Rank articles by cosine similarity to a query and keep the close matches.

    Parameters
    ----------
    query : str or list of str
        Search text, encoded with the notebook-level ``model``. Only the first
        row of the similarity matrix is read, so pass a single query.
    news_sources : list of str
        One text per article, in the same order as ``articles``.
    articles : sequence of tuple, optional
        ``(source, title, date, description)`` records aligned with
        ``news_sources``. When omitted, the notebook-level ``noticias`` is
        used, which keeps existing two-argument calls working.
    place : str, optional
        Label copied into every result. When omitted, the notebook-level
        ``lugar`` is used.
    min_score : float, optional
        Inclusive cosine-similarity threshold (default 0.60).

    Returns
    -------
    list of tuple
        ``(source, title, date, description, score, index, place)`` for every
        article whose score is at least ``min_score``, sorted by score in
        descending order. ``index`` is the article's position in ``articles``.

    Raises
    ------
    ValueError
        If ``articles`` and ``news_sources`` have different lengths; scores
        could not be matched to the right article in that case.
    """
    # Fall back to the notebook globals only when the caller did not pass the data.
    if articles is None:
        articles = noticias
    if place is None:
        place = lugar

    if len(articles) != len(news_sources):
        raise ValueError(
            f"articles ({len(articles)}) and news_sources ({len(news_sources)}) "
            "must have the same length"
        )
    # Nothing to rank: return before handing an empty batch to the encoder.
    if not news_sources:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)
    article_embeddings = model.encode(news_sources, convert_to_tensor=True)
    # Row 0 holds the similarity of the (single) query against every article.
    scores = util.cos_sim(query_embedding, article_embeddings)[0].tolist()

    results = [
        (article[0], article[1], article[2], article[3], score, index, place)
        for index, (article, score) in enumerate(zip(articles, scores))
        if score >= min_score
    ]
    results.sort(key=lambda result: result[4], reverse=True)

    return results

In [18]:
def imprimir_articulos(lista):
    """Print a numbered, human-readable report of retrieved articles.

    Parameters
    ----------
    lista : iterable of tuple
        Seven-item records ``(source, title, date, description, score,
        sixth_item, place)``. The sixth item is unpacked but not printed.

    Each article is followed by a separator line of 80 ``=`` characters.
    Article IDs start at 1 and follow the order of ``lista``.
    """
    separador = "=" * 80
    for numero, articulo in enumerate(lista, start=1):
        fuente, titulo, fecha, descripcion, score, _sin_uso, pais = articulo
        lineas = (
            f"Article ID: {numero}:",
            f"  Source: {fuente}",
            f"  Title: {titulo}",
            f"  Date: {fecha}",
            f"  Description: {descripcion}",
            f"  Cosine Similarity: {score}",
            f"  Place: {pais}",
            separador,
        )
        print("\n".join(lineas))

### Loading the Sentence Transformer model

In [None]:
# Sentence-embedding model that retrieve_articles uses to encode the query and the articles.
model = SentenceTransformer(model_name_or_path='paraphrase-multilingual-mpnet-base-v2')

### Search and retrieval of similar news

In [5]:
# Example search settings: edit the place and the key words to run a different query.
lugar = "México"
palabrasClave = "Poder Judicial"

# Text that gets embedded as the query: the place followed by the key words.
busqueda = " ".join([lugar, palabrasClave])

In [6]:
# RSS feeds to crawl, as (source name, feed URL) pairs, grouped by outlet:
# La Jornada, then the Reforma section feeds, then Expansión.
fuentes = (
    [("La Jornada", "https://www.jornada.com.mx/rss/edicion.xml?v=1")]
    + [
        ("Reforma", url_reforma)
        for url_reforma in (
            "https://www.reforma.com/rss/portada.xml",
            "https://www.reforma.com/rss/internacional.xml",
            "https://www.reforma.com/rss/cancha.xml",
            "https://www.reforma.com/rss/justicia.xml",
            "https://www.reforma.com/rss/ciudad.xml",
            "https://www.reforma.com/rss/negocios.xml",
            "https://www.reforma.com/rss/estados.xml",
            "https://www.reforma.com/rss/nacional.xml",
            "https://www.reforma.com/rss/ciencia.xml",
        )
    ]
    + [("Expansion", "https://expansion.mx/rss")]
)

In [7]:
# Download every configured feed and collect its articles as
# (source, title, published date, description) tuples.
noticias = obtener_noticias_desde_fuentes(fuentes)

In [22]:
# Compare the search text against "title description" for every collected article.
sentences1 = [busqueda]
sentences2 = [" ".join((noticia[1], noticia[3])) for noticia in noticias]

# Keep only the articles whose cosine similarity is at least 0.60.
filtered_results = retrieve_articles(sentences1, sentences2)

In [None]:
# Print every retrieved article, best match first
imprimir_articulos(filtered_results)