# Fuentes de datos de noticias

1. La República
2. Hydrocarbons Colombia
3. Portafolio 

> Para descargar nuevos datos, basta con ejecutar todas las celdas en orden; esto actualiza la base de datos final.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
from dateutil import parser

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import pandas as pd
from datetime import datetime
import re
from deep_translator import GoogleTranslator

# Spanish three-letter month abbreviations paired, position by position, with
# the English abbreviations that are later parsed with the `%b` date directive.
_MESES_ES = (
    'ene', 'feb', 'mar', 'abr', 'may', 'jun',
    'jul', 'ago', 'sep', 'oct', 'nov', 'dic',
)
_MONTHS_EN = (
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
)

meses_mapping = dict(zip(_MESES_ES, _MONTHS_EN))

In [2]:
# Open the La República search results for "ecopetrol" in Chrome, expand the
# result list by clicking "VER MÁS" for a bounded amount of time, and keep the
# rendered page source in `html`.
options = Options()
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

max_duration_seconds = 10

try:
    driver.get("https://www.larepublica.co/buscar?term=ecopetrol")

    # Fixed pause before interacting with the page (presumably to let it finish
    # loading -- TODO confirm whether an explicit wait would be enough).
    time.sleep(10)

    start_time = time.time()
    try:
        while time.time() - start_time < max_duration_seconds:
            # Locate the "VER MÁS" button
            ver_mas_button = driver.find_element(By.CLASS_NAME, "btn.analisisSect")
            # Reading this property scrolls the button into view as a side effect
            ver_mas_button.location_once_scrolled_into_view
            time.sleep(2)
            ver_mas_button.click()
    except Exception as e:
        # Best effort: whatever stops the clicking (button gone, click
        # intercepted, ...) we still keep what has been loaded so far.
        print(f"No se pudo hacer clic en el botón 'VER MÁS': {e}")

    html = driver.page_source
finally:
    # Always close the browser, even when loading the page or reading its
    # source fails; otherwise the detached Chrome process is left running.
    driver.quit()

# Parse the rendered search page and build `data1` with one row per result.
html = BeautifulSoup(html, "html.parser")

# Every search hit is an <a class="result"> inside <div class="result-list">
result_container = html.find("div", class_="result-list")
articles = result_container.find_all('a', class_='result')

# (title, relative url, date text) for each hit, in page order
result_rows = [
    (
        anchor.find('h3').text,
        anchor['href'],
        anchor.find('span', class_='date').text,
    )
    for anchor in articles
]

titles = [row[0] for row in result_rows]
links = [row[1] for row in result_rows]
dates = [row[2] for row in result_rows]

data1 = pd.DataFrame({"Date": dates, "Title": titles, "Link": links})

# The hrefs are prefixed with the site root to obtain full article URLs
site_root = "https://www.larepublica.co"
data1["Link"] = site_root + data1["Link"]

# Download every article page and extract its lead and body text.
# Exactly one entry is appended to `leads` and `texts` per link, so both lists
# always have the same length as `data1` and can be assigned as columns.
texts = []
leads = []

for l in data1["Link"]:
    # Placeholders used when the article cannot be downloaded or does not have
    # the expected structure ("na" lead, "nana" article). Rows holding these
    # values are dropped when the final dataset is assembled.
    lead = "na"
    new = "na"
    try:
        # The context manager closes the HTTP response once it has been parsed
        with urlopen(Request(url=l)) as response:
            article_html = BeautifulSoup(response, "html.parser")
    except OSError as e:
        # HTTPError and URLError are both OSError subclasses, so this covers
        # HTTP status errors as well as connection failures.
        print(f"No se pudo descargar {l}: {e}")
    else:
        lead_element = article_html.find("div", class_="lead")
        body_element = article_html.find("div", class_="html-content")
        if lead_element is not None and body_element is not None:
            lead = lead_element.text.replace('\n', ' ')
            new = body_element.text.replace('\n', ' ')

    text = lead + new
    texts.append(text)
    leads.append(lead)
    # Pause between requests
    time.sleep(1)

data1["Headline"] = leads
data1["Article"] = texts


# Initial clean-up: remove "[...]" fragments, turn newlines into spaces and
# trim surrounding whitespace in every text column.
# The patterns are regular expressions, so regex=True is passed explicitly:
# pandas 2.0 changed the default of `regex` to False, which would make
# `str.replace` look for these patterns as literal text and clean nothing.
for column in ("Article", "Title", "Headline"):
    cleaned = data1[column].str.replace(r'\[[^\]]*\]', '', regex=True)
    cleaned = cleaned.str.replace(r'\n', ' ', regex=True)
    data1[column] = cleaned.str.strip()


data1["Source"] = "La República"

# Remove dots from the date text; regex=False because "." must be a literal dot
data1['Date'] = data1['Date'].str.replace('.', '', regex=False)

# Replace Spanish month abbreviations with English ones, token by token.
# The lookup lower-cases the token so that capitalised abbreviations match too.
data1['Date'] = data1['Date'].apply(
    lambda x: ' '.join(meses_mapping.get(token.lower(), token) for token in x.split())
)
# Dates that do not follow the "<Mon> <day>, <year>" layout become NaT
data1['Date'] = pd.to_datetime(data1['Date'], format='%b %d, %Y', errors='coerce')

# Columns whose text is translated from Spanish to English
columns_to_translate = ["Title", "Headline"]

# A single translator instance is reused for every value
translator = GoogleTranslator(source='es', target='en')


def _translate_or_keep(value):
    """Return the English translation of *value*, or *value* itself on failure.

    Translation is best effort: a failure on one string is reported and that
    string is left untranslated, instead of discarding the translations of the
    whole column.
    """
    try:
        return translator.translate(value)
    except Exception as e:
        print(f"No se pudo traducir el texto {value!r}: {e}")
        return value


for column in columns_to_translate:
    data1[column] = data1[column].apply(_translate_or_keep)

# Merge the freshly scraped La República rows into the stored dataset and
# write the result back to the same file. Stored rows come first in the
# concatenation, so for a repeated link the stored row is the one kept
# (drop_duplicates keeps the first occurrence by default).
dataset_path = "./Datos/Datos_analisis_EC_traducidos.csv"
dataset_columns = ["Date","Title","Link","Headline","Article","Source"]

old_data1 = pd.read_csv(dataset_path)[dataset_columns]

combined_data1 = pd.concat([old_data1, data1], ignore_index=True)
new_data1 = combined_data1.drop_duplicates(subset=["Link"])[dataset_columns]

new_data1.to_csv(dataset_path)

In [3]:
# Hydrocarbons Colombia
# Walk the search-result pages for "ECOPETROL" and collect, for every listed
# article, its title, link, date text and snippet. "nana" marks a missing
# title/link, snippet or date container.

pagsrange = [str(numero) for numero in range(1, 4)]

titles = []
links = []
dates = []
texts = []

for pag in pagsrange:
    url = "https://hydrocarbonscolombia.com/page/" + pag + "/?s=ECOPETROL&start_date&end_date&cate&usefulness"
    try:
        # The context manager closes the HTTP response once it has been parsed
        with urlopen(Request(url=url)) as response:
            html = BeautifulSoup(response, "html.parser")
    except OSError as e:
        # HTTPError and URLError are both OSError subclasses; a page that
        # cannot be downloaded is reported and skipped.
        print(f"No se pudo descargar la página {url}: {e}")
        continue

    articles = html.find_all("div", class_="item")

    for article in articles:
        try:
            # Items that also carry the "active" class are not collected
            if 'active' in article['class']:
                continue

            title_element = article.find('h4')
            if title_element is not None:
                title = title_element.a.text.strip()
                link = title_element.a['href']
            else:
                title = "nana"
                link = "nana"

            # The snippet is optional: fall back to the placeholder instead of
            # silently reusing the text of the previous article.
            text_element = article.find('div', class_='newsnippet')
            if text_element is not None and text_element.p is not None:
                text = text_element.p.text.strip()
            else:
                text = "nana"

            date_element = article.find('div', class_='col-md-8')
            if date_element is not None:
                date = date_element.span.text.strip()
            else:
                date = "nana"
        except (AttributeError, KeyError, TypeError) as e:
            # Unexpected markup (e.g. a heading without a link): skip this
            # article, but say so instead of swallowing the error.
            print(f"Artículo omitido por estructura inesperada: {e}")
            continue

        titles.append(title)
        links.append(link)
        dates.append(date)
        texts.append(text)

    # Pause between page requests. No request is made per article, so the
    # delay belongs here rather than inside the article loop.
    time.sleep(2)

# The snippet is the only text available, so it fills both text columns
data2 = pd.DataFrame({"Date": dates, "Title": titles, "Link": links,"Headline": texts, "Article": texts})

# Initial clean-up: remove "[...]" fragments, turn newlines into spaces and
# trim surrounding whitespace in every text column.
# The patterns are regular expressions, so regex=True is passed explicitly:
# pandas 2.0 changed the default of `regex` to False, which would make
# `str.replace` look for these patterns as literal text and clean nothing.
for column in ("Article", "Headline", "Title"):
    cleaned = data2[column].str.replace(r'\[[^\]]*\]', '', regex=True)
    cleaned = cleaned.str.replace(r'\n', ' ', regex=True)
    data2[column] = cleaned.str.strip()

data2["Source"] = "Hydrocarbons"


def _parse_date_or_nat(value):
    """Parse a free-form date string; return NaT when it holds no usable date.

    Missing values and strings the parser rejects (such as the "nana"
    placeholder) yield NaT instead of raising.
    """
    if pd.isnull(value):
        return pd.NaT
    try:
        return parser.parse(value, fuzzy=True)
    except (ValueError, OverflowError):
        # dateutil's ParserError is a ValueError subclass
        return pd.NaT


data2['Date'] = data2['Date'].apply(_parse_date_or_nat)

# Merge the freshly scraped Hydrocarbons rows into the stored dataset and
# write the result back to the same file. Stored rows come first in the
# concatenation, so for a repeated link the stored row is the one kept.
hydrocarbons_path = "./Datos/EC Hydrocarbons.csv"

old_data2 = pd.read_csv(hydrocarbons_path)
# Earlier runs saved the DataFrame index as an unnamed column, which comes
# back as "Unnamed: 0", "Unnamed: 0.1", ... on every read; drop those columns
# so they stop piling up.
old_data2 = old_data2.loc[:, ~old_data2.columns.str.startswith("Unnamed")]

new_data2 = pd.concat([old_data2,data2], ignore_index=True).drop_duplicates(subset=["Link"])

# index=False: the positional index carries no information and is what
# produced the unnamed columns in the first place.
new_data2.to_csv(hydrocarbons_path, index=False)

In [4]:
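# Portafolio (scraper disabled: everything below in this cell is commented out)
# NOTE(review): the URLs and variable names below use "data3" (e.g. www.data3.co),
# which looks like an accidental search-and-replace -- confirm the real
# Portafolio domain before re-enabling this cell.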
#pagsrange = [str(numero) for numero in range(1, 10)] 
#
#titles = []
#links = []
#dates = []
#texts = []
#leads = []
#
#for pag in pagsrange:
#    url = "https://www.data3.co/buscar?q=ecopetrol&page=" + pag
#    try:
#        req = Request(url=url)
#        response = urlopen(req)
#        html = BeautifulSoup(response, "html.parser")
#        articles = html.find_all("div", class_="listing")
#        for article in articles:
#            try: 
#                title = article.find("h3").text.strip() if article else "nana"
#                link = "https://www.data3.co" + article.a['href'] if article else "nana"
#                lead = article.find('div',class_='listing-epigraph').text.strip() if article else "nana"
#                lead = lead.replace('\n', ' ')
#                titles.append(title)
#                links.append(link)
#                leads.append(lead)
#                time.sleep(1)
#            except:
#                titles.append("nana")
#                links.append("nana")
#                leads.append("nana")
#                
#    except HTTPError as e:
#        if e.code == 404:
#            continue
#        else:
#            continue
#
#for l in links:
#    req = Request(url=l)
#    try:
#        response = urlopen(req)
#        html = BeautifulSoup(response, "html.parser")
#        try: 
#
#            new = html.find("div", class_="article-content")
#            # Eliminar los textos relacionados a los enlaces
#            for link in new.find_all('a'):
#                link.extract()
#            new = new.text.replace('\n', ' ') if new else "nana"
#
#            text = lead + new
#
#            date = html.find("div", class_="cat-fecha")
#            date = date.find("p", class_='date-time').text if date else "nana"
#            
#
#            dates.append(date)
#            texts.append(text)
#            #leads.append(lead)
#            time.sleep(2)
#        except:
#            dates.append("nana")
#            texts.append("nana")
#    except HTTPError as e:
#        if e.code == 404:
#            continue
#        else:
#            continue   
#    #time.sleep(1)
#
#data3 = pd.DataFrame({"Date": dates, "Title": titles, "Link": links, "Headline": leads, "Article": texts})
#
#data3["Article"] = data3["Headline"] + " " + data3["Article"]
#
## Limpieza inicial
#data3['Article'] = data3['Article'].str.replace(r'\[[^\]]*\]', '') 
#data3['Article'] = data3['Article'].str.replace(r'\n', ' ')  
#data3['Article'] = data3['Article'].str.strip()  
#data3['Headline'] = data3['Headline'].str.replace(r'\[[^\]]*\]', '') 
#data3['Headline'] = data3['Headline'].str.replace(r'\n', ' ')  
#data3['Headline'] = data3['Headline'].str.strip()  
#data3['Title'] = data3['Title'].str.replace(r'\[[^\]]*\]', '') 
#data3['Title'] = data3['Title'].str.replace(r'\n', ' ')  
#data3['Title'] = data3['Title'].str.strip()  
#
#data3["Source"] = "Portafolio"
#
#data3["Date"] = data3["Date"].str.split(' - ').str[0]
#data3["Date"] = data3["Date"].apply(lambda x: ' '.join([meses_mapping[mes] if mes in meses_mapping else mes for mes in x.split()]))
#data3["Date"] = pd.to_datetime(data3["Date"], format='%d %b %Y', errors="ignore")
#
#def convert_date(date):
#    try:
#        date_object = datetime.strptime(date, '%d %b %Y')
#        return date_object.strftime('%Y-%m-%d')
#    except ValueError:
#        return date
#
## Aplicar la función a la columna 'Date'
#data3['Date'] = data3['Date'].apply(convert_date)
#
##data3.to_csv("./Portafolio.csv", index=False)
#
#old_data3 = pd.read_csv("./Datos/EC Portafolio.csv")
#old_data3['Date'] = old_data3['Date'].apply(convert_date)
#
#old_data3 = old_data3[["Date","Title","Link","Headline","Article","Source"]]
#new_data3 = pd.concat([old_data3,data3], ignore_index=True).drop_duplicates(subset=["Link"])
#new_data3 = new_data3[["Date","Title","Link","Headline","Article","Source"]]
#new_data3.to_csv("./Datos/EC Portafolio.csv") 

In [5]:
# Joining dataframes
# Combine the stored final dataset with the La República and Hydrocarbons
# frames, keep one row per link, drop rows holding placeholders or missing
# values, and write the result back to the final CSV.
final_path = "./Datos/Datos_analisis_EC_traducidos.csv"
final_columns = ["Date","Title","Link","Headline","Article","Source"]


def _drop_placeholder_rows(frame):
    """Blank out 'na'/'nana' cells and drop every row left with a missing value."""
    keep_cell = (frame != 'na') & (frame != 'nana')
    return frame[keep_cell].dropna()


old_final_data = pd.read_csv(final_path)[final_columns]

# Dates written as YYYY-MM-DD become timestamps; any other value is left as is
converted_dates = pd.to_datetime(old_final_data["Date"], format='%Y-%m-%d', errors='coerce')
mask = converted_dates.notnull()
old_final_data.loc[mask, "Date"] = converted_dates[mask]
old_final_data = _drop_placeholder_rows(old_final_data)

stacked_data = pd.concat([old_final_data,new_data1,new_data2], ignore_index=True)
new_final_data = stacked_data.drop_duplicates(subset=["Link"])[final_columns]
new_final_data = _drop_placeholder_rows(new_final_data)

# Same date normalisation, applied to the combined frame
converted_dates = pd.to_datetime(new_final_data["Date"], format='%Y-%m-%d', errors='coerce')
mask = converted_dates.notnull()
new_final_data.loc[mask, "Date"] = converted_dates[mask]

new_final_data.to_csv(final_path)