In [None]:
# Importing required libraries

import time

from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By

### Connection to MongoDB and browser simulation setup

In [None]:
# Open a client against the local MongoDB server and pick the target
# database ('news') and collection ('elespectador') for the scraped articles
client = MongoClient(host='localhost', port=27017)
db = client.get_database('news')
collection = db.get_collection('elespectador')

In [None]:
# Base URL of the site to be analyzed; the request helpers append a
# relative path (starting with '/') to it to build the full page address
SITE_URL = 'https://www.elespectador.com'

In [None]:
# Path to the Firefox web driver (geckodriver) executable
# Download the driver for your operating system here: https://github.com/mozilla/geckodriver/releases
# NOTE(review): the '.exe' suffix presumably targets Windows - adjust the path on other systems
DRIVER_PATH = './geckodriver.exe'

In [None]:
# Creating a new Firefox window. This single browser session is the one
# passed as the `browser` argument to the request helpers in this notebook.
# NOTE(review): the `executable_path` keyword belongs to the Selenium 3 API and
# was deprecated in Selenium 4 - confirm the installed Selenium version.
browser = webdriver.Firefox(executable_path = DRIVER_PATH)

In [None]:
def make_request(browser, relative_path):
    """Render a page of the site and return its fully loaded HTML.

    The page at ``SITE_URL + relative_path`` is opened in ``browser`` and
    scrolled to the bottom repeatedly, pausing 3 seconds after each scroll,
    until the document height stops changing (so lazy-loaded content has
    been requested).

    Args:
        browser: Selenium web driver used to render the page.
        relative_path: Path appended to ``SITE_URL``.

    Returns:
        A ``BeautifulSoup`` tree built from the rendered page source.
    """
    read_height = 'return document.body.scrollHeight;'
    jump_to_bottom = 'window.scrollTo(0, document.body.scrollHeight);'

    # Open the page in the browser so its scripts run
    browser.get(SITE_URL + relative_path)

    # Keep scrolling for as long as each scroll makes the document taller
    previous_height = browser.execute_script(read_height)
    page_grew = True
    while page_grew:
        browser.execute_script(jump_to_bottom)
        time.sleep(3)
        current_height = browser.execute_script(read_height)
        page_grew = current_height != previous_height
        previous_height = current_height

    # Hand the rendered HTML to BeautifulSoup for scraping
    rendered_html = browser.page_source
    return BeautifulSoup(rendered_html, 'html.parser')


In [None]:
# Variant of make_request used for the opinion/editorial listing, which
# needs a button to be clicked in addition to the scrolling
def make_request1(browser, relative_path, clicks=15, wait_seconds=30):
    """Render a page, scroll it to the end and click its button repeatedly.

    After the same scroll-until-stable loop used for regular pages, the
    first element matching ``div[class*="pure-button"]`` is clicked through
    JavaScript up to ``clicks`` times (presumably the control that loads
    more entries - verify against the live page). Clicking stops early when
    the button is no longer present instead of raising.

    Args:
        browser: Selenium web driver used to render the page.
        relative_path: Path appended to ``SITE_URL``.
        clicks: Maximum number of button clicks to simulate (default 15).
        wait_seconds: Seconds to wait after the clicks so the page can
            finish updating (default 30). Skipped when nothing was clicked.

    Returns:
        A ``BeautifulSoup`` tree built from the rendered page source.
    """
    # Making the request and rendering the browser
    browser.get(SITE_URL + relative_path)

    # Simulating vertical scrolling for handling lazy load
    check_height = browser.execute_script('return document.body.scrollHeight;')
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(3)
        height = browser.execute_script('return document.body.scrollHeight;')
        if height == check_height:
            break
        check_height = height

    # Simulate button clicks. find_elements(By.CSS_SELECTOR, ...) exists in
    # both Selenium 3 and 4 (find_element_by_css_selector was removed in
    # Selenium 4.3) and returns an empty list when the button is missing.
    clicked = False
    for _ in range(clicks):
        buttons = browser.find_elements(By.CSS_SELECTOR, 'div[class*="pure-button"]')
        if not buttons:
            break
        browser.execute_script("arguments[0].click();", buttons[0])
        clicked = True

    # Give the page time to settle after the clicks
    if clicked:
        time.sleep(wait_seconds)

    # Scroll back to the top of the page before reading the HTML
    browser.execute_script('window.scrollTo(0,-250);')

    # Getting HTML content and passing it to BeautifulSoup for scraping analysis
    return BeautifulSoup(browser.page_source, 'html.parser')


### Extract and organize the "El Espectador" information

In [None]:
def upload_news(category, pages=1):
    """Collect title and relative link of the news listed for a category.

    Archive pages ``/archivo/<category>/<n>/`` are rendered one by one
    (``n`` from 1 to ``pages``) and the cards found in the first two blocks
    of the ``Layout-flexAds`` section are turned into dictionaries.

    Each page contributes only its own cards, so fetching several pages
    does not produce repeated entries.

    Args:
        category: Category slug used in the archive URL (e.g. 'politica').
        pages: Number of archive pages to read (default 1).

    Returns:
        A list of dicts with the keys ``'title'`` and ``'relative_path'``.
        Pagination stops early if a page lacks the expected layout, and
        cards without a title link are skipped.
    """
    news = []
    for page in range(1, pages + 1):
        # Getting HTML content for news listing page
        soup = make_request(browser, '/archivo/' + category + '/' + str(page) + '/')

        # Finding the section where news are contained
        layout = soup.find(class_ = 'Layout-flexAds')
        section = layout.find('section') if layout is not None else None
        if section is None:
            # No listing on this page: nothing more to paginate through
            break

        # Getting blocks from layout; only the first two hold news cards
        blocks = section.find_all(recursive = False)
        for block in blocks[:2]:
            # Building the list with title and relative path of the news found
            for card in block.find_all(class_ = 'Card'):
                heading = card.find('h2', class_ = 'Card-Title')
                link = heading.find('a') if heading is not None else None
                if link is None or not link.has_attr('href'):
                    continue
                news.append({
                    'title': link.get_text(),
                    'relative_path': link['href']
                })

    return news



In [None]:
def _text_or_none(node):
    """Return the text of a BeautifulSoup node, or None when the node is missing."""
    return node.get_text() if node is not None else None


def _child_or_none(node, tag_name):
    """Return the first ``tag_name`` descendant of a node, or None when either is missing."""
    return node.find(tag_name) if node is not None else None


def order_information(news):
    """Complete every news entry with the metadata and text of its article.

    Each article page is rendered through ``make_request`` and the entry
    dict is updated in place with the keys ``'datetime'``, ``'author'``,
    ``'summary'`` and ``'full_text'``.

    Every field is looked up independently: a missing date or summary is
    stored as ``None`` and a missing author as ``'no author'``, so an
    article without author credit no longer raises and all entries end up
    with the same set of keys.

    Args:
        news: List of dicts holding at least the key ``'relative_path'``.

    Returns:
        The same ``news`` list, with its dicts updated in place.
    """
    for n in news:
        # Getting HTML content for news page
        soup = make_request(browser, n['relative_path'])

        # Extracting news metadata; must happen before the <a> tags are
        # removed below because the author name is read from a link
        n['datetime'] = _text_or_none(soup.find(class_ = 'ArticleHeader-Date'))
        author = _text_or_none(_child_or_none(soup.find(class_ = 'ACredit-Author'), 'a'))
        n['author'] = author if author is not None else 'no author'
        n['summary'] = _text_or_none(_child_or_none(soup.find(class_ = 'ArticleHeader-Hook'), 'div'))

        # Remove every <a> tag so link text is left out of the full text
        for link in soup.find_all('a'):
            link.extract()

        # Extracting and concatenating news full text
        paragraphs = soup.find_all(class_ = 'font--secondary')
        n['full_text'] = ' '.join(p.get_text() for p in paragraphs)

    return news


### Web scraping political category

In [None]:
# Collect title/link pairs from the 'politica' archive listing;
# the cell output shows how many entries were found
news = upload_news('politica')
len(news)

In [None]:
# Visit every collected article to add its metadata and full text, then
# store the whole batch in the MongoDB collection for further analysis.
# NOTE(review): pymongo's insert_many rejects an empty list and adds an '_id'
# to each dict, so re-running this cell on the same `news` may fail - confirm.
collection.insert_many(order_information(news))

### Web scraping editorial opinion category

In [None]:
news = []

# Render the editorial listing page (scrolling plus button clicks)
soup = make_request1(browser, '/opinion/editorial/')

# The news cards live inside the first <section> of the ads layout
layout = soup.find(class_ = 'Layout-flexAds')
section = layout.find('section')
blocks = section.find_all(recursive = False)

# Gather the cards of the first two blocks, keeping page order
first_block_cards = blocks[0].find_all(class_ = 'Card')
second_block_cards = blocks[1].find_all(class_ = 'Card')
cards = first_block_cards + second_block_cards

# Build the list with the title and relative path of the news found
for card in cards:
    title_link = card.find('h2', class_ = 'Card-Title').find('a')
    news.append({
        'title': title_link.get_text(),
        'relative_path': title_link['href']
    })

In [None]:
# Visit every collected editorial to add its metadata and full text, then
# store the whole batch in the MongoDB collection for further analysis
collection.insert_many(order_information(news))

### Web scraping economy category

In [None]:
# Collect title/link pairs from the 'economia' archive listing;
# the cell output shows how many entries were found
news = upload_news('economia')
len(news)

In [None]:
# Visit every collected economy article to add its metadata and full text,
# then store the whole batch in the MongoDB collection for further analysis
collection.insert_many(order_information(news))

### Web scraping Colombia category

In [None]:
# Collect title/link pairs from the 'colombia' archive listing;
# the cell output shows how many entries were found
news = upload_news('colombia')
len(news)

In [None]:
# Visit every collected Colombia article to add its metadata and full text,
# then store the whole batch in the MongoDB collection for further analysis
collection.insert_many(order_information(news))

### Web scraping category Judicial

In [None]:
# Collect title/link pairs from the 'judicial' archive listing;
# the cell output shows how many entries were found
news = upload_news('judicial')
len(news)

In [None]:
# Visit every collected judicial article to add its metadata and full text,
# then store the whole batch in the MongoDB collection for further analysis
collection.insert_many(order_information(news))