# Environment setup

In [1]:
import datetime
import glob
import json
import os
import re
from typing import List, Dict
from urllib.parse import urljoin

from bs4 import BeautifulSoup, element

from config import Config
import sql_utils
import utils

# Functions

In [2]:
def get_title(headline: element.Tag):
    """Return the headline's title text, or ``None`` when it cannot be read.

    The title is the text of the first ``<a>`` element found inside
    *headline*, with surrounding whitespace removed.

    Args:
        headline: Tag wrapping a single headline.

    Returns:
        The stripped anchor text. ``None`` is returned when *headline* has
        no ``<a>`` element (or is not a tag at all); an empty anchor text is
        returned unchanged as ``''``.
    """
    try:
        title = headline.find("a").text
    except (AttributeError, TypeError):
        # ``find`` returned None (no anchor) or ``headline`` is not a tag.
        # Only these lookup failures mean "no title"; everything else,
        # including KeyboardInterrupt/SystemExit, is allowed to propagate.
        return None

    return title.strip() if title else title

In [3]:
def get_url(headline: element.Tag):
    """Return the absolute URL the headline links to, or ``None``.

    The ``href`` of the first ``<a>`` element inside *headline* is resolved
    against ``https://www.bbc.com``. Site-relative links (``/news/...``) get
    the BBC origin prepended; links that are already absolute are returned
    as they are instead of being glued onto the origin.

    Args:
        headline: Tag wrapping a single headline.

    Returns:
        The resolved URL, ``None`` when there is no anchor or the anchor has
        no ``href``, or the empty ``href`` itself when it is empty.
    """
    try:
        href = headline.find("a")['href']
    except (AttributeError, TypeError, KeyError):
        # No anchor (None is not subscriptable), no ``href`` attribute, or
        # ``headline`` is not a tag: there is no link to report.
        return None

    if not href:
        return href

    # urljoin keeps absolute hrefs intact and inserts the missing "/" for
    # hrefs that do not start with one.
    return urljoin('https://www.bbc.com', href)

In [4]:
def get_publish_date(headline: element.Tag):
    """Return the headline's publish-date text, or ``None`` if unavailable.

    The value is the raw text of the ``<span class="e16en2lz0">`` element
    inside *headline*; it is returned as found, not parsed into a date.

    Args:
        headline: Tag wrapping a single headline.

    Returns:
        The span text, or ``None`` when the span is missing or its text is
        longer than 20 characters.
    """
    # Longer texts are discarded: sometimes the wrong tag is picked.
    max_plausible_length = 20

    try:
        publish_date = headline.find('span', {'class': 'e16en2lz0'}).text
    except (AttributeError, TypeError):
        # No matching span (``find`` returned None) or ``headline`` is not a
        # tag. The date is optional, so report "unknown" rather than abort.
        return None

    if publish_date is None or len(publish_date) > max_plausible_length:
        return None

    return publish_date

In [5]:
def get_tickers(headline: element.Tag):
    """Return the tickers for *headline*; currently always an empty list.

    *headline* is accepted but not inspected. A new list object is returned
    on every call.
    """
    return []

In [6]:
def extract_headline_data(headline: element.Tag):
    """Build the record for a single headline.

    Args:
        headline: Tag wrapping a single headline.

    Returns:
        A dict with the keys ``title``, ``url``, ``publish_date``,
        ``tickers`` and ``countries``. ``countries`` is whatever
        ``utils.get_countries`` returns for the title.
    """
    # Read the title once: it is both stored and used for the country lookup.
    title = get_title(headline)

    return {
        'title': title,
        'url': get_url(headline),
        'publish_date': get_publish_date(headline),
        'tickers': get_tickers(headline),
        'countries': utils.get_countries(title),
    }

In [7]:
def get_headlines(soup: BeautifulSoup):
    """Return one extracted record per headline block found in *soup*.

    Headline blocks are the ``<div>`` elements selected by the promo-content
    class below; each is passed to ``extract_headline_data`` and the results
    are returned in the order ``find_all`` yields them.
    """
    promo_blocks = soup.find_all('div', {'class': 'ssrcss-tq7xfh-PromoContent exn3ah99'})

    return [extract_headline_data(block) for block in promo_blocks]

# Main

In [8]:
def extract(filepath: str)->dict:
    """Load a downloaded page file and return its parsed JSON content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as j:
        return json.load(j)

In [9]:
def transform(file_content: dict, filepath: str) -> List[Dict]:
    """Turn one downloaded page into a list of headline records.

    The page HTML is parsed, every headline is extracted with
    ``get_headlines``, and each record is tagged with where it came from.

    Args:
        file_content: Parsed download file; the keys ``html``, ``url`` and
            ``downloaded_datetime`` are read.
        filepath: Path the content was loaded from; only its file name is
            stored.

    Returns:
        The headline records, each extended with ``news_outlet`` (host part
        of ``file_content['url']``), ``ref_filename`` and
        ``downloaded_datetime``.

    Raises:
        KeyError: If ``file_content`` lacks ``html`` or
            ``downloaded_datetime`` (or ``url`` when there are headlines).
        ValueError: If there are headlines but no host can be read from
            ``file_content['url']``.
    """
    soup = BeautifulSoup(file_content['html'], 'html.parser')
    news = get_headlines(soup)

    # basename also copes with OS-specific path separators.
    ref_filename = os.path.basename(filepath)
    downloaded_datetime = file_content['downloaded_datetime']

    if not news:
        return news

    # The outlet is the same for every row, so derive it once.
    outlet_match = re.search('https?://([A-Za-z_0-9.-]+).*', file_content['url'])
    if outlet_match is None:
        raise ValueError(
            f"Cannot derive news outlet from url: {file_content['url']!r}"
        )
    news_outlet = outlet_match.group(1)

    for item in news:
        item['news_outlet'] = news_outlet
        item['ref_filename'] = ref_filename
        item['downloaded_datetime'] = downloaded_datetime

    return news

In [10]:
def validate_extract(data: List[Dict], schema: dict)->bool:
    """Check that every row has all schema columns and a title and url.

    Args:
        data: Rows to validate.
        schema: Mapping whose keys are the column names every row must
            contain; its values are not inspected here.

    Returns:
        ``True`` when every row contains every schema column and has a
        ``title`` and ``url`` that are neither ``None`` nor ``''`` (an empty
        ``data`` is valid). ``False`` at the first row that fails; the name
        of a missing column is printed before returning.
    """
    non_nullable = ('title', 'url')

    for row in data:

        # Check if all columns appear
        for column in schema:
            if column not in row:
                print(column)
                return False

        # Check for nonnullability. ``get`` treats an absent field like an
        # empty one, so a row is rejected instead of raising KeyError.
        for field in non_nullable:
            if row.get(field) in (None, ''):
                return False

    return True

In [11]:
def load(data: List[Dict]) -> None:
    """Insert the transformed rows into the news table.

    Args:
        data: Headline records as returned by ``transform``; passed
            unchanged to ``sql_utils.insert_data_into_db_news_table``.
    """
    sql_utils.insert_data_into_db_news_table(data)

In [13]:
def main():
    """Run the BBC ETL over every JSON file downloaded today.

    The input folder is the configured BBC download path with today's date
    (``YYYYMMDD``) appended. Each ``*.json`` file in it is extracted,
    transformed and validated against ``Config.STAGING_SCHEMA``; valid
    files are loaded, invalid ones are reported and skipped.
    """
    # Get files to process
    today = datetime.datetime.now().strftime('%Y%m%d')
    etl_consumption_path = Config.NEWS_SETTINGS['BBC']['DOWNLOAD_PATH'] + today
    filepaths = glob.glob(os.path.join(etl_consumption_path, "*.json"))

    # Process every downloaded file
    for filepath in filepaths:
        print(f'ETL: {filepath}')

        file_content = extract(filepath)
        data = transform(file_content, filepath)

        if validate_extract(data, Config.STAGING_SCHEMA):
            load(data)
        else:
            # Make skipped files visible instead of dropping them silently.
            print(f'ETL: skipped {filepath} (failed validation)')

In [14]:
# Entry point: run the ETL only when this module is executed directly.
if __name__ == '__main__':
    main()

ETL: downloads/bbc/20231106/20231106_185003_219852.json
ETL: downloads/bbc/20231106/20231106_184939_580158.json
ETL: downloads/bbc/20231106/20231106_184927_648007.json
ETL: downloads/bbc/20231106/20231106_184951_725009.json
ETL: downloads/bbc/20231106/20231106_184945_706346.json
ETL: downloads/bbc/20231106/20231106_184957_434518.json
ETL: downloads/bbc/20231106/20231106_184933_321954.json
