functional_sprott_scraper.py
from bs4 import BeautifulSoup
import pendulum
from datetime import timedelta
from airflow import DAG
from airflow.decorators import task

# Default args applied to every task created in this DAG
args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

with DAG(
    dag_id='Functional_Sprott_Scraper',
    schedule_interval='5 20 * * 1-6',
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    default_args=args,
    render_template_as_native_obj=True,
    tags=['price', 'scraper']
) as dag:

    def web_call(url):  # Extract
        import requests
        r = requests.get(url)
        if r.status_code == 200:
            soup: BeautifulSoup = BeautifulSoup(r.content, "html.parser")
            return soup
        else:
            # On a non-200 response the status code is returned instead of soup,
            # so the downstream parsing step (and therefore the task) will fail.
            return r.status_code

    def get_fund_values(soup, index, class_name):  # Transform
        # Take the element at `index` among divs with the given class and
        # strip the currency prefix and thousands separators.
        fund_values = soup.find_all('div', class_=class_name)
        value = fund_values[index].contents
        return str(value[0]).strip().replace('$US', '').replace(',', '')

    def write_json(data, filename='data.json'):  # Load
        import json
        with open(filename, 'w') as f:
            json.dump(data, f, indent=4)

    @task()
    def execute_scraper():
        # Scrape the Sprott uranium fund page and persist the values to JSON
        soup = web_call(
            url='https://sprott.com/investment-strategies/physical-commodity-funds/uranium/')
        data = {}
        data['shareprice'] = get_fund_values(soup, 4, 'fundHeader_value')
        data['u3o8_stock'] = get_fund_values(soup, 6, 'fundHeader_value')
        write_json(data)

    execute_scraper()
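
# --- Hypothetical local smoke test (not part of the original DAG) ---
# A minimal sketch for exercising the extract/transform helpers outside of
# Airflow. It assumes the Sprott page still exposes 'fundHeader_value' divs
# at indices 4 and 6; adjust those if the page layout changes.
if __name__ == "__main__":
    soup = web_call(
        url='https://sprott.com/investment-strategies/physical-commodity-funds/uranium/')
    if isinstance(soup, BeautifulSoup):
        print("shareprice:", get_fund_values(soup, 4, 'fundHeader_value'))
        print("u3o8_stock:", get_fund_values(soup, 6, 'fundHeader_value'))
    else:
        print(f"Request failed with status code {soup}")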