__author__ = "Christoph Hartleb"<br>
__copyright__ = "Copyright 2019"<br>
__credits__ = ["Christoph Hartleb"]<br>
__version__ = "1.0.1"<br>
__email__ = "christophhartleb@gmx.at"<br>
__status__ = "Production"

### Description of the project

This is a Script for Webscraping. Goal of that script is to demonstrate, that variety of possibilities of how to get data with python.

# Web Scraping with Beautiful Soup

This part of the script tries to get all "H"-entries out of an online dictionary and saves that entries into a *.csv file. The list is sorted alphabetically.

Data sorce: __https://www.computer-dictionary-online.org__

In [164]:
# Import requests for getting the URL and Beautifulsoup for webscraping.
import requests
from bs4 import BeautifulSoup

# Get the URL here.
r = requests.get("https://www.computer-dictionary-online.org/glossary/h.html")
# Get the content.
c = r.content

# Get the HTML element.
soup = BeautifulSoup(c,"html.parser")

In [165]:
# Find all list-elements in HTML.
all = soup.find_all("li")

In [166]:
# Create a list and iterate through all a-elements.
# Then get the text of that element and list it out
list = []
for item in all:
    list.append(str(item.find_all("a")[0].text))

In [167]:
# Import pandas and create namespace pd.
import pandas as pd

In [168]:
# Create a Pandas dataframe object.
df = pd.DataFrame(list)

In [169]:
# Creates a *.csv file in saves it within the scrapped data.
df.to_csv('h_terms.csv', index=False)

# ... with Scrapy

here moves the desription forward.

# ... with lxml


1) Parsing HTML<br>
In this example it is tried to get some financial data from _https://finance.yahoo.com/quote/APPL?p=APPL_ and save them into a dataframe. The shows how to parse HTML data.

2) Parsing XML<br>
Exmple number two shows how to parse XML data.

### 1) Parsing HTML

In [170]:
from lxml.html import parse
from urllib.request import urlopen

In [171]:
# Parse the data.
parsed = parse(urlopen('https://finance.yahoo.com/quote/APPL?p=APPL'))
parsed

<lxml.etree._ElementTree at 0x2ab8e4d3248>

In [172]:
# Return the HTML element object.
doc = parsed.getroot()
doc

<Element html at 0x2ab8d3a1c78>

In [173]:
# Get the objects that display "a" elements.
links = doc.findall('.//a')
links[15:30]

[<Element a at 0x2ab8e727a98>,
 <Element a at 0x2ab8e7279a8>,
 <Element a at 0x2ab8e727958>,
 <Element a at 0x2ab8e727a48>,
 <Element a at 0x2ab8e727ae8>,
 <Element a at 0x2ab8e7279f8>,
 <Element a at 0x2ab8e727b38>,
 <Element a at 0x2ab8e727b88>,
 <Element a at 0x2ab8e727c28>,
 <Element a at 0x2ab8e727bd8>,
 <Element a at 0x2ab8e727e08>,
 <Element a at 0x2ab8e727c78>]

In [175]:
# Get a link out of the list.
lnk = links[18]
lnk

<Element a at 0x2ab8e727a48>

In [176]:
# Give the URL back of the link.
lnk.get('href')

'/portfolios'

In [177]:
# Finally get the content back.
lnk.text_content()

'My Portfolio'

In [178]:
# Now a list will be created, that show all URLs conected to that site.
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]

['/watchlists',
 '/portfolios',
 '/screener',
 '/premium?ncid=navbarprem_fqbo1nu0ks0',
 '/calendar',
 '/industries',
 'https://money.yahoo.com',
 '/videos/',
 '/news/',
 '/tech']

In [179]:
# Now find the calls and puts data of the site.
tables = doc.findall('.//table')
calls = tables[1]
puts = tables[-68:]
calls
puts

[<Element table at 0x2ab8e72e638>, <Element table at 0x2ab8e72e598>]

In [180]:
# Find all header lines of the elements.
rows = calls.findall('.//tr')
rows

[<Element tr at 0x2ab8e72eb88>,
 <Element tr at 0x2ab8e72ec28>,
 <Element tr at 0x2ab8e72eea8>,
 <Element tr at 0x2ab8e72ed18>,
 <Element tr at 0x2ab8e72ee08>,
 <Element tr at 0x2ab8e72ee58>,
 <Element tr at 0x2ab8e72ef48>,
 <Element tr at 0x2ab8e72edb8>]

In [181]:
# Extract the data out of the headers.
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

In [182]:
_unpack(rows[6], kind='td')

['Average for Category', 'N/A']

In [183]:
_unpack(rows[0], kind='th')

[]

In [184]:
# Combine steps to get a dataframe. To do that first of all import libraries.
from pandas.io.parsers import TextParser

In [185]:
# Define a function for convertion of types automatically.
def parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()

In [186]:
# Call the function to get the Output.
call_data = parse_options_data(calls)
call_data

Unnamed: 0,0,1
0,Beta (5Y Monthly),
1,Yield,
2,5y Average Return,
3,Holdings Turnover,
4,Last Dividend,
5,Average for Category,
6,Inception Date,


### 2) Parsing XML

In [187]:
from lxml import objectify

In [188]:
path = ''
parsed = objectify.parse(open(path))
root = parsed.getroot()

TypeError: expected str, bytes or os.PathLike object, not Response

In [None]:
data = []
skip_fields = ['PARENT_SEQ', INDICATOR_SEQ, 'DESIRED_CHANGE', 'DECIMAL_PLACES']

In [None]:
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        continue
    el_data[child.tag] = child.pyval
data.append(el_data)

In [None]:
perf = DataFrame(data)

In [None]:
perf

In [None]:
from io import StringIO
tag = '<a href = "http://www.google.com">Google</a>'

In [None]:
root = objectify.parse(StringIO(tag)).getroot()

In [None]:
root

In [None]:
root.get('href')

In [None]:
root.text