# urllib - a basic built in library of Python which can extract html code from a url

# BeautifulSoup - very useful and easy to use free third party library for Python. 
## Features:
- analysing html code from a url
- easy navigation through the various tags of the html
- identifying specific tags by their respective attributes
- Iteration through bundles of html elements, returned by BeautifulSoup functions as sets
- Manipulation of the html Code


In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://www.univie.ac.at'
BS = BeautifulSoup(urlopen(url),"lxml")
print(BS)

<!DOCTYPE html>
<html lang="de">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
<meta charset="utf-8"/>
<!-- 
	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
<title>Universität Wien</title>
<meta content="TYPO3 CMS" name="generator"/>
<link href="/typo3temp/stylesheet_b46be1037f.css?1484304200" media="all" rel="stylesheet" type="text/css"/>
<link href="/fileadmin/templates/Startseite/assets/part1.css?1429707651" media="all" rel="stylesheet" type="text/css"/>
<link href="/fileadmin/templates/Startseite/assets/part2.css?1458029963" media="all" rel="stylesheet" type="text/css"/>
<link href="/fileadmin/templates/Startseite/assets/icons/style.css?143

In [8]:
print(BS.a)

<a data-toggle="tab" href="#">Webseiten</a>


In [15]:
BS_all_a_tags = BS.findAll("a")
type(BS_all_a_tags)
print(BS_all_a_tags[0])

<a data-toggle="tab" href="#">Webseiten</a>


## Tags can be found by their specific attributes, regex patters are often helpful

In [16]:
import re
BS_tag_by_attribute = BS.find("link",attrs={"href":re.compile(".*part1.*")})
print(BS_tag_by_attribute)

<link href="/fileadmin/templates/Startseite/assets/part1.css?1429707651" media="all" rel="stylesheet" type="text/css"/>


## Case Example: open a wikipediasite, store the tags of internal article links in one list:

In [22]:
html = urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon")
BS = BeautifulSoup(html,"lxml")
articlelinks = BS.find("div", attrs={"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))
print(articlelinks[0])

<a class="mw-disambig" href="/wiki/Kevin_Bacon_(disambiguation)" title="Kevin Bacon (disambiguation)">Kevin Bacon (disambiguation)</a>


## print the content of one tag:

In [23]:
print(articlelinks[0]["href"])

/wiki/Kevin_Bacon_(disambiguation)


<h2><b>Example of a Wikipedia Webcrawler</b><h2><br>
<h3><i>Non-registered users editing pages are shown by their ip-Adress<br>
A recursive crawler can iterate through article links, open up the edit pages,<br>
and receive the IP-Adresses contained within the html code.</i></h3>


## Parsing JSON in Order to receive location of IPs:

- Online IP location service: http://freegeoip.net/json/"relevantIP"<br>
- json library for Python can easily parse JSON-Objects results</h3>

In [27]:
import json
from urllib.request import urlopen

def getCountry(ipAddress):
	response = urlopen("http://freegeoip.net/json/"+ipAddress).read().decode('utf-8')
	responseJson = json.loads(response)
	return responseJson.get("country_code")
print(getCountry("50.78.253.58"))


US


In [30]:
import json
jsonString = """{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],"arrayOfFruits":
[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}"""
jsonObj = json.loads(jsonString)
print(jsonObj.get("arrayOfNums"))
print(jsonObj.get("arrayOfNums")[1])
print(jsonObj.get("arrayOfNums")[1].get("number")+jsonObj.get("arrayOfNums")[2].get("number"))  
print(jsonObj.get("arrayOfFruits")[2].get("fruit"))


[{'number': 0}, {'number': 1}, {'number': 2}]
{'number': 1}
3
pear


# Using web-APIs with python 
## several good examples of web-APIs that can be invoked through BeautifulSoup include:

### Echonest.com:  search for musicians and browse their songs and albums

### Twitter API: Twitter offers a service for the owners of a Twitter account to browse their tweets with all the attached information 

### Googles GEOCode API: enter an Adress and receive information about the geographic latitutde/longitude position of that adress, timezone informations, elevation above sealevel and many other geographic variables<br>

### Google Analytics: popular for tracking Users of a webpage </h3>

<h2>Pythons built-in urllib can take care of the automatic authentication for an API if needed:<h2>

In [None]:
token = "<your api key>"
webRequest = urllib.request.Request("http://myapi.com", headers={"token":token})
html = urlopen(webRequest)

# In combination with BeautifulSoup, the built-in urllib can be used to download data using the "urlretrieve" function, for example a .jpg-File:

In [31]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html)
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
urlretrieve(imageLocation, "logo.jpg")




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


('logo.jpg', <http.client.HTTPMessage at 0x684f7f0>)

# Warning of the editors of the python webscraping book from O'Reilly:<h2>
## Downloading random content from the web can be a dangerous idea
## Recommendations:
## - instead of downloading data urls can be stored, and later on it can be decided whether the contained data is worth downloading, which also prevents servers to block you for excessive scraping
## - of course, this may be a bad idea because urls can change, and also downloading data may make the scraper appear like a normal user!

# Using .csv files with Python:
## Python has a built-in library to handle .csv files
## If .csv Files are to be read directly rather than downloaded to the hard disk, a Stream object must be employed

In [46]:
from urllib.request import urlopen
from io import StringIO
import csv
data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
l = []
for row in csvReader:
    l.append(row)
print(l[0:4])


[['Name', 'Year'], ["Monty Python's Flying Circus", '1970'], ['Another Monty Python Record', '1971'], ["Monty Python's Previous Record", '1972']]


# Reading and sending emails with Python
## the SMTP library smtplib takes care of that

# Textfiles and encoding
## the built-in urllib can retrieve Textfiles and give them a proper encoding:

In [49]:
from urllib.request import urlopen
textPage = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt")
print(str(textPage.read(), "utf-8")[0:10])


ЧАСТЬ ПЕРВ


## BeautifulSoup also has encoding Options included for readig documents:

In [54]:
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html,"lxml")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
content = bytes(content, "UTF-8")
content = content.decode("UTF-8")
print(content[0:10])



Python





# Using MySQL with Python
## The following snippet opens an extant database and executes SQL-commands via the third-party library "PyMySQL", assuming we have established the "scraping" data-base that stores webpages, for example wikipedia article links:

In [None]:
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd=None, db='mysql')
cur = conn.cursor()
cur.execute("USE scraping")
cur.execute("SELECT * FROM pages WHERE id=1")
print(cur.fetchone())
cur.close()
conn.close()


   # Problems with delayed webcontent
   ## Ajax may load the content of a webpage after two seconds, for example
   ## The third-party library "Selenium" can help in these situations
   ## It has some similar functionalities to BeautifulSoup
   ## In this example, a webdriver called "PhantomJS" is used by the library
   ## PhantomJS is a "headless" webbrowser, must be installed for this piece of code
   

In [None]:
from selenium import webdriver
import time
driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()


## With the help of Selenium, webpage redirects can be recognized, by validating the existence of a certain element persistently for a period of time:

In [None]:
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException

def waitForLoad(driver):
	elem = driver.find_element_by_tag_name("html")# looking for the relevant element
	count = 0
	while True:
		count += 1
		if count > 20:
			print("Timing out after 10 seconds and returning")
			return
		time.sleep(.5)
		try:
			elem == driver.find_element_by_tag_name("html") # testing persistently for the element
		except StaleElementReferenceException:
			return
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)


## Selenium is also useful to wait for the loading of an element: 

In [None]:
driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
	element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
	print(driver.find_element_by_id("content").text)
	driver.close()


# Using a tor browser with Python
## This snippet visits a website to confirm your ip. Using tor via the port 9150, it should differ from your real IP:

In [None]:
Dieses Script sollte über die Seite Icanhas... eine IP anzeigen die nicht die eigene ist.
import socks
import socket
from urllib.request import urlopen
socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
socket.socket = socks.socksocket
print(urlopen('http://icanhazip.com').read())


## Selenium via PhantomJS can deliver the same result:

In [None]:
from selenium import webdriver
service_args = [ '--proxy=localhost:9150', '--proxy-type=socks5', ]
driver = webdriver.PhantomJS(executable_path='<path to PhantomJS>',
service_args=service_args)
driver.get("http://icanhazip.com")
print(driver.page_source)
driver.close()
