# Networking
- 'socket' module has low level networking
  - unpleasant to use
  - you will probably never need it, as almost everything is HTTP
- 'urllib' will retrieve HTTP content, as bytes
- 'urllib.request.urlopen' will return an object similar to a file, with a few extra methods
    - can iterate over the lines
    - grab entire page as one bytes object
    - read one byte at a time
- [doc](https://docs.python.org/3.5/library/urllib.request.html#module-urllib.request)

In [3]:
import urllib.request

# a URL whose host name cannot be resolved
bad = 'http://columbaxy123.edu'

# urlopen raises URLError here (see the error output below),
# so the headers line is never reached
cur = urllib.request.urlopen(bad)
cur.headers.items()

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [4]:
# a URL that can be opened successfully
cu = 'http://columbia.edu'

# urlopen returns an http.client.HTTPResponse object
# that represents the network connection
cur = urllib.request.urlopen(cu)
cur

<http.client.HTTPResponse at 0x1ae3437fa90>

In [5]:
# response headers sent by the server,
# listed as (name, value) pairs

cur.headers.items()

[('Date', 'Wed, 26 Sep 2018 22:55:11 GMT'),
 ('Server', 'Apache'),
 ('Cache-Control', 'max-age=300, public'),
 ('Content-language', 'en'),
 ('X-XSS-Protection', '1; mode=block'),
 ('X-Frame-Options', 'SAMEORIGIN'),
 ('X-Content-Type-Options', 'nosniff'),
 ('Expires', 'Sun, 19 Nov 1978 05:00:00 GMT'),
 ('Last-Modified', 'Wed, 26 Sep 2018 22:55:11 GMT'),
 ('ETag', '"1538002511"'),
 ('Content-Type', 'text/html; charset=UTF-8'),
 ('Age', '79'),
 ('X-Varnish-Cache', 'HIT'),
 ('Content-Length', '117229'),
 ('Connection', 'close'),
 ('Accept-Ranges', 'bytes')]

In [6]:
# headers supports dictionary-style lookup by header name

cur.headers['Server']

'Apache'

In [7]:
# the network connection is both an iterable and its own iterator:
# iter(cur) hands back the very same object

cur is iter(cur)

True

In [8]:
# usual iteration protocol reads one line at a time
# note the lines coming back are 'bytes' objects (b'...'),
# not strings.  urllib doesn't know or try to
# guess what encoding is being used by the server

[next(cur), next(cur)]

[b'<!DOCTYPE html>\n',
 b'<html  lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# " class="wf-loading" data-ng-app="app">\n']

In [9]:
# grab the rest of the lines with 'list'
# (lines already consumed with next() are not returned again)
# note 'charset="utf-8"' in the <meta> tag of the output

lines = list(cur)
lines[:7]

[b'  <head>\n',
 b'    <meta charset="utf-8" />\n',
 b'<script>(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-18290390-1", {"cookieDomain":"auto"});ga("set", "anonymizeIp", true);ga("send", "pageview");</script>\n',
 b'<meta name="title" content="Homepage | Columbia University in the City of New York" />\n',
 b'<link rel="shortlink" href="https://www.columbia.edu/content/" />\n',
 b'<link rel="canonical" href="https://www.columbia.edu/content/" />\n',
 b'<meta name="Generator" content="Drupal 8 (https://www.drupal.org)" />\n']

In [10]:
# alternatively, walk the response with a for loop

cur = urllib.request.urlopen(cu)

# range(21) caps the loop at the first 21 lines (indices 0..20);
# range is listed first so that, once it runs out, no extra
# line is pulled off the connection
for j, line in zip(range(21), cur):
    # each line arrives as a 'bytes' object, not a 'str' object
    print(line)


b'<!DOCTYPE html>\n'
b'<html  lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# " class="wf-loading" data-ng-app="app">\n'
b'  <head>\n'
b'    <meta charset="utf-8" />\n'
b'<script>(function(i,s,o,g,r,a,m){i["GoogleAnalyticsObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)})(window,document,"script","https://www.google-analytics.com/analytics.js","ga");ga("create", "UA-18290390-1", {"cookieDomain":"auto"});ga("set", "anonymizeIp", true);ga("send", "pageview");</script>\n'
b'<meta name="title" content="Homepage | 

# A simple web server

In [None]:
# Serve the files in the current directory over HTTP
# at localhost:port until the cell is interrupted.

import http.server
import socketserver

port = 8002

# url = http://localhost:8002

# request handler that serves files from the current directory
Handler = http.server.SimpleHTTPRequestHandler

# an empty host string listens on all local interfaces
httpd = socketserver.TCPServer(("", port), Handler)

try:
    print("serving at port", port)
    # blocks, handling requests, until interrupted
    httpd.serve_forever()
finally:
    # always release the listening socket; otherwise the port stays
    # bound by this process after an interrupt and re-running the
    # cell fails to bind it again
    httpd.server_close()

# 'Real' Python web servers
- the two main web frameworks are Django and Flask
- Django [doc](https://www.djangoproject.com)
- Flask [doc](https://flask.palletsprojects.com)

# W3C recommends UTF-8 for web sites
- [doc](https://www.w3.org/International/questions/qa-html-encoding-declarations)