In [None]:
# Note you may have to install requests first: run `%pip install requests` in a cell
# (or `pip3 install requests` from a terminal).

import requests
# display and HTML come from the public IPython.display module. Importing `display`
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject a CSS rule that widens the notebook container to 95% of the page,
# which makes wide tables easier to read.
display(HTML("<style>.container { width:95% !important; }</style>"))

## Simple API Call with Requests Library

It may be good to look at the reference documentation for the [requests library](https://2.python-requests.org/en/master/user/quickstart/).

First, let's have a look at the [GitHub API](https://developer.github.com/v3/).

In [None]:
# Fetch a public GitHub user profile; the 10-second timeout keeps the cell from hanging.
r = requests.get('https://api.github.com/users/nmattei', timeout=10)
# The last expression in a cell is displayed: here, the HTTP status code.
r.status_code

In [None]:
# The Content-Type header the server sent back.
r.headers['content-type']

In [None]:
# The URL the response came from.
r.url

In [None]:
# The raw response body, as bytes.
r.content

In [None]:
# The response body decoded from JSON into Python objects.
r.json()

## Looking at HTTP Requests

We'll try to get some data from Google.  Note that scraping search results this way arguably violates Google's Terms of Service (TOS), so we **should not do it this way in general -- Google has very [specific rules on their site](https://developers.google.com/custom-search/v1/).**

In [None]:
# Query-string parameters: requests URL-encodes this dict and appends it to the URL.
params = {'q':'Tulane University'}
r = requests.get('http://www.google.com/search', params = params, timeout=10)
r.status_code

In [None]:
# Inspect the response URL to see how `params` ended up in the query string.
r.url

In [None]:
# Same query dictionary, this time sent to Yahoo's search host.
params = {'q':'Tulane University'}
r = requests.get('https://search.yahoo.com/', params = params, timeout=10)
r.status_code

In [None]:
# The URL of the Yahoo response.
r.url

In [None]:
# The Content-Type header of the Yahoo response.
r.headers['content-type']

In [None]:
# The response body decoded to a str.
r.text

## More Complicated with Parameters

We'll look up some information using the [Apple iTunes Search API](https://affiliate.itunes.apple.com/resources/documentation/itunes-store-web-service-search-api/).

In [None]:
# Pass the search term as plain text. requests URL-encodes the params itself and
# turns the space into '+'. A literal '+' in the value would be sent as '%2B',
# so the service would receive the string "the+meters" instead of "the meters".
params = {'term' : "the meters"}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)
r.status_code

In [None]:
# The response URL shows how the `term` parameter was encoded.
r.url

In [None]:
# The search results decoded from JSON.
r.json()

In [None]:
# The URL again, for comparison with the JSON output above.
r.url

We can pass several parameters at once by adding more keys to the `params` dictionary, as described [here](https://2.python-requests.org/en/master/user/quickstart/).

In [None]:
# Two query parameters this time: the search term and the kind of entity to return.
# The term is plain text because requests does the URL-encoding; a pre-encoded
# "the+meters" would have its '+' escaped again as '%2B'.
params = {'term' : "the meters", 'entity' : 'album'}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)
r.status_code


In [None]:
# The response URL now carries both query parameters.
r.url

In [None]:
# The album search results decoded from JSON.
r.json()

In [None]:
# Keep the decoded payload around so we can poke at its structure.
x = r.json()

In [None]:
# Python type of the first entry under the 'results' key.
type(x['results'][0])

## Converting the returned JSON to an object!

In [None]:
import json

In [None]:
# Parse the raw response bytes with the standard-library json module.
data = json.loads(r.content)

In [None]:
# Top-level keys of the decoded payload.
data.keys()

In [None]:
# Python type of the value stored under 'results'.
type(data['results'])

In [None]:
# Python type of the second entry (index 1) in 'results'.
type(data['results'][1])

In [None]:
# The second entry itself.
data['results'][1]

In [None]:
# Field names available on that entry.
data['results'][1].keys()

## Using Beautiful Soup to Parse a Webpage.

The [beautifulsoup4 documentation](https://www.crummy.com/software/BeautifulSoup/).

In [None]:
# Grab the course webpage.
import requests
from bs4 import BeautifulSoup

# Use the same 10-second timeout as the earlier requests so a stalled server
# cannot hang the cell.
r = requests.get('https://nmattei.github.io/cmps3160/schedule/', timeout=10)

# Name the parser explicitly. Otherwise BeautifulSoup picks whichever parser is
# installed, warns about it, and may build a different tree on another machine.
root = BeautifulSoup(r.content, 'html.parser')

In [None]:
# The raw bytes of the downloaded page.
r.content

In [None]:
# The first <table> element in the parsed page (None if there is none).
root.find("table")

In [None]:
# Every <a> tag inside that first table.
# NOTE(review): find() returns None when the page has no <table>, which would make this raise -- confirm the page still contains one.
root.find("table").findAll("a")

## Trying out some Regular Expressions.

In [None]:
import re

# Download the syllabus page; the following cells search its raw HTML for 'CMPS 3160'.
# Note: `r` here is the requests Response object. It is unrelated to the r'...'
# raw-string prefix used on the regex patterns, which stops Python from
# treating backslashes in the pattern as escape sequences.
# Use the same 10-second timeout as the earlier requests so the cell cannot hang.
r = requests.get('https://nmattei.github.io/cmps3160/syllabus/', timeout=10)


In [None]:
# Let's see what we got.
r.text

In [None]:
# re.search scans the whole string and returns the first match, or None if there is none.
match = re.search(r'CMPS 3160', r.text)
# Index in the raw HTML where that first match begins.
print(match.start())

In [None]:
# NOTE(review): 390:500 is a hard-coded window, presumably chosen around the index printed above -- confirm it still lines up with the page.
r.text[390:500]

In [None]:
# Does the start match?
# re.match only succeeds at position 0, so this prints None unless the page text begins with the pattern.
match = re.match(r'CMPS 3160', r.text)
print(match)

In [None]:
# Iterate over all occurrences and print a few characters of context around each.
for m in re.finditer(r'CMPS 3160', r.text):
    # Clamp the left edge at 0. A negative start index counts from the END of
    # the string, so a match in the first 50 characters would otherwise print
    # an empty or unrelated slice.
    print(r.text[max(0, m.start()-50):m.start()+50])


In [None]:
# Find them all and the word(s)? right after?
# \s\w* extends each hit by one whitespace character plus the run of word characters after it.
match = re.findall(r'CMPS 3160\s\w*', r.text)
print(match)

In [None]:
# Can we find all the email addresses?
text = ''' This is a list that has an @ symbol in it.
            But we want to find Nick's address nsmattei@tulane.edu
            But also maybe someone else's eli@gmail.com....
            How would we write a regex for that?


            Also there is more text, and can't like 
            phil123@school.edu also be able to be caught?



'''

# First attempt -- try it on a few examples and see which rules are still missing.
# Pattern: one non-digit, optional word characters, '@', a domain, a literal dot,
# and a three-character ending.
regex = r'\D\w*@\w+\.\w{3}'
match = re.compile(regex).findall(text)
print(match)


In [None]:
### ANSWER for full email
# The dot before the three-character ending is escaped. A bare '.' matches ANY
# character, so the unescaped pattern would also accept text such as "a@b cde".
regex = r'\w+@\w+\.\w{3}'
match = re.findall(regex, text)
print(match)

In [None]:
### Only names, no domains...
# The pattern consumes the '@' too, so every result ends with '@'.
regex = r'\w+@'
match = re.findall(regex, text)
print(match)

In [None]:
## Eli's more complicated answer with lookaheads
# Use [A-Za-z] rather than [A-z]: the range A-z also covers the ASCII punctuation
# between 'Z' and 'a' ([ \ ] ^ _ `), so those characters would count as letters.
# The lookahead (?=...) requires an '@' to follow, after any non-letter,
# non-space characters, without including it in the match.
regex = r"[A-Za-z]+(?=[^A-Za-z\s]*@)"
match = re.findall(regex, text)
print(match)

In [None]:
# Now we can use this on the webpage!
# The dot is escaped so it only matches a literal '.'. Unescaped, it matches any
# character, which produces false positives on raw HTML.
regex = r'\w+@\w+\.\w{3}'
match = re.findall(regex, r.text)
print(match)

In [None]:
# More complicated RegExes - Groups
text = ''' The university of kentucky is the best
            basketball team and an ok university. and University of North CC
            The University Of Kentucky can be put in 
            some weird capitalization and University of Ken spelled wrong'''

# Three capture groups: "university", "of", and a name of at least three word characters.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(\w{3,})'

# search() stops at the first hit; groups() returns its captured pieces as a tuple.
m = re.compile(regex).search(text)
print(m.groups())

In [None]:
# Find all
# With several capture groups, findall returns one tuple of group strings per match.
print(re.findall(regex, text))

In [None]:
# Named Groups.
text = ''' The university of kentucky is the best University of Lousiana
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization'''

# (?P<school>...) names the third group so it can be read back by name.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'

# groupdict() contains only the named groups of the first match.
m = re.compile(regex).search(text)
print(m.groupdict())


In [None]:
# Find every match and show its named group.
text = ''' The university of kentucky is the best
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization.  And Kentucky is much better than
            the University of Mississippi.'''

# Same pattern as before: the school name is captured under the name "school".
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'

# finditer yields one match object per occurrence, in order.
pattern = re.compile(regex)
for m in pattern.finditer(text):
    print(m.groupdict())


In [None]:
# Plain string replace: every 'a' becomes 'X'.
'abcabcabc'.replace('a', 'X')

In [None]:
text = 'I love Introduction to Data Science'
# re.sub with a literal pattern does the same job as str.replace.
re.sub(r'Data Science', r'Schmada Schmience', text) 

In [None]:
# \1 and \2 in the replacement refer to the captured groups, emitted here in swapped order.
re.sub(r'(\w+)\s([Ss]cience)', r'\2 \1hmience', text) 


In [None]:
# Let's use it to parse part of a CSV?
text = '12,15,22,36,78,33,77,33,45'

# Use Regex split command
print(re.split(',', text))

# Use string split command
print(text.split(","))

# Use Regex to pull each value out as a named group.
# Match the digits themselves (\d+) instead of "digits followed by a comma".
# The old pattern r'(?P<data>\d*,)' kept the comma in every value and silently
# dropped the last field, which has no trailing comma.
regex = r'(?P<data>\d+)'
for m in re.finditer(regex, text):
    print(m.groupdict())


## Downloading All the ... PDFs from the course website.

Using beautiful soup and some regular expressions.

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
import os
import pathlib

In [None]:
# HTTP GET request sent to the URL url
# But our schedule is in an iframe... can we see that?
# Use the same 10-second timeout as the earlier requests so a stalled server cannot hang the cell.
r = requests.get( "https://nmattei.github.io/cmps3160/schedule/", timeout=10 )
r.text


In [None]:
# HTTP GET request sent to the URL url
# We're going to use John's webpage because IFrames cause lots of issues...
# Use the same 10-second timeout as the earlier requests so a stalled server cannot hang the cell.
r = requests.get( "https://cmsc320.github.io/", timeout=10 )
r.text


In [None]:
# Use BeautifulSoup to parse the GET response -- we want the second table on the page..
# Name the parser explicitly so the result does not depend on which optional
# parser happens to be installed (and to avoid the "no parser specified" warning).
root = BeautifulSoup( r.content, "html.parser" )
# find_all is the current name of the legacy findAll alias; list every <table>.
lnks = root.find_all("table")
lnks

In [None]:
# Use BeautifulSoup to parse the GET response -- we want the second table on the page..
root = BeautifulSoup( r.content, "html.parser" )
tables = root.find_all("table")
# Look inside the <tbody> of the second table (index 1). Fall back to the table
# itself when the markup has no explicit <tbody>.
second_table = tables[1]
table_body = second_table.find("tbody") or second_table
# lnks ends up holding the <a> tags, which the following cells iterate over.
lnks = table_body.find_all("a")
lnks

In [None]:
# Cycle through the href for each anchor, checking
# to see if it's a PDF/PPTX link or not
pdfs = []
for lnk in lnks:
    # .get() returns None for anchors without an href, where lnk['href'] would raise KeyError.
    href = lnk.get('href')

    # If it's a PDF/PPTX link, queue a download
    if href and href.lower().endswith(('.pdf', '.pptx')):
        pdfs.append(href)
print(pdfs)

In [None]:
# Notice above we need to insert the base link...
base_url = "https://cmsc320.github.io/"
for href in pdfs:
    # Resolve relative hrefs against the site root; absolute hrefs pass through unchanged.
    urld = urljoin(base_url, href)
    print(urld)

    # stream=True defers the body download, and the with-block releases the
    # connection when we are done. The timeout keeps a stalled server from
    # hanging the loop.
    with requests.get(urld, stream=True, timeout=10) as rd:
        # Don't save an HTML error page under a .pdf/.pptx name.
        if not rd.ok:
            print("Skipping (HTTP {}): {}".format(rd.status_code, urld))
            continue

        # Write the downloaded PDF to a file
        # Note because the href is a path we have to just get the filename!
        outfile = os.path.join("./", href.split("/")[-1])
        print("Writing: ",outfile)
        with open(outfile, 'wb') as f:
            # Copy in chunks so a large file is never held in memory all at once.
            for chunk in rd.iter_content(chunk_size=8192):
                f.write(chunk)

### Below here is an example of how to do this off a GitHub page but it is sort of broken, use at your own risk...

Let's do the easier one first and download all the `.ipynb` from the webpage.  We'll get into why this is easier in a second...

In [None]:
# Cycle through the href for each anchor, checking
# to see if it's an ipynb link or not
notebooks = []
for lnk in lnks:
    # .get() returns None for anchors without an href, where lnk['href'] would raise KeyError.
    href = lnk.get('href')
    # If it's a notebook (.ipynb) link, queue a download.
    # Include the dot so names that merely end in the letters "ipynb" don't match.
    if href and href.lower().endswith('.ipynb'):
        notebooks.append(href)
        print("{} is a Link to {}".format(lnk.contents, href))
print(notebooks)

In [None]:
# Download all the files to whatever you're running notebook from.

# Make the output directory once, before the loop, since it is the same for every file.
outputdir = os.path.join(os.getcwd(), "downloaded")
os.makedirs(outputdir, exist_ok=True)

for href in notebooks:
    # Be careful for href! It may be relative, and requests needs an absolute URL,
    # so resolve it against base_url (defined in the PDF download cell).
    # Absolute hrefs pass through urljoin unchanged.
    url = urljoin(base_url, href)
    print("Downloading... {}".format(url))

    with requests.get(url, stream=True, timeout=10) as rd:
        # Don't save an HTML error page as a notebook.
        if not rd.ok:
            print("Skipping (HTTP {}): {}".format(rd.status_code, url))
            continue

        # Note because the href is a path we have to just get the filename!
        outfile = os.path.join(outputdir, href.split("/")[-1])
        print("Writing: ",outfile)
        with open(outfile, 'wb') as f:
            # Copy in chunks so a large file is never held in memory all at once.
            for chunk in rd.iter_content(chunk_size=8192):
                f.write(chunk)


Now for the more complicated case: let's try to grab all the PDFs...

The first thing to note is that these links have "PDF" in their link text but not in their target URL, and the files are hosted on GOOGLE! -- so this doesn't really work :-(


In [None]:
# We can go check, we get a google drive directory...

# Use the same 10-second timeout as the earlier requests so the cell cannot hang.
r = requests.get( "https://drive.google.com/drive/u/1/folders/1uGrhWzhXbiqoChTK0fQXg340X319REks", timeout=10 )

# Use BeautifulSoup to parse the GET response (parser named explicitly)
root = BeautifulSoup( r.content, "html.parser" )
#lnks = root.find("table").findAll("a")
#lnks
# Show the parsed nodes. `.contents` is the real attribute; `root.content` is
# treated by BeautifulSoup as a search for a <content> tag and evaluates to None,
# so the cell used to display nothing.
root.contents



In [None]:
# We have all google links so we need to check the tags to see if they contain PDF!
pdfs = []
for lnk in lnks:
    # .get() returns None for anchors without an href, where lnk['href'] would raise KeyError.
    href = lnk.get('href')
    # get_text() gathers all the text inside the anchor. lnk.contents[0].lower()
    # raised IndexError on empty anchors and failed when the first child was a
    # nested tag instead of a string.
    if href and 'pdf' in lnk.get_text().lower():
        print("{} is a PDF Link to {}".format(lnk.contents, href))
        pdfs.append(href)
print(pdfs)

In [None]:
# Google doesn't make this easy, so we rebuild a direct-download URL from each link.
# Format is: https://drive.google.com/u/1/uc?id=ID&export=download
# The ID is taken as the second-to-last "/"-separated piece of the link.
download_links = [
    "https://drive.google.com/u/1/uc?id={}&export=download".format(link.split("/")[-2])
    for link in pdfs
]
print(download_links)
