In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os 
import sys 

In [None]:
import time 

In [None]:
import re
import json
import bs4
import requests

In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

# Scraping

## https://www.springfieldspringfield.co.uk

In [None]:
# Download the script page for one episode (tv-show "anne-2017", episode s02e10).
bowl = requests.get('https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=anne-2017&episode=s02e10')

In [None]:
# HTTP status code of the response fetched above.
bowl.status_code

In [None]:
# Parse the response body with the standard-library HTML parser.
soup = BeautifulSoup(bowl.text, 'html.parser')
type(soup)

Using a tag name as an attribute will give you only the first tag by that name

In [None]:
# First <div> found under <body>.
soup.body.div

In [None]:
# Search by CSS class. A bare string as the first argument is treated as a tag
# *name*, so 'class="episode_script"' could never match any element.
soup.body.find_all(class_='episode_script')

Get the contents of a `div` that has a specific `class`, such as `<div class="scrolling-script-container">`

```html
<div class="episode_script">
<div class="scrolling-script-container">
                    			1
 [NO AUDIBLE DIALOGUE.]<br/>
  - [MUFFLED.]<br/> Here.<br
 ...                            
</div>  

```  
https://stackoverflow.com/a/22735249/7583919

In [None]:
# Text of the first <div> whose class list is exactly ['scrolling-script-container'].
lyrics = soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') ==
                       ['scrolling-script-container'])[0].text

# Output directory under the current working directory; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
subtitle = os.getcwd() + '/subtitle/'
os.makedirs(subtitle, exist_ok=True)

# Build the file path from the directory that was just created, and write as
# UTF-8 explicitly so non-ASCII characters do not depend on the platform default.
with open(os.path.join(subtitle, 'lyrics.html'), 'w', encoding='utf-8') as f:
    f.write(lyrics)

In [None]:
# Fetch and parse the episode index page of the same show.
bowl = requests.get(
    'https://www.springfieldspringfield.co.uk/episode_scripts.php?tv-show=anne-2017'
)
soup = BeautifulSoup(bowl.text, 'html.parser')

In [None]:
# Show the whole <body> element of the parsed page.
soup.body

In [None]:
# All <div> tags whose class list is exactly ['main-content-left'].
soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') ==
              ['main-content-left'])

In [None]:
# Every <a> tag inside the first <div> whose class list is exactly ['main-content-left'].
soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') ==
              ['main-content-left'])[0].find_all('a')

## http://www.aaronsw.com
http://www.aaronsw.com/weblog/fullarchive

bowl.text vs bowl.content

`bowl.text` is the content of the response decoded to Unicode, and `bowl.content` is the content of the response in bytes. The latter is useful when the address refers to binary data such as an image.

In [None]:
# Fetch and parse the full archive page of the weblog.
bowl = requests.get('http://www.aaronsw.com/weblog/fullarchive')
soup = BeautifulSoup(bowl.text, 'html.parser')

<span style="font-family:New York Times; font-size:1em; color:green;">
Extract the URLs of all posts from the soup.
</span>

In [None]:
# href of the second of the first ten <a> tags that carry an href attribute.
soup.body.find_all('a', href = True, limit=10)[1]['href']

In [None]:
soup = BeautifulSoup(bowl.text, 'html.parser')

# Collect every anchor that has an href, then drop the first two before
# keeping only the href values.
anchors = soup.body.find_all('a', href=True)
PostLink = [anchor['href'] for anchor in anchors[2:]]

# Prefix that is joined with each entry of PostLink when downloading.
baseurl = 'http://www.aaronsw.com/weblog/'

### single thread approach

In [None]:
# Download every post one after another and report the elapsed wall-clock seconds.
startt = time.time()
baseurl = 'http://www.aaronsw.com/weblog/'
bowls = [ requests.get(baseurl + i) for i in PostLink]
time.time()-startt

<span style="font-family:New York Times; color: red">
Takes around 6 minutes
</span>

<span style="font-family:New York Times; font-size:1em; color:green;">
This is a good scenario for multithreading.
    
Q: Is the task CPU intensive or I/O intensive? If the answer is I/O intensive, then you can go with threads.

https://stackoverflow.com/questions/40894487/python-threading-or-multiprocessing-for-web-crawler

### multithreading 

In [None]:
from concurrent.futures import ThreadPoolExecutor

In [None]:
startt = time.time()
# submit() schedules exactly ONE call, so every URL needs its own future.
# Passing the whole list would call requests.get(<list>) once and fail.
# The `with` block shuts the pool down once all work has finished.
with ThreadPoolExecutor(6) as pool:
    futures = [pool.submit(requests.get, baseurl + i) for i in PostLink]
    # result() blocks until the corresponding download is done, so the
    # elapsed time below covers the downloads, not just the scheduling.
    res = [future.result() for future in futures]
time.time()-startt

In [None]:
startt = time.time()
with ThreadPoolExecutor(8) as executor:
    # executor.map returns a one-shot lazy iterator; materialise it so that
    # `bowls` supports len() and indexing and any request error surfaces here.
    bowls = list(executor.map(requests.get, [baseurl+i for i in PostLink]))
time.time()-startt

In [None]:
startt = time.time()
with ThreadPoolExecutor(12) as executor:
    # executor.map returns a one-shot lazy iterator; materialise it so that
    # `bowls_2` can be reused and any request error surfaces here.
    bowls_2 = list(executor.map(requests.get, [baseurl+i for i in PostLink]))
time.time()-startt

### multiprocessing

In [None]:
from multiprocessing import Pool

In [None]:
# Same download, spread over 6 worker processes; Pool.map returns a list.
startt = time.time()
with Pool(6) as p:
    bowls = p.map(requests.get, [baseurl+i for i in PostLink])
time.time()-startt

### write the result to a file

In [None]:
# The output directory must exist before the files can be opened for writing.
os.makedirs('AaronSwartz', exist_ok=True)

# One file per downloaded post, named by its position in `bowls`.
for i, response in enumerate(bowls):
    with open('AaronSwartz/{}.html'.format(i), 'wb') as f:
        f.write(BeautifulSoup(response.text, 'html.parser').encode('utf-8'))

##  http://arxiv.org

### by a higher level approach

In [None]:
class Paper():
    """Holds the information for one arXiv paper.

    Attributes set by the constructor (or restored by ``load``):
        number:      the arXiv identifier as passed in.
        title:       the paper title.
        authors:     list of author display names (values of ``auths``), or None.
        author_ids:  list of author keys (keys of ``auths``), or None.
        author_dict: shallow copy of ``auths``, or None.
        abstract:    the abstract text.
        link:        'http://arxiv.org/abs/<number>', or None when no number is given.
    """
    def __init__(self,
                 number=None,
                 title=None,
                 auths=None,
                 abstract=None,
                 fromfile=None):
        """Initialize a paper with the arxiv number, title, authors, and abstract.

        Args:
            number: arXiv identifier; any value is accepted and converted with
                ``str`` when building ``link``.
            title: paper title.
            auths: mapping of author id -> author display name.
            abstract: abstract text.
            fromfile: if given, all other arguments are ignored and the
                attributes are restored through ``load(fromfile)``.
        """

        if fromfile is not None:
            self.load(fromfile)

        else:
            self.number = number
            self.title = title
            if auths is not None:
                self.authors = list(auths.values())
                self.author_ids = list(auths.keys())
                self.author_dict = auths.copy()
            else:
                self.authors = None
                self.author_ids = None
                self.author_dict = None

            self.abstract = abstract
            # Every argument is optional, so `number` may be None or a
            # non-string; plain string concatenation would raise TypeError.
            if number is None:
                self.link = None
            else:
                self.link = u'http://arxiv.org/abs/' + str(number)

    def format_line(self, strval, maxlength, pad_left, pad_right):
        """Wrap ``strval`` into padded lines; used by the __str__ routine.

        Newlines are removed from ``strval``, a '-' and a line break are
        inserted after every ``maxlength`` characters, the last line is
        right-padded with spaces up to ``maxlength``, and every line is
        wrapped in ``pad_left`` / ``pad_right``.

        Returns:
            The formatted, newline-joined string.
        """
        # count/flags are passed by keyword: passing them positionally to
        # re.sub is deprecated in recent Python versions.
        temp = re.sub("(.{" + "{:d}".format(maxlength) + "})", u"\\1-\n",
                      strval.replace('\n', ''), count=0,
                      flags=re.DOTALL).strip()

        temp = temp.split('\n')

        # A negative repeat count yields '', so over-long last lines are left as is.
        temp[-1] = temp[-1] + u'\u0020' * (maxlength - len(temp[-1]))

        return pad_left + (pad_right + '\n' + pad_left).join(temp) + pad_right

    def get_search_string(self):
        """Return one lower-cased string joining abstract, title, number,
        author ids and author names (``number`` is used as is, so it must be a
        string), separated by two spaces."""

        return '  '.join(
            [self.abstract.lower(),
             self.title.lower(), self.number] +
            [a.lower()
             for a in self.author_ids] + [a.lower() for a in self.authors])

    def save(self, filename):
        """Append the instance attributes to ``filename`` as a JSON object."""
        # NOTE(review): the file is opened in append mode, so saving twice to
        # the same file produces concatenated JSON objects that json.load()
        # in `load` cannot read back - confirm whether "w" was intended.
        with open(filename, "a") as f:
            json.dump(vars(self), f)

    def load(self, filename):
        """Restore attributes from a JSON file or from a mapping.

        If ``filename`` names an existing file, its JSON content is read.
        Otherwise (including when ``os.path.exists`` rejects the argument with
        TypeError) the argument itself is used as the attribute mapping.
        """
        try:
            if os.path.exists(filename):
                with open(filename, 'r') as f:
                    dat = json.load(f)
            else:
                dat = filename
        except TypeError:
            dat = filename
        for key, val in dat.items():
            setattr(self, key, val)

    def __eq__(self, paper):
        """Two papers are equal when their arXiv numbers are equal."""
        return (self.number == paper.number)

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable in Python 3; hash
        # on the same field equality uses so papers work in sets and dict keys.
        return hash(self.number)

    def __ne__(self, paper):
        return not self.__eq__(paper)

    # Ordering compares the arXiv numbers converted to float.
    def __le__(self, paper):
        return float(self.number) <= float(paper.number)

    def __ge__(self, paper):
        return float(self.number) >= float(paper.number)

    def __lt__(self, paper):
        return float(self.number) < float(paper.number)

    def __gt__(self, paper):
        return float(self.number) > float(paper.number)

    def __str__(self):
        """ Display the paper in a somewhat nice looking way:
        title, link and authors inside a box drawn with '%' characters. """

        maxlen = 80
        pad_char = u"\u0025"
        newline_char = u"\u000A"
        space_char = u"\u0020"
        tab_char = space_char * 4
        comma_char = u"\u002C"
        and_char = u"\u0026"

        pad_left = pad_char * 3 + tab_char
        pad_right = tab_char + pad_char * 3

        # "A", or "A, B, & C" for several authors.
        if len(self.authors) == 1:
            authstr = self.authors[0]
        else:
            authstr = (comma_char + space_char).join(self.authors[:-1])
            authstr += (comma_char + space_char + and_char + space_char +
                        self.authors[-1])

        authstr = self.format_line(authstr, maxlen, pad_left, pad_right)
        titlestr = self.format_line(self.title, maxlen, pad_left, pad_right)
        linkstr = self.format_line(self.link, maxlen, pad_left, pad_right)
        border = pad_char * (maxlen + len(pad_left) + len(pad_right))
        blank_line = pad_left + space_char * maxlen + pad_right

        strbody = newline_char + \
                border + newline_char + \
                blank_line  + newline_char + \
                titlestr + newline_char + \
                blank_line  + newline_char + \
                linkstr + newline_char + \
                blank_line  + newline_char + \
                authstr + newline_char + \
                blank_line  + newline_char + \
                border + newline_char + \
                newline_char

        # Check for python 2 to convert from unicode
        if sys.version_info < (3, ):
            strbody = strbody.encode("utf8", "ignore")
        return strbody

In [None]:
def authors_list_to_dict(author_list):
    """Map a list of author strings to ``{"<Last>_<F>": "<display name>"}``.

    For each entry, anything from the first '(' onwards (an affiliation) is
    dropped. The key is the last name, an underscore and the upper-cased first
    letter of the first name. The value is the remaining name parts joined by
    single spaces. Entries that contain no name at all are skipped.

    Args:
        author_list: iterable of author name strings.

    Returns:
        dict mapping author id to author display name.
    """

    authors_dict = {}
    for a in author_list:

        if '(' in a:
            # We have an affiliation
            a = a.split('(')[0]
        temp = a.split()

        if not temp:
            # Empty or affiliation-only entry: there is no name to index.
            continue

        if len(temp) > 2:
            # More than two names, take first and last
            name = (temp[0], temp[-1])
        elif len(temp) == 1:
            # Just one name, probably a spacing error ("J.Doe"); split on the
            # dots and ignore empty fragments from leading/trailing dots.
            temp = [part for part in temp[0].split('.') if part]
            if not temp:
                continue
            name = (temp[0], temp[-1])
        else:
            # Two names
            name = (temp[0], temp[1])

        authors_dict[name[1] + '_' + name[0][0].upper()] = ' '.join(temp)
    return authors_dict

In [None]:
def read_paper_from_url(number):
    """Download an arXiv abstract page and build a ``Paper`` from it.

    Args:
        number: arXiv identifier; it is converted with ``str`` before use.

    Returns:
        A ``Paper`` holding the title, authors and abstract scraped from the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        IndexError: if an expected element is missing from the page.
    """
    number = str(number)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed as a paper.
    bowl = requests.get('http://arxiv.org/abs/' + number, timeout=30)
    bowl.raise_for_status()
    soup = bs4.BeautifulSoup(bowl.text, 'html.parser')

    # Each element's text starts with a "Title:" / "Authors:" / "Abstract:"
    # label; keep only what follows the label.
    title = soup.find_all(
        'h1', attrs={'class':
                     'title mathjax'})[0].text.split('Title:')[-1].strip()

    authors = [
        x.strip() for x in soup.find_all('div', attrs={'class': 'authors'})[0].
        text.split('Authors:')[-1].split(',')
    ]

    abstract = soup.find_all(
        'blockquote',
        attrs={'class':
               'abstract mathjax'})[0].text.split('Abstract:')[-1].strip()

    return Paper(number, title, authors_list_to_dict(authors), abstract)

In [None]:
# Example: scrape one paper; the returned Paper is displayed as the cell output.
read_paper_from_url('1908.04905')

### by a lower level approach

In [None]:
# The arXiv identifier is text, not a number: as a float literal, an id ending
# in zeros (e.g. 1908.04900) would lose them when converted back with str().
bowl = requests.get('http://arxiv.org/abs/' + '1908.04905')
soup = bs4.BeautifulSoup(bowl.text, 'html.parser')

```html
<h1 class="title mathjax"><span class="descriptor">Title:</span>Random walk on a lattice in the presence of obstacles: The short-time transient regime, anomalous diffusion and crowding</h1>
```

In [None]:
soup = bs4.BeautifulSoup(bowl.text, 'html.parser')

# Title: text of the first <h1 class="title mathjax">, without the "Title:" label.
title_tag = soup.find_all('h1', attrs={'class': 'title mathjax'})[0]
title = title_tag.text.split('Title:')[-1].strip()
title

# Authors: comma-separated names that follow the "Authors:" label.
authors_tag = soup.find_all('div', attrs={'class': 'authors'})[0]
authors = [x.strip() for x in authors_tag.text.split('Authors:')[-1].split(',')]
authors

# Abstract: text of the first <blockquote class="abstract mathjax">, without its label.
abstract_tag = soup.find_all('blockquote', attrs={'class': 'abstract mathjax'})[0]
abstract = abstract_tag.text.split('Abstract:')[-1].strip()
abstract

## https://www.gotouniversity.com

###  `bs4`

In [None]:
# Fetch and parse the course index page.
bowl = requests.get('https://www.gotouniversity.com/course/index')
soup = bs4.BeautifulSoup(bowl.text, 'html.parser')

In [None]:
# Text of every <p class="university-name"> on the page.
UniversityName = [i.text for i in soup.find_all('p', attrs={'class': 'university-name'})]

When writing a scraper, it's a good idea to look at the source of the HTML file and familiarize yourself with the structure. 

In [None]:
# Save the parsed page as UTF-8 so its structure can be inspected in an editor.
with open('gotouniversity.html', 'wb') as f:
    f.write(soup.encode('utf-8'))

```html
<a href="https://www.gotouniversity.com/programs/accelerated-bachelors/united-states-of-america/sciences/biochemistry-and-biophysics/loyola-university-chicago/bsms-in-biochemistry" target="_blank">
<span class="large-text program-name" title="BSMS in Biochemistry">BSMS in Biochemistry</span>
</a>
<script type="application/ld+json">
{
  "@context": "http://schema.org",
  "@type": "Course",
  "name": "BSMS in Biochemistry",
  "description": "",
  "provider": {
    "@type": "Organization",
    "name": "Loyola University Chicago",
    "sameAs": "https://www.gotouniversity.com/university/loyola-university-chicago"
  }
}
</script>
<a href="/university/loyola-university-chicago" target="_blank" title="University">
<p class="university-name" title="Loyola University Chicago">Loyola University Chicago</p>
</a>
<p class="location-name" title="Chicago Illinois"> Chicago, Illinois</p>
```

In [None]:
url = 'https://www.gotouniversity.com/course/index'

# Form payload; the 'page' entry is overwritten on every iteration below.
params = {'page': 80}
UniversityName = []
ProjectName = []
for page in range(1, 11):
    params['page'] = page
    # Each listing page is requested with a POST carrying the page number.
    listing_html = requests.post(url, data=params).text
    soup = BeautifulSoup(listing_html, 'html.parser')
    university_links = soup.select('a[title="University"]')
    program_spans = soup.select('span[class="large-text program-name"]')
    # One inner list per page; the lists are flattened in a later cell.
    UniversityName.append([a.get_text(strip=True) for a in university_links])
    ProjectName.append([a.get_text(strip=True) for a in program_spans])

#### design data format

In [None]:
# Flatten the per-page lists into one list each. A comprehension is linear,
# whereas sum(list_of_lists, []) copies the growing result on every step.
UniversityName = [name for page_names in UniversityName for name in page_names]
ProjectName = [name for page_names in ProjectName for name in page_names]

In [None]:
# Column name -> list of values; used to build the DataFrame in the next cell.
data = {"ProjectName": ProjectName, "UniversityName": UniversityName}

In [None]:
# Build the table and export it as JSON and as an Excel workbook
# (the Excel export uses the xlsxwriter engine).
df = pd.DataFrame(data)
df.to_json('gotoun.json')
df.to_excel('gotoun.xlsx', engine='xlsxwriter')

### selenium

* https://stackoverflow.com/q/51591849
  
  The only way to do this is to execute the Javascript that handles the click event - you won't do it with a regular GET request.
* https://stackoverflow.com/q/31442119

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

# The chromedriver location is machine-specific: allow overriding it through
# the CHROMEDRIVER_PATH environment variable, falling back to the original path.
# NOTE(review): recent Selenium 4 releases no longer accept `executable_path`
# (a Service object is used instead) - confirm against the installed version.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH',
                                   '/Users/wangmiao/Desktop/chromedriver')
driver = webdriver.Chrome(executable_path=chromedriver_path)
driver.get('https://www.gotouniversity.com/course/index')

# Collect the text of every element with class "university-name".
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which was removed in Selenium 4.
university_name = driver.find_elements(By.CLASS_NAME, "university-name")
university_name = [link.text for link in university_name]

> 如果href链接`(<a>)`值是javascript:void(0) 而不是网址，暗示链接不用于href响应页面导航，但使用绑定到链接点击事件的javascript函数来响应用户点击。因此无法driver.get(url)直接打开目标页面，必须单击链接以触发单击事件，该事件将调用javascript函数以导航到目标页面。

> 提示需要等待一段时间才能完成浏览器加载javascript并注册javascript函数以链接点击事件。否则点击链接后没有任何反应。

%%html
<div class="pagination"><div aria-live="polite" role="status" style="float:left; height:14px; padding:8px">Showing 1 to 20 of 143981 entries</div><div style="float:right;"><ul class="pagination" id="pagin_count"><li class="active" p="1"><a>1</a></li><li p="2"><a href="javascript:void()" onclick="pagingcustom(2);">2</a></li><li p="3"><a href="javascript:void()" onclick="pagingcustom(3);">3</a></li><li p="4"><a href="javascript:void()" onclick="pagingcustom(4);">4</a></li><li p="5"><a href="javascript:void()" onclick="pagingcustom(5);">5</a></li><li p="6"><a href="javascript:void()" onclick="pagingcustom(6);">6</a></li><li p="7"><a href="javascript:void()" onclick="pagingcustom(7);">7</a></li><li p="8"><a href="javascript:void()" onclick="pagingcustom(8);">8</a></li><li p="9"><a href="javascript:void()" onclick="pagingcustom(9);">9</a></li><li p="10"><a href="javascript:void()" onclick="pagingcustom(10);">10</a></li><li p="1"><a href="javascript:void()" onclick="pagingcustom(1);">Next</a></li></ul></div></div>
</div>
<script>
function fn_advcount(id){
    $.ajax({
            url: 'https://www.gotouniversity.com/site/advertisement-count',
            data: { id : id },
            success: function(result){
    }});
  }
</script>

"javascript:void(0)" means that the link wouldn't work. It'll do nothing. That is why no action is taking place when you click on it.
    
* https://stackoverflow.com/a/1291950/7583919

In [None]:
# https://stackoverflow.com/a/35786344/7583919
from selenium.webdriver.common.by import By

# find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name, which
# was removed in Selenium 4.
aElements = driver.find_elements(By.TAG_NAME, "a")
result = []
for name in aElements:
    # Read the attribute once; it is None for anchors without an href.
    href = name.get_attribute("href")
    if href is not None and "javascript:void()" in href:
        print("IM IN HUR")
        # Idea kept from the original cell (not executed): after each click,
        # collect the "university-name" elements and append their text to `result`.
        # NOTE(review): clicking may re-render the page and invalidate the
        # remaining entries of aElements - verify against the live site.
        name.click()

https://codeday.me/bug/20190123/563610.html

https://stackoverflow.com/questions/52876136/google-search-next-pages-using-selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# The chromedriver location is machine-specific: allow overriding it through
# the CHROMEDRIVER_PATH environment variable, falling back to the original path.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH',
                                   '/Users/wangmiao/Desktop/chromedriver')
driver = webdriver.Chrome(executable_path=chromedriver_path)
driver.get('https://www.gotouniversity.com/course/index')
Page_number = 1
Max_page = 10

while Page_number <= Max_page:

    # Wait (up to 20 s) until the university names are present, then print them.
    university_name = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                                             '.university-name')))
    university_name = [link.text for link in university_name]
    print(university_name)
    Page_number = Page_number + 1
    if Page_number > Max_page:
        # The last wanted page has been scraped; do not wait for (and click)
        # a link to page Max_page + 1.
        break
    # The pagination links run JavaScript, so the link whose text is the next
    # page number is clicked through a script.
    element = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH,
                                    '//a[text()="' + str(Page_number) + '"]')))
    driver.execute_script("arguments[0].click();", element)

## https://www.premierleague.com/players

Besides content that is parsed as HTML, there are other formats.

In [None]:
import requests
from bs4 import BeautifulSoup as soup
from pprint import pprint

In [None]:
player_name = ['Bernd Leno', 'Emiliano Martínez', 'Matt Macey', 'Héctor Bellerín']
player = {}
for i in player_name:
    # NOTE(review): the player id in the URL (10483) is the same for every
    # name - confirm that each player is not supposed to have its own id.
    player_page = requests.get(
        'https://www.premierleague.com/players/10483/{}/stats'.format(i))
    # Use the BeautifulSoup class by its own name: the alias `soup` is rebound
    # to a parsed document by other cells, which would break this call on re-run.
    cont = BeautifulSoup(player_page.content, 'lxml')

    # Pair each stat label (first text node of span.stat) with its value
    # (text of the nested <span>).
    data = dict(
        (k.contents[0].strip(), v.get_text(strip=True)) for k, v in zip(
            cont.select('.topStat span.stat, .normalStat span.stat'),
            cont.select(
                '.topStat span.stat > span, .normalStat span.stat > span')))
    player[i] = data

pprint(player)

## http://fz.people.com.cn/skygb/sk/index.php/Index

### single thread

In [None]:
bowl = requests.get('http://fz.people.com.cn/skygb/sk/index.php/Index')
soup = bs4.BeautifulSoup(bowl.text, 'html.parser')
# Text of the first <span title=...>, then the full list of such spans.
soup.select("span[title]")[::20][0].get_text()
soup.select("span[title]")


url = 'http://fz.people.com.cn/skygb/sk/index.php'
for page in range(1, 3):
    # The page number travels in the query string ("?&p=<n>"). The stray
    # print(params) from the original was removed: `params` belongs to a
    # different section and is not used here.
    bowl = requests.post(url + "?&p={}".format(page))
    soup = BeautifulSoup(bowl.text, 'html.parser' )
    print([a.get_text(strip=True) for a in soup.select("span[title]")])
    print("xxx")

### parallel requests

In [None]:
def _span_texts(responses):
    """Return the stripped text of every <span title=...> in the given
    responses, in response order, as one flat list."""
    return [
        x.get_text(strip=True)
        for response in responses
        for x in BeautifulSoup(response.text, 'html.parser').select("span[title]")
    ]


url = 'http://fz.people.com.cn/skygb/sk/index.php'

# Pages 1-599 with a process pool.
startt = time.time()
with Pool(6) as p:
    bowls = p.map(requests.post,
                  [url + "?&p={}".format(i) for i in range(1, 600)])
time.time() - startt

# The helper flattens while extracting, which replaces the quadratic
# sum(list_of_lists, []) and the duplicated comprehension.
res = _span_texts(bowls)

# Pages 600-1581 with a thread pool.
startt = time.time()
with ThreadPoolExecutor(8) as executor:
    # executor.map is lazy and one-shot; materialise it so errors surface here
    # and `bowls_1` stays reusable.
    bowls_1 = list(executor.map(
        requests.post, [url + "?&p={}".format(i) for i in range(600, 1582)]))
time.time() - startt

res_2 = _span_texts(bowls_1)
final_result = res + res_2

In [None]:
# The flat result list is read in groups of 20 entries; position 0 of each
# group becomes the first column.
data = {'项目批准号': final_result[0::20]}
df = pd.DataFrame(data)

# Names of the columns taken from positions 1..10 of every group, in order.
title = [
    "项目类别", "学科分类", "项目名称", "立项时间", "项目负责人", "专业职务", "工作单位", "单位类别", "所在省区市",
    "所属系统"
]

for offset in range(1, len(title) + 1):
    df[title[offset - 1]] = final_result[offset::20]

# Keep the rows whose "立项时间" value does not contain "2013" and export them.
contains_2013 = df["立项时间"].str.contains("2013")
filter_result = df[~contains_2013]
filter_result.to_excel("classified.xlsx")

# `bs4`

[A Simple Cheat Sheet for Web Scraping with Python](https://blog.hartleybrody.com/web-scraping-cheat-sheet/)

https://stackoverflow.com/questions/57767188/python-beautifulsoup-replace-links-with-url-in-string#57767188

In [None]:
html = """
<html><head></head>
<body>
<a href="www.google.com">foo</a> some text 
<a href="www.bing.com">bar</a> some <br> text
</body></html>"""

soup = BeautifulSoup(html, 'html.parser')

# Replace the visible text of every link with the link's own href.
for anchor in soup.find_all('a'):
    target = anchor.get('href')
    anchor.string = target
print(soup)

In [None]:
url = 'https://www.basketball-reference.com/players/a/abrinal01.html'
res = requests.get(url)
res.raise_for_status()

soup = bs4.BeautifulSoup(res.text, 'html.parser')

table = soup.find("table", { "id" : "per_game" })
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('no <table id="per_game"> found at ' + url)
table_rows = table.find_all('tr')

for tr in table_rows:
    # A list of tag names matches either tag and keeps document order, so a
    # row's header cell (<th>) stays in front of its data cells (<td>)
    # instead of being appended after them.
    cells = tr.find_all(['th', 'td'])
    row = [cell.text for cell in cells]
    print(row)

# Extract from  `xml`

## `html` vs `xml`

* `html` is static because it is used to display data.	
* `xml` is dynamic because it is used to transport the data not for displaying the data.
*  One other potential advantage to using `xml` is that some processing can be moved client-side as opposed to server-side.

### Why `xml` and when should I use it?

> Here is a common scenario: Data is stored as XML, retrieved by Java, and displayed in HTML. The underlying code remains the same (meaning that a programmer doesn’t have to sit there all day making changes), and the screen doesn’t refresh constantly, annoying the end user. But when there’s a need for new data, it’s there in its current form.

[xml preview](https://codebeautify.org/xmlviewer)

## parse by Beautifulsoup

In [None]:
# Download an Atom feed and show its second <content> element.
# NOTE(review): "lxml" selects the HTML parser; confirm whether BeautifulSoup's
# "xml" parser was intended for this XML document.
url = "https://complicatedphenomenon.gitlab.io/atom.xml"
bowl = requests.get(url)
soup = BeautifulSoup(bowl.text, "lxml")
soup.find_all("content")[1]

## parse by xml.etree

掌握http协议，熟悉html、dom、xpath等常见的数据抽取技术

https://docs.python.org/3/library/xml.etree.elementtree.html

A good tool with clear documentation of its API helps you get your work done quickly.

### `xml` from local file

In [None]:
# xml.etree.cElementTree was removed in Python 3.9; ElementTree has used the C
# accelerator automatically since Python 3.3, so it is the drop-in replacement.
from xml.etree import ElementTree as ET

* We can import this data by reading from a file
* Or directly from a string

<span style="font-family:New York Times; font-size:1.2em; color:red;">
    
the `<poll>` element contains a couple of "attributes", such as `title` `totalvotes` `name` that give even more information!

In [None]:
# Parse an XML string and inspect the root element: its tag name, its
# attribute dictionary and the attribute names.
xmlstr = """<poll title="User Suggested Number of Players" totalvotes="0" name="suggested_numplayers">
<results numplayers="3+"> 
</results></poll>
"""
root = ET.fromstring(xmlstr)
root.tag
root.attrib
root.keys()

In [None]:
xmlstr = '''<root>
<level>
  <name>Matthias</name>
  <age>23</age>
  <gender>Male</gender>
</level>
<level>
  <name>Foo</name>
  <age>24</age>
  <gender>Male</gender>
</level>
<level>
  <name>Bar</name>
  <age>25</age>
  <gender>Male</gender>
</level>
</root>'''

root = ET.fromstring(xmlstr)

# Every direct <level> child of the root; print the name and age of each.
levels = root.findall('level')
for level in levels:
    name, age = (level.find(field).text for field in ('name', 'age'))
    print(name, age)

In [None]:
# List the names the ElementTree module exposes.
dir(ET)

###  `xml` from url

In [None]:
# Shell command: download the address held in the Python variable `url` to CP.xml.
!wget  $url  -O CP.xml

In [None]:
# Parse the downloaded file and take the root element of the tree.
tree = ET.parse('CP.xml')
root = tree.getroot()

In [None]:
# Tag name and attribute dictionary of the root element.
root.tag
root.attrib

In [None]:
# Iterating an element yields its direct children.
for child in root:
    print(child.tag, child.attrib)

In [None]:
# Children can also be reached by index; show the tag of the first child.
#dir(root[0])
root[0].tag