## LineUp Table Scraping

### Packages

In [1]:
import pandas as pd, numpy as np
import asyncio
import time, random
from pyppeteer import launch
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup

### IP Pools

In [145]:
# Load the proxy pool: the file holds one "host:port" entry per line.
with open("Scrape/ip_pool.txt", "r") as file:
    # Trim surrounding spaces, newlines and '+' characters, then drop entries
    # that end up empty so a blank line never becomes an empty proxy address.
    ips = [entry for entry in (line.strip(" \n+") for line in file) if entry]

In [150]:
# Display the loaded proxy list.
ips

['183.182.101.32:60564',
 '94.127.144.179:37690',
 '177.92.79.10:80',
 '168.197.115.152:50368',
 '89.34.202.96:30825',
 '103.221.254.2:38689',
 '36.89.180.35:44255',
 '185.118.49.11:3128',
 '89.34.202.96:30825',
 '189.204.241.44:56792']

### Reference Table

In [68]:
# Read the table of game dates per team and preview its first five rows.
ref_table = pd.read_csv("Team_Played_Date.csv")
ref_table.head(n=5)

Unnamed: 0,Date,Game_Index,Team,GameYear
0,2018-10-16,1,BOS,18
1,2018-10-16,1,PHI,18
2,2018-10-16,2,GSW,18
3,2018-10-16,2,OKC,18
4,2018-10-17,1,CHO,18


### Traditional

In [113]:
def next_season_suffix(season):
    """Return the two-digit suffix of the year that follows ``season``.

    The lineup URLs label a season as ``<start year>-<last two digits of the
    next year>`` (e.g. ``2007-08``).  The modulo keeps the suffix at two
    digits across a century rollover, so ``"1999"`` yields ``"00"`` rather
    than ``"100"``.

    Parameters
    ----------
    season : str
        Season start year; only its last two characters are used.

    Returns
    -------
    str
        Zero-padded two-digit suffix.
    """
    return "{:02d}".format((int(season[-2:]) + 1) % 100)


# Reference URL, kept for comparison with the one assembled below.
url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"

season = "2007"
season_next = next_season_suffix(season)
teamid = "1610612757"
yr = "2007"
mt = "10"
dy = "30"
# Same layout as the reference URL: MM/DD/YYYY with the slashes written as %2F.
date_format = "%2F".join([mt, dy, yr])
# NOTE(review): this builds the "advanced" page while the reference URL above
# (and this section's heading) point at "traditional" -- confirm which is intended.
customized_url = (
    "https://stats.nba.com/lineups/advanced/?Season=" + season + '-' + season_next
    + "&SeasonType=Regular%20Season&TeamID=" + teamid
    + "&DateFrom=" + date_format + "&DateTo=" + date_format
)

In [5]:
# Show the assembled URL.
customized_url

'https://stats.nba.com/lineups/advanced/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007'

### Selenium

In [131]:
def parse_table(table, team, date):
    """Convert the text of a scraped lineup table into a DataFrame.

    The first line of ``table.text`` is skipped, and reading stops at the
    first empty line or at a line equal to ``"LINEUPS"``.  Every other line
    is split at the team abbreviation: the part before it holds the
    comma-separated player names, the part after it the space-separated
    statistics.

    Parameters
    ----------
    table : object
        Anything exposing a ``text`` attribute (e.g. a Selenium element).
    team : str
        Team abbreviation; used as the separator inside each row and stored
        in the ``TEAM`` column.
    date : str
        Value stored in the ``GAMEDAY`` column.

    Returns
    -------
    pandas.DataFrame
        One row per lineup, with the columns listed in ``df_columns``.

    Raises
    ------
    ValueError
        If a data row does not contain ``team``.
    """
    df_columns = ['lineup','GP','MIN','PTS','FGM','FGA','FG%','3PM',
                  '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB',
                  'AST','TOV','STL','BLK','BLKA','PF','PFD','+/-',
                  'TEAM','GAMEDAY']

    stats = []
    for line in table.text.split("\n")[1:]:
        if line == "" or line == "LINEUPS":
            break
        # Split on the requested team rather than a fixed abbreviation, so the
        # parser works for every team and not only for "POR".
        names, separator, numbers = line.partition(team)
        if not separator:
            raise ValueError("team %r not found in table row: %r" % (team, line))
        lineup = ', '.join(name.strip(". +") for name in names.split(","))
        stats.append([lineup] + numbers.strip(" +").split(" ") + [team, date])

    return pd.DataFrame(stats, columns=df_columns)



In [None]:
url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"

# Route the browser through one randomly chosen proxy.  random.choice leaves
# `ips` untouched, so re-running the cell does not reorder the pool.
PROXY = random.choice(ips)
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=%s' % PROXY)
driver = webdriver.Chrome(executable_path = "/Users/garyliu/Documents/chromedriver",
                          options = options)
try:
    driver.get(url)
    # Explicit wait: poll once per second, for at most 15 seconds, until the
    # stats table can be located.  `until` returns the located element.
    wait = WebDriverWait(driver, 15, 1)
    table = wait.until(lambda driver: driver.find_element_by_class_name("nba-stat-table"))
finally:
    # Shut the browser down even when loading or waiting fails.
    # NOTE(review): `table` is a handle into the live browser; reading it after
    # this point will likely fail -- parse it before quitting if it is needed.
    driver.quit()

In [138]:
# Initial settings
date = "2007-10-30"
team = "POR"
error_game = []
MAX_ATTEMPTS = 10
url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"

# Chrome connection.  `options=` matches the other Selenium cell;
# `chrome_options=` is the deprecated spelling of the same argument.
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://43.229.26.24:36702')
driver = webdriver.Chrome(executable_path = "/Users/garyliu/Documents/chromedriver",
                          options = options)
try:
    # Implicit wait: element lookups keep retrying for at most 30 seconds.
    driver.implicitly_wait(30)
    for attempt in range(MAX_ATTEMPTS):
        try:
            driver.get(url)
            table = driver.find_element_by_class_name("nba-stat-table")
            df = parse_table(table, team, date)
            break
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt still stops the cell.
            print("waiting for finding the element.")
        time.sleep(random.randint(1, 3))
    else:
        # Reached only when every attempt failed, so a success on the final
        # attempt is no longer recorded as an error.
        error_game.append(date + "|" + team)
        print("Error: Time Out")
finally:
    # Release the browser whether the scrape succeeded, failed or was interrupted.
    driver.quit()

  if __name__ == '__main__':


waiting for finding the element.
waiting for finding the element.


KeyboardInterrupt: 

### Pyppeteer (Gave Up)

[Reference1](https://ithelp.ithome.com.tw/articles/10225429)
[Reference2](https://www.kingname.info/2019/08/18/hide-webdriver-in-pyppeteer/)

In [258]:
browser = await launch(headless = False)
page = await browser.newPage()
await page.setUserAgent(
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
await page.goto(url)
#await asyncio.sleep(5)
#await page.evaluate('window.scrollBy(0, window.innerHeight)')
#自行加入while loop測試網頁是否有跑出來
# while not await page.waitForFunction('document.getElementByTag("th")'):
#     await asyncio.sleep(1)
#     pass

while not await page.querySelector("body > main > div.stats-container__inner > div > div.row > div > div > nba-stat-table > div.nba-stat-table > div.nba-stat-table__overflow > table > tbody > tr:nth-child(1) > td.lineup"):
    await asyncio.sleep(1)
    pass
html_doc = await page.content()
soup = BeautifulSoup(html_doc, "lxml")
# Extract statistics
tb = soup.find_all("div", class_="nba-stat-table")[0].find("tbody").find_all("tr")
stats = []
for i in range(tb.__len__()):
    stats.append([ele.text.strip("(\n| |.)+") for ele in tb[0].find_all("td")] + ["2007-10-30"])
# DataFrame
df_columns = ['lineup','TEAM','GP','MIN','PTS','FGM','FGA','FG%','3PM',
                  '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB',
                  'AST','TOV','STL','BLK','BLKA','PF','PFD','+/-','GAMEDAY']
pd.DataFrame(stats)

#await browser.close()
#tb = await page.querySelector("body > main > div.stats-container__inner > div > div.row > div > div > nba-stat-table > div.nba-stat-table")
#content = await page.evaluate('(element) => element.textContent', tb)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
1,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
2,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
3,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
4,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
5,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
6,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
7,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
8,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30
9,"J. Holiday, .N. Mirotic, .E. Moore, .A. Davis,...",NOP,1,19,143.6,105.3,38.3,77.3,5.67,27.0,41.2,63.6,53.8,0.1,64.1,67.2,95.59,69.5,2007-10-30


#### Start Pyppeteer

In [2]:
# Team ID: each line of the file is split on "|"; the first field becomes the
# key and the second field (minus its trailing newline) the value.
id_dict = {}
with open("Team_id.txt", "r") as file:
    for line in file:
        fields = line.split("|")
        id_dict[fields[0]] = fields[1].strip("\n")

# Game Reference Table, with every team looked up in id_dict.
ref_table = pd.read_csv("Team_Played_Date.csv")
ref_table["Team_id"] = [id_dict[abbreviation] for abbreviation in ref_table["Team"]]

# Season
def getGameYear(date_col):
    """Return the two-digit season start year for each game date.

    A date whose month is September or later belongs to the season starting
    in that calendar year; an earlier month belongs to the season that
    started the previous year.  The result is always zero-padded to two
    digits (``"2009-01-15"`` -> ``"08"``) and wraps at the century
    (``"2000-01-15"`` -> ``"99"``).

    Parameters
    ----------
    date_col : iterable of str
        Dates written as ``YYYY-MM-DD``.

    Returns
    -------
    list of str
        One two-character year per input date, in the same order.
    """
    gameYear = []
    for day in date_col:
        year, month = day.split("-")[:2]
        start_year = int(year) if int(month) >= 9 else int(year) - 1
        # Modulo plus zero-padding keeps exactly two digits for every year.
        gameYear.append("{:02d}".format(start_year % 100))
    return gameYear
# Store the list returned by getGameYear for the Date column as "Season".
ref_table["Season"] = getGameYear(ref_table["Date"])

In [None]:
browser = await launch(headless = False)
page = await browser.newPage()
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')

new_table = ref_table.iloc[3:10,:]
stats = []
errorGame = []
for ind in range(new_table.shape[0]):
    # Get url information
    spec_table = new_table.iloc[ind, :]
    season = "20" + spec_table["Season"]
    season_next = str(int(season[-2:]) + 1) if int(season[-2:]) + 1 >= 10 else '0' + str(int(season[-2:]) + 1)
    date_format = "%2F".join(spec_table["Date"].split("-"))
    teamid = spec_table["Team_id"]
    url = "https://stats.nba.com/lineups/traditional/?Season="+season+'-'+season_next+"&SeasonType=Regular%20Season&TeamID="+teamid+"&DateFrom="+date_format+"&DateTo="+date_format
    print(url)

    try:
        # Go to the page
        await page.goto(url)
        print("success")
        # Wait for the page setting down
#         while not await page.waitForSelector("body"):
#             await asyncio.sleep(1)
#             pass
        await page.waitForSelector("body", timeout = 10000)
        print("___")
        html_doc = await page.content()

        # Parse html
        soup = BeautifulSoup(html_doc, "lxml")
        # Extract statistics
        tb = soup.find_all("div", class_="nba-stat-table")[0].find("tbody").find_all("tr")

        for i in range(tb.__len__()):
            stats.append([ele.text.strip("(\n| |.)+") for ele in tb[i].find_all("td")] + [spec_table["Date"]])

        # Sleep
        await asyncio.sleep(random.randint(5,10))
    except:
        print("Error: " + spec_table["Date"] + "|" + spec_table["Team"])
        await asyncio.sleep(random.randint(5,10))
# DataFrame
df_columns = ['lineup','TEAM','GP','MIN','PTS','FGM','FGA','FG%','3PM',
                '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB',
                'AST','TOV','STL','BLK','BLKA','PF','PFD','+/-','GAMEDAY']
table_section = pd.DataFrame(stats)

https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612760&DateFrom=2018%2F10%2F16&DateTo=2018%2F10%2F16
success
___
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612766&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
success
___
Error: 2018-10-17|CHO
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612749&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|MIL
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612765&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|DET
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612751&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|BRK


RuntimeError: This event loop is already running

ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.


### Advanced

In [5]:
browser = await launch(headless = False)
page = await browser.newPage()
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
await page.goto("https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612745&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17")
await page.waitForSelector("body", timeout=10000)

<pyppeteer.element_handle.ElementHandle at 0x7fc8c056d160>

In [None]:
page.goto("https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612745&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17")