## LineUp Table Scraping

### Packages

In [1]:
import pandas as pd, numpy as np
import asyncio
import time, random
import datetime
from pyppeteer import launch
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup

### IP Pools

In [145]:
ips = []
with open("Scrape/ip_pool.txt", "r") as file:
    for line in file:
        ips.append(line.strip(" \n+"))

In [150]:
ips

['183.182.101.32:60564',
 '94.127.144.179:37690',
 '177.92.79.10:80',
 '168.197.115.152:50368',
 '89.34.202.96:30825',
 '103.221.254.2:38689',
 '36.89.180.35:44255',
 '185.118.49.11:3128',
 '89.34.202.96:30825',
 '189.204.241.44:56792']

### Reference Table

In [68]:
ref_table = pd.read_csv("Team_Played_Date.csv")
ref_table.head()

Unnamed: 0,Date,Game_Index,Team,GameYear
0,2018-10-16,1,BOS,18
1,2018-10-16,1,PHI,18
2,2018-10-16,2,GSW,18
3,2018-10-16,2,OKC,18
4,2018-10-17,1,CHO,18


### Traditional

In [113]:
url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"
season = "2007"
season_next = str(int(season[-2:]) + 1) if int(season[-2:]) + 1 >= 10 else '0' + str(int(season[-2:]) + 1)
teamid = "1610612757"
yr = "2007"
mt = "10"
dy = "30"
date_format = mt+"%2F"+dy+"%2F"+yr
customized_url = "https://stats.nba.com/lineups/advanced/?Season="+season+'-'+season_next+"&SeasonType=Regular%20Season&TeamID="+teamid+"&DateFrom="+date_format+"&DateTo="+date_format

In [5]:
customized_url

'https://stats.nba.com/lineups/advanced/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007'

### Selenium

In [131]:
def parse_table(table, team, date):
    stats = []
    for line in table.text.split("\n")[1:]:
        if line == "" or line == "LINEUPS":
            break
        else:
            lineup = [', '.join([i.strip(". +") for i in line.split("POR")[0].split(",")])]
            stats.append(lineup + line.split("POR")[1].strip(" +").split(" ") + [team] + [date])

    df_columns = ['lineup','GP','MIN','PTS','FGM','FGA','FG%','3PM',
                  '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB',
                  'AST','TOV','STL','BLK','BLKA','PF','PFD','+/-',
                  'TEAM','GAMEDAY']

    df = pd.DataFrame(stats,
                     columns = df_columns)
    return df



In [None]:
random.shuffle(ips)
PROXY = ips[0]
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=%s' %PROXY)
driver = webdriver.Chrome(executable_path = "/Users/garyliu/Documents/chromedriver",
                          options = options)
url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"
#driver.implicitly_wait(10)  # 隐性等待，最久等30秒
#WebDriverWait wait = new WebDriverWait(driver, 15, 1)
#driver.set_page_load_timeout(20)
driver.get(url)
wait.until(lambda driver: driver.find_element_by_class_name("nba-stat-table"))
table = driver.find_element_by_class_name("nba-stat-table")
driver.close()

In [138]:
#Initial
date = "2007-10-30"
team = "POR"
error_game = []
# Chrome Connection
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://43.229.26.24:36702')
driver = webdriver.Chrome(executable_path = "/Users/garyliu/Documents/chromedriver",
                           chrome_options = options)
for test in range(10):
    if test >= 9:
        error_game.append(date + "|" + team)
        print("Error: Time Out")
    try:
        url = "https://stats.nba.com/lineups/traditional/?Season=2007-08&SeasonType=Regular%20Season&TeamID=1610612757&DateFrom=10%2F30%2F2007&DateTo=10%2F30%2F2007"
        driver.implicitly_wait(30)  # 隐性等待，最久等30秒
        driver.get(url)
        table = driver.find_element_by_class_name("nba-stat-table")
        df = parse_table(table, team, date)
        
        break
    except:
        print("waiting for finding the element.")
    time.sleep(random.randint(1, 3))

  if __name__ == '__main__':


waiting for finding the element.
waiting for finding the element.


KeyboardInterrupt: 

### Pyppeteer (Give up)

[Reference1](https://ithelp.ithome.com.tw/articles/10225429)

[Reference2](https://www.kingname.info/2019/08/18/hide-webdriver-in-pyppeteer/)

[Network Error](https://github.com/miyakogi/pyppeteer/issues/171)

[Network Error with fork](https://github.com/Francesco149/pyppeteer)

#### Start Pyppeteer

In [19]:
# Team ID
id_dict = {}
with open("Team_id.txt", "r") as file:
    for line in file:
        id_dict[line.split("|")[0]] = line.split("|")[1].strip("\n")

# Game Reference Table
ref_table = pd.read_csv("Team_Played_Date.csv")
ref_table["Team_id"] = [id_dict[tm] for tm in ref_table["Team"]]

# Season
def getGameYear(date_col):
    gameYear = []
    for day in date_col:
        if int(day.split("-")[1]) >= 9:
            gameYear.append(day.split("-")[0][-2:])
        else:
            gameYear.append(str(int(day.split("-")[0][-2:]) - 1))
    return gameYear
ref_table["Season"] = getGameYear(ref_table["Date"])

# Regular Season or not
regularseason = []
for gameday in ref_table['Date']:
    d = datetime.date(int(gameday.split("-")[0]),
                      int(gameday.split("-")[1]),
                      int(gameday.split("-")[2]))
    if datetime.date(2006,10,31) <= d <= datetime.date(2007,4,18):
        regularseason.append(1)
    elif datetime.date(2007,10,30) <= d <= datetime.date(2008,4,16):
        regularseason.append(1)
    elif datetime.date(2008,10,28) <= d <= datetime.date(2009,4,16):
        regularseason.append(1)
    elif datetime.date(2009,10,27) <= d <= datetime.date(2010,4,14):
        regularseason.append(1)
    elif datetime.date(2010,10,26) <= d <= datetime.date(2011,4,13):
        regularseason.append(1)
    elif datetime.date(2011,12,25) <= d <= datetime.date(2012,4,26):
        regularseason.append(1)        
    elif datetime.date(2012,10,30) <= d <= datetime.date(2013,4,17):
        regularseason.append(1)    
    elif datetime.date(2013,10,29) <= d <= datetime.date(2014,4,16):
        regularseason.append(1)  
    elif datetime.date(2014,10,28) <= d <= datetime.date(2015,4,15):
        regularseason.append(1)
    elif datetime.date(2015,10,27) <= d <= datetime.date(2016,4,13):
        regularseason.append(1)   
    elif datetime.date(2016,10,25) <= d <= datetime.date(2017,4,12):
        regularseason.append(1)
    elif datetime.date(2017,10,17) <= d <= datetime.date(2018,4,11):
        regularseason.append(1)
    elif datetime.date(2018,10,16) <= d <= datetime.date(2019,4,10):
        regularseason.append(1)
    else:
        regularseason.append(0)
ref_table["RegularSeason"] = regularseason

In [None]:
browser = await launch(headless = False)
page = await browser.newPage()
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')

new_table = ref_table.iloc[3:10,:]
stats = []
errorGame = []
for ind in range(new_table.shape[0]):
    # Get url information
    spec_table = new_table.iloc[ind, :]
    season = "20" + spec_table["Season"]
    season_next = str(int(season[-2:]) + 1) if int(season[-2:]) + 1 >= 10 else '0' + str(int(season[-2:]) + 1)
    date_format = "%2F".join(spec_table["Date"].split("-"))
    teamid = spec_table["Team_id"]
    url = "https://stats.nba.com/lineups/traditional/?Season="+season+'-'+season_next+"&SeasonType=Regular%20Season&TeamID="+teamid+"&DateFrom="+date_format+"&DateTo="+date_format
    print(url)

    try:
        # Go to the page
        await page.goto(url)
        print("success")
        # Wait for the page setting down
#         while not await page.waitForSelector("body"):
#             await asyncio.sleep(1)
#             pass
        await page.waitForSelector("body", timeout = 10000)
        print("___")
        html_doc = await page.content()

        # Parse html
        soup = BeautifulSoup(html_doc, "lxml")
        # Extract statistics
        tb = soup.find_all("div", class_="nba-stat-table")[0].find("tbody").find_all("tr")

        for i in range(tb.__len__()):
            stats.append([ele.text.strip("(\n| |.)+") for ele in tb[i].find_all("td")] + [spec_table["Date"]])

        # Sleep
        await asyncio.sleep(random.randint(5,10))
    except:
        print("Error: " + spec_table["Date"] + "|" + spec_table["Team"])
        await asyncio.sleep(random.randint(5,10))
# DataFrame
df_columns = ['lineup','TEAM','GP','MIN','PTS','FGM','FGA','FG%','3PM',
                '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB',
                'AST','TOV','STL','BLK','BLKA','PF','PFD','+/-','GAMEDAY']
table_section = pd.DataFrame(stats)

https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612760&DateFrom=2018%2F10%2F16&DateTo=2018%2F10%2F16
success
___
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612766&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
success
___
Error: 2018-10-17|CHO
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612749&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|MIL
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612765&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|DET
https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612751&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17
Error: 2018-10-17|BRK


RuntimeError: This event loop is already running

ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.


### Advanced

In [5]:
browser = await launch(headless = False)
page = await browser.newPage()
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
await page.goto("https://stats.nba.com/lineups/traditional/?Season=2018-19&SeasonType=Regular%20Season&TeamID=1610612745&DateFrom=2018%2F10%2F17&DateTo=2018%2F10%2F17")
await page.waitForSelector("body", timeout=10000)

<pyppeteer.element_handle.ElementHandle at 0x7fc8c056d160>

In [3]:
existed_traditional_data = pd.read_csv("https://raw.githubusercontent.com/BaselineAnalytics/IntroductiontoDataScience/zye/data/lineup_traditional_exist_check.csv")

In [9]:
missing_table = existed_traditional_data[existed_traditional_data["exist_check"].isnull()]

In [11]:
missing_table.loc[:, ["Date_fmt", "Home_abb", "Season"]].head

<bound method NDFrame.head of          Date_fmt Home_abb   Season
4      2008-05-28      BOS  2007-08
5      2008-05-22      BOS  2007-08
6      2008-05-20      BOS  2007-08
11     2008-05-18      BOS  2007-08
13     2008-05-14      BOS  2007-08
14     2008-05-08      BOS  2007-08
28     2008-05-03      NOH  2007-08
29     2008-05-05      NOH  2007-08
30     2008-05-13      NOH  2007-08
31     2008-05-19      NOH  2007-08
32     2008-05-08      SAS  2007-08
34     2008-05-11      SAS  2007-08
40     2008-05-06      BOS  2007-08
48     2008-04-19      NOH  2007-08
56     2008-01-25      NOH  2007-08
66     2008-01-28      NOH  2007-08
74     2008-01-30      NOH  2007-08
82     2008-04-22      NOH  2007-08
84     2008-04-29      NOH  2007-08
109    2007-12-31      NOH  2007-08
125    2007-12-07      NOH  2007-08
126    2007-12-09      NOH  2007-08
139    2008-04-28      DEN  2007-08
142    2007-11-26      NOH  2007-08
147    2008-04-26      DEN  2007-08
150    2008-01-18      NOH  2007-0