In [144]:
import requests
import os
import shutil
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def tocsv(table, append=True, path='outputs2.csv'):
    """Write the ``<td>`` cell text of every ``<tr>`` in ``table`` to a CSV file.

    Parameters
    ----------
    table : BeautifulSoup tag (or any object exposing ``find_all``)
        Container whose ``<tr>`` descendants are turned into CSV rows. Each
        row holds the ``.text`` of that row's ``<td>`` cells; a row without
        ``<td>`` cells contributes an empty row.
    append : bool, default True
        When truthy, rows are appended to ``path`` and no header line is
        written. When falsy, ``path`` is overwritten and the (positional)
        column header is written first.
    path : str, default 'outputs2.csv'
        Destination file. The default keeps the file name this notebook has
        always used, so existing calls behave as before.

    Returns
    -------
    None
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    rows = [
        [cell.text for cell in table_row.find_all('td')]
        for table_row in table.find_all('tr')
    ]
    frame = pd.DataFrame(rows)
    # Test truthiness rather than `== True`, which silently took the
    # overwrite branch for truthy values that do not compare equal to True.
    if append:
        frame.to_csv(path, mode='a', header=False, index=False)
    else:
        frame.to_csv(path, index=False)

def dftocsv(df, append=True, path='outputs2.csv'):
    """Write ``df`` to a CSV file, appending to it by default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write. Its index is never written.
    append : bool, default True
        When truthy, rows are appended to ``path`` without a header line.
        When falsy, ``path`` is overwritten and the column header is written.
    path : str, default 'outputs2.csv'
        Destination file. The default keeps the file name this notebook has
        always used, so existing calls behave as before.

    Returns
    -------
    None
    """
    # Test truthiness rather than `== True`, which silently took the
    # overwrite branch for truthy values that do not compare equal to True.
    if append:
        df.to_csv(path, mode='a', header=False, index=False)
    else:
        df.to_csv(path, index=False)

def delete_empty_rows(file_path, new_file_path):
    """Copy a CSV file, leaving out every row whose fields are all missing.

    Parameters
    ----------
    file_path : str
        CSV file to read. Its first line is used as the header.
    new_file_path : str
        Destination CSV. Written with the header and without the index.

    Returns
    -------
    None
    """
    (
        pd.read_csv(file_path, skip_blank_lines=True)
        # how="all": a row is removed only when every column is missing.
        .dropna(how="all")
        .to_csv(new_file_path, header=True, index=False)
    )
    
def renamedupes(df):
    """Make the column labels of ``df`` unique, modifying ``df`` in place.

    The first occurrence of a repeated label is left unchanged; later
    occurrences get a numeric suffix in order of appearance, so
    ``PTS, PTS, PTS`` becomes ``PTS, PTS.1, PTS.2``. Labels that occur once
    are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is reassigned.

    Returns
    -------
    None
    """
    names = pd.Series(df.columns)
    repeated = names[names.duplicated()].unique()

    for label in repeated:
        # Positions currently holding this label, in column order.
        positions = names[names == label].index
        for occurrence, position in enumerate(positions):
            if occurrence:
                names.loc[position] = label + '.' + str(occurrence)

    # Install the de-duplicated labels on the caller's frame.
    df.columns = names
    
def htmltodf(html,year,month):
    """Save a fetched page under ``./years/`` and return its teams and scores.

    Parameters
    ----------
    html : response-like object
        Must expose the page markup as a ``text`` attribute (the notebook
        passes the result of ``requests.get``).
    year, month
        Used only to build the saved file name ``./years/{year}{month}.html``.

    Returns
    -------
    pandas.DataFrame
        A copy of the columns 'Visitor/Neutral', 'PTS', 'Home/Neutral' and
        'PTS.1'. 'PTS.1' is the second 'PTS' column after ``renamedupes``
        has made the header labels unique. Each ``<tr>`` after the first
        becomes one row, built from the text of its ``<td>``/``<th>`` cells.

    Raises
    ------
    IndexError
        If the page contains no ``<tr>`` element.
    KeyError
        If any of the four expected columns is missing from the header row.
    """
    markup = html.text

    # Create the directory on demand; without it, open() fails with
    # FileNotFoundError when ./years/ does not exist yet.
    os.makedirs("./years", exist_ok=True)
    with open("./years/{}{}.html".format(year,month), "w", encoding="utf8") as f:
        f.write(markup)

    # Parse the markup already in memory instead of re-reading the file that
    # was just written.
    soup = BeautifulSoup(markup, 'html5lib')

    # Collect the rows once: the first supplies the headers, the rest the data.
    rows = soup.find_all('tr')
    column_headers = [th.get_text() for th in rows[0].find_all('th')]
    game_data = [
        [cell.get_text() for cell in row.find_all(['td', 'th'])]
        for row in rows[1:]
    ]

    df = pd.DataFrame(game_data, columns=column_headers)
    renamedupes(df)
    return df[['Visitor/Neutral','PTS','Home/Neutral','PTS.1']].copy()

# Years substituted into the URL template: 1977 through 2019 inclusive.
years = [*range(1977, 2020)]
# Month names substituted into the URL template, in the order they are fetched.
months = "october november december january february march april may june".split()
# Placeholders are filled, in order, with a year and a month name.
url_start = "https://www.basketball-reference.com/leagues/NBA_{}_games-{}.html"
 


In [145]:
# Fetch every year/month page, append its games to outputs2.csv, then write a
# copy without the all-empty rows to outputs3.csv.
#
# The header decision is keyed to the first page that is actually written,
# not to the first (year, month) pair: if that first pair returned 404, no
# header would be written and rows would be appended to whatever outputs2.csv
# already existed from an earlier run.
header_written = False
for year in years:
    for month in months:
        url = url_start.format(year,month)
        data = requests.get(url)
        if data.status_code == 404:
            # A 404 is treated as "no page for this year/month" and skipped.
            continue
        # Fail loudly on any other HTTP error (rate limiting, server errors)
        # instead of handing an error page to the HTML parser.
        data.raise_for_status()
        df2 = htmltodf(data,year,month)
        # First successful page: overwrite the file and write the header.
        # Every later page: append without a header.
        dftocsv(df2, header_written)
        header_written = True
delete_empty_rows("outputs2.csv","outputs3.csv")

In [146]:
# Load the first four columns of the cleaned game log and give them short names.
df = pd.read_csv(
    "outputs3.csv", sep=",", usecols=[0, 1, 2, 3], header=0
).rename(
    columns={
        'Visitor/Neutral': "Visitor",
        'PTS': "VPTS",
        'Home/Neutral': "Home",
        'PTS.1': "HPTS",
    }
)

# conditions[0]: the visitor outscored the home team; conditions[1]: the reverse.
conditions = [df["VPTS"].gt(df["HPTS"]), df["HPTS"].gt(df["VPTS"])]

# Rows where neither condition holds get the placeholder 'Other' in both columns.
df["Winner"] = np.select(conditions, [df["Visitor"], df["Home"]], 'Other')
df["Loser"] = np.select(conditions, [df["Home"], df["Visitor"]], 'Other')
df.head()

Unnamed: 0,Visitor,VPTS,Home,HPTS,Winner,Loser
0,Boston Celtics,129.0,Indiana Pacers,122.0,Boston Celtics,Indiana Pacers
1,Buffalo Braves,133.0,Milwaukee Bucks,112.0,Buffalo Braves,Milwaukee Bucks
2,Los Angeles Lakers,97.0,New York Knicks,102.0,New York Knicks,Los Angeles Lakers
3,Houston Rockets,120.0,Atlanta Hawks,104.0,Houston Rockets,Atlanta Hawks
4,Chicago Bulls,95.0,Cleveland Cavaliers,106.0,Cleveland Cavaliers,Chicago Bulls


In [147]:
# Walk the games in file order and track which team holds the "belt".
# The belt starts with the Boston Celtics and passes to the winner of any game
# the current holder loses.
Beltholder = "Boston Celtics"
# The streak starts at 1 and is reset to 1 whenever the belt changes hands,
# so the first successful defence is recorded as 2.
currentstreak = 1
results=[]
streak = []
for row in df.itertuples(index=False):
    if Beltholder in (row.Winner, row.Loser):
        if row.Loser == Beltholder:
            # The holder lost: the belt moves to the winner.
            Beltholder = row.Winner
            currentstreak = 1
        else:
            # The holder won: the streak grows by one.
            currentstreak += 1
        streak.append(currentstreak)
    else:
        # The holder did not play in this game: no streak value for this row.
        streak.append("")
    # Record the holder as of the end of this game.
    results.append(Beltholder)
df["Belt Holder"] = results
df["Defense Streak"] = streak
df.head()

Unnamed: 0,Visitor,VPTS,Home,HPTS,Winner,Loser,Belt Holder,Defense Streak
0,Boston Celtics,129.0,Indiana Pacers,122.0,Boston Celtics,Indiana Pacers,Boston Celtics,2.0
1,Buffalo Braves,133.0,Milwaukee Bucks,112.0,Buffalo Braves,Milwaukee Bucks,Boston Celtics,
2,Los Angeles Lakers,97.0,New York Knicks,102.0,New York Knicks,Los Angeles Lakers,Boston Celtics,
3,Houston Rockets,120.0,Atlanta Hawks,104.0,Houston Rockets,Atlanta Hawks,Boston Celtics,
4,Chicago Bulls,95.0,Cleveland Cavaliers,106.0,Cleveland Cavaliers,Chicago Bulls,Boston Celtics,


In [148]:
# Save df as it stands at this point to records.csv, without the index column.
df.to_csv("records.csv",index=False)