# PGA Tour Web Scraping

In [81]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
import os
import lxml.html as lh
import requests as req
import urllib
import json
from itertools import chain
import sqlite3 as db
import time

We will scrape data from the PGA Tour website using their api. The api url can be found by inspecting the tables we want (from https://www.pgatour.com/players/player.28237.rory-mcilroy.html for Rory McIlroy) and using the Network tab to view from where the website collects it's data.

In [2]:
url = 'https://statdata-api-prod.pgatour.com/api/clientfile/YtdPlayerStatsArchive?P_ID=28237&YEAR=2021&format=json&userTrackingId=exp=1628280848~acl=*~hmac=1da33c725b3fc5716b3db4f8fb34a5176c139f432870155ce0106b284cf1dbf4'

In [3]:
api_id = {'McIlroy' : '28237'}
player_id = api_id['McIlroy']
year = ''
hmac = ''
tracking_id = ''
url_format = 'https://statdata-api-prod.pgatour.com/api/clientfile/YtdPlayerStatsArchive?P_ID=' + player_id + '&YEAR=' + year + '&format=json&userTrackingId=exp=' + tracking_id + '~acl=*~hmac=' + hmac

The player_id and year are easy to choose. But the hmac and tracking_id change every time I open the webpage. I need to find a way to automate the retrieval of the hmac and tracking_id for each webpage.

Note: this url has to be updated frequently. Potentially every time you run this file you may need to update the url

In [4]:
request = req.get(url)
j = request.json()

In [5]:
recap_data = j.get('plrs')[0]#.get('years')[0].get('tours')[0].get('statCats')[0].get('stats')

In [6]:
off_the_tee_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[1].get('stats')

In [7]:
app_the_green_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[2].get('stats')

In [8]:
ar_the_green_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[3].get('stats')

In [9]:
putting_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[4].get('stats')

In [10]:
scoring_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[5].get('stats')

In [11]:
streaks_data = j.get('plrs')[0].get('years')[0].get('tours')[0].get('statCats')[6].get('stats')

In [12]:
recap = pd.DataFrame(recap_data)

In [13]:
recap['years']

0    {'year': '2021', 'disclaimer': 'Player did not...
Name: years, dtype: object

We nearly have everything we need, our problem is in the additionals column. Notice how our stats query is a list of dictionaries with embedded dictionaries for the 'additionals' key. The other columns converted to the data frame nicely because they did not have embedded dictionaries within them. 

In [14]:
#recap_additionals = pd.DataFrame(list(chain.from_iterable(recap['additionals'])))

Now we have a separate data frame for the additional statistics column from our recap data frame. Now we can do this for all of our categories.

In [15]:
off_the_tee = pd.DataFrame(off_the_tee_data)

In [16]:
off_the_tee

Unnamed: 0,statID,name,value,rank,projRank,additionals
0,2674,SG: Tee-to-Green,1.293,11,,"[{'title': 'SG:OTT', 'value': '.636'}, {'title..."
1,2567,SG: Off-the-Tee,.636,8,,"[{'title': 'Total SG:OTT', 'value': '29.912'},..."
2,101,Driving Distance,317.7,2,,"[{'title': 'Total Distance', 'value': '36,850'..."
3,317,Driving Distance - All Drives,308.4,2,,"[{'title': 'Total Distance', 'value': '201,705..."
4,159,Longest Drives,379,83,,"[{'title': 'Tournament', 'value': 'the Memoria..."
5,102,Driving Accuracy Percentage,55.32%,177,,"[{'title': 'Fairways Hit', 'value': '447'}, {'..."
6,2420,Distance from Edge of Fairway,30' 10,167,,"[{'title': 'Total Distance (Feet)', 'value': '..."
7,459,Left Rough Tendency,14.81%,163,,"[{'title': 'Total Left rough', 'value': '96'},..."
8,460,Right Rough Tendency,17.28%,170,,"[{'title': 'Total Right rough', 'value': '112'..."
9,129,Total Driving,179,55,,"[{'title': 'Distance Rank', 'value': '2'}, {'t..."


In [17]:
off_the_tee_additionals = pd.DataFrame(list(chain.from_iterable(off_the_tee['additionals'])))

In [18]:
app_the_green = pd.DataFrame(app_the_green_data)

In [19]:
ar_the_green = pd.DataFrame(ar_the_green_data)

In [20]:
putting = pd.DataFrame(putting_data)

In [21]:
scoring = pd.DataFrame(scoring_data)

In [22]:
streaks = pd.DataFrame(streaks_data)

In [23]:
app_the_green_additionals = pd.DataFrame(list(chain.from_iterable(app_the_green['additionals'])))

In [24]:
ar_the_green_additionals = pd.DataFrame(list(chain.from_iterable(ar_the_green['additionals'])))

In [25]:
putting_additionals = pd.DataFrame(list(chain.from_iterable(putting['additionals'])))

In [26]:
scoring_additionals = pd.DataFrame(list(chain.from_iterable(scoring['additionals'])))

In [27]:
streaks_additionals = pd.DataFrame(list(chain.from_iterable(streaks['additionals'])))

In future iterations I will make a function that automates this. But for now this will suffice.

Let's now concatenate the dataframes, drop the additionals column from the primary data, and drop the projRank column. Let's also remove duplicate rows from both.

In [28]:
McIlroy_data = pd.concat([recap, off_the_tee, app_the_green, ar_the_green, putting, scoring, streaks], axis = 0)
McIlroy_data = McIlroy_data.drop(['projRank', 'additionals'], axis = 1)
McIlroy_data = McIlroy_data.drop_duplicates('statID')

In [29]:
McIlroy_additionals = pd.concat([off_the_tee_additionals, #recap_additionals
                                 app_the_green_additionals, ar_the_green_additionals,
                                 putting_additionals, scoring_additionals,
                                 streaks_additionals], axis = 0)
McIlroy_additionals = McIlroy_additionals.drop_duplicates('title')

In [30]:
McIlroy_data_transposed = McIlroy_data.transpose()
#McIlroy_data_transposed.rename(columns = McIlroy_data_transposed[2,:])
McIlroy_data_transposed.columns = McIlroy_data_transposed.iloc[1, :]
McIlroy_data_transposed.head()

plrName,Rory McIlroy,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,...,NaN.9,NaN.10,NaN.11,NaN.12,NaN.13,NaN.14,NaN.15,NaN.16,NaN.17,NaN.18
plrNum,28237,,,,,,,,,,...,,,,,,,,,,
plrName,Rory McIlroy,,,,,,,,,,...,,,,,,,,,,
years,"{'year': '2021', 'disclaimer': 'Player did not...",,,,,,,,,,...,,,,,,,,,,
statID,,02674,02567,101,317,159,102,02420,459,460,...,295,294,483,296,297,298,452,449,02672,02673
name,,SG: Tee-to-Green,SG: Off-the-Tee,Driving Distance,Driving Distance - All Drives,Longest Drives,Driving Accuracy Percentage,Distance from Edge of Fairway,Left Rough Tendency,Right Rough Tendency,...,Best YTD 1-Putt or Better Streak,Best YTD Streak w/o a 3-Putt,Current Streak without a 3-Putt,Consecutive Sand Saves,Consecutive Fairways Hit,Consecutive GIR,Consecutive Holes Below Par,Consecutive Par 3 Birdies,Consecutive Birdies Streak,Consecutive Birdies/Eagles streak


In [31]:
McIlroy_additionals_transposed = McIlroy_additionals.transpose()
McIlroy_additionals_transposed.columns = McIlroy_additionals_transposed.iloc[0,:]
McIlroy_additionals_transposed.head()


title,SG:OTT,Measured Rounds,Total SG:OTT,Total Distance,Total Drives,# of Drives,Tournament,Round,Fairways Hit,Possible Fairways,...,Total Late Rounds,Par 3 Birdies or Better,Par 3 Holes,Par 4 Birdies or Better,Par 4 Holes,Par 5 Birdies or Better,Par 5 Holes,Total Birdies,Start Tournament ID,Current Streak
title,SG:OTT,Measured Rounds,Total SG:OTT,Total Distance,Total Drives,# of Drives,Tournament,Round,Fairways Hit,Possible Fairways,...,Total Late Rounds,Par 3 Birdies or Better,Par 3 Holes,Par 4 Birdies or Better,Par 4 Holes,Par 5 Birdies or Better,Par 5 Holes,Total Birdies,Start Tournament ID,Current Streak
value,.636,47,29.912,36850,116,654,the Memorial,3,447,808,...,31,33,236,111,602,110,206,254,2021340,37


In [32]:
McIlroy_data.loc[McIlroy_data['statID'] == '103']

Unnamed: 0,plrNum,plrName,years,statID,name,value,rank
1,,,,103,Greens in Regulation Percentage,65.42%,110


In [33]:
stat_ids = McIlroy_data['statID'].values
stat_ids

array([nan, '02674', '02567', '101', '317', '159', '102', '02420', '459',
       '460', '129', '02401', '02412', '02568', '103', '419', '486',
       '02357', '331', '437', '432', '433', '431', '02361', '02360',
       '02359', '02358', '076', '075', '074', '340', '339', '338', '337',
       '336', '02374', '02373', '02372', '02371', '02370', '02369',
       '02368', '02367', '02366', '02365', '02364', '02363', '02362',
       '350', '351', '02569', '111', '375', '130', '364', '363', '366',
       '367', '368', '369', '02564', '02428', '104', '402', '413', '426',
       '119', '393', '394', '395', '396', '341', '342', '343', '344',
       '345', '346', '347', '348', '485', '484', '405', '406', '407',
       '408', '498', '349', '438', '115', '02675', '299', '120', '108',
       '156', '107', '106', '155', '105', '148', '149', '117', '285',
       '118', '207', '208', '292', '293', '142', '143', '144', '219',
       '112', '113', '114', '352', '160', '122', '482', '295', '294',
       '

In [34]:
stat_names = McIlroy_data['name'].values

In [35]:
stats = {stat_ids[i] : stat_names[i] for i in range(1, len(stat_ids))}

In [58]:
stats

{'02674': 'SG: Tee-to-Green',
 '02567': 'SG: Off-the-Tee',
 '101': 'Driving Distance',
 '317': 'Driving Distance - All Drives',
 '159': 'Longest Drives',
 '102': 'Driving Accuracy Percentage',
 '02420': 'Distance from Edge of Fairway',
 '459': 'Left Rough Tendency',
 '460': 'Right Rough Tendency',
 '129': 'Total Driving',
 '02401': 'Club Head Speed',
 '02412': 'Total Driving Efficiency',
 '02568': 'SG: Approach the Green',
 '103': 'Greens in Regulation Percentage',
 '419': 'Going for the Green',
 '486': 'Going for the Green - Hit Green Pct.',
 '02357': 'Going for the Green - Birdie or Better',
 '331': 'Proximity to Hole',
 '437': 'Rough Proximity',
 '432': 'Left Rough Proximity',
 '433': 'Right Rough Proximity',
 '431': 'Fairway Proximity',
 '02361': 'Approaches from &gt; 275 yards',
 '02360': 'Approaches from 250-275 yards',
 '02359': 'Approaches from 225-250 yards',
 '02358': 'Approaches from 200-225 yards',
 '076': 'Approaches from 50-75 yards',
 '075': 'Approaches from 75-100 yards

In [36]:
data = {'stat_ids' : stat_ids[1:], 'stat_names': stat_names[1:]}
stat_ids_df = pd.DataFrame.from_dict(data)
conn = db.connect('stat_ids.db')
stat_ids_df.to_sql('stat_ids', conn, if_exists = 'replace')
conn.close()

Now we can use this dictionary to construct dataframes for each statistic.

In [37]:
def get_links(stats, year):
    """
    Create a list of links for each statistic in a specified year.
    """
    base_url = 'https://www.pgatour.com/content/pgatour/stats/stat.'
    urls = [base_url + str(id) + '.y' + str(year) + '.html' for id in stats]
    return urls

Let's just use one of these urls for now to see what they look like under the hood

In [38]:
conn = db.connect('stat_ids.db')

c = conn.cursor()

query = conn.execute("SELECT * From stat_ids")
cols = [column[0] for column in query.description]
stat_ids = pd.DataFrame.from_records(data = query.fetchall(), columns = cols)

conn.close()

In [39]:
stat_ids.head()

Unnamed: 0,index,stat_ids,stat_names
0,0,2674,SG: Tee-to-Green
1,1,2567,SG: Off-the-Tee
2,2,101,Driving Distance
3,3,317,Driving Distance - All Drives
4,4,159,Longest Drives


In [59]:
stat_nums = stat_ids['stat_ids']
stat_names = stat_ids['stat_names']
stat_dict = dict(zip(stat_nums, stat_names))
stat_dict

{'02674': 'SG: Tee-to-Green',
 '02567': 'SG: Off-the-Tee',
 '101': 'Driving Distance',
 '317': 'Driving Distance - All Drives',
 '159': 'Longest Drives',
 '102': 'Driving Accuracy Percentage',
 '02420': 'Distance from Edge of Fairway',
 '459': 'Left Rough Tendency',
 '460': 'Right Rough Tendency',
 '129': 'Total Driving',
 '02401': 'Club Head Speed',
 '02412': 'Total Driving Efficiency',
 '02568': 'SG: Approach the Green',
 '103': 'Greens in Regulation Percentage',
 '419': 'Going for the Green',
 '486': 'Going for the Green - Hit Green Pct.',
 '02357': 'Going for the Green - Birdie or Better',
 '331': 'Proximity to Hole',
 '437': 'Rough Proximity',
 '432': 'Left Rough Proximity',
 '433': 'Right Rough Proximity',
 '431': 'Fairway Proximity',
 '02361': 'Approaches from &gt; 275 yards',
 '02360': 'Approaches from 250-275 yards',
 '02359': 'Approaches from 225-250 yards',
 '02358': 'Approaches from 200-225 yards',
 '076': 'Approaches from 50-75 yards',
 '075': 'Approaches from 75-100 yards

I included this extra dictionary as practice on how to create dictionaries from a database/dataframe. It may seem redundant but I wanted the extra practice.

In [41]:
def get_links(stat_ids, year):
    """
    Create a list of links for each statistic in a given year.
    """
    base_url = 'https://www.pgatour.com/content/pgatour/stats/stat.'
    urls = [base_url + key + '.y' + str(year) + '.html' for key in stat_dict.keys()]
    return urls

In [42]:
def get_link(stat_id, year):
    """
    Create one specific link for a given statistic in a given year
    """
    url = 'https://www.pgatour.com/content/pgatour/stats/stat.' + stat_id + '.y' + str(year) + '.html'
    return url

Now what do I need? I know I can access all of the meaning headers I want. I need to change some of the headers, for example headers that just say '%', for convenience, then I can rework my existing scraping functions to use the headers I have here. This will also make my make_data_frame function much cleaner and simpler to use as I won't have many separate cases involved.

In [89]:
def get_stats(link):
    '''
    Collect the stats from a soup object and return as a list.
    '''
    response =  ''
    while response == '':
        try:
            response = req.get(link)
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('table', attrs = {'id': 'statsTable'})
    stats = []
    body = table.find('tbody')
    all_rows = body.findAll('tr')
    for row in all_rows:
        line = []
        for column in row.findAll('td'):
            data = column
            data = data.text
            line.append(data)
        stats.append(line)
    return stats

In [79]:
def make_data_frames(links, year):
    """
    This function takes a scoring category and a provided url_list,
    calls the get_datas function to scrape the data, and creates a
    data frame for every url in the url list and appropriately
    renames the columns based upon the category provided. This returns
    a list of data frames.
    """
    headers_list = get_headers(links)
    df_list = []
    for i in range(len(links)):
        data = get_stats(links[i])
        df = pd.DataFrame(data, columns = headers_list[i])
        df['RANK THIS WEEK'] = df['RANK THIS WEEK'].str.replace('\n', '')
        df['RANK LAST WEEK'] = df['RANK LAST WEEK'].str.replace('\n', '')
        df['PLAYER NAME'] = df['PLAYER NAME'].str.replace('\n', '').str.strip()
        df_list.append(df)
    return df_list

In [74]:
def get_headers(links):
    headers_nested_list = []
    stat_dict_keys = [key for key in stat_dict.keys()]

    for i in range(len(links)):
        headers = []
        stat_name = stat_dict[stat_dict_keys[i]].upper()
        response = req.get(links[i])
        soup = BeautifulSoup(response.content, 'lxml')

            #Get rounds header
        rank_this_week = soup.find_all(class_="hidden-small hidden-medium")[0].get_text()
        headers.append('RANK ' + rank_this_week)

        player_name = soup.find_all(class_ = 'player-name')[0].get_text()
        headers.append(player_name)

            #Get other headers
        stat_headers = soup.find_all(class_="col-stat")
        for header in stat_headers:
                headers.append(header.get_text())

        headers.insert(1, 'RANK LAST WEEK')
        # All edge cases:

        if len(headers) > 4 and headers[4] == 'TOTAL PUTTS':
            if len(headers) == 7:
                #print('edge case 1')
                headers.insert(3, 'ROUNDS')
                headers[4] = stat_name
                #print('edge case 1')
                #print(headers)
                headers_nested_list.append(headers)
                #print(links[i])
                #print(stat_name)
                continue
            else:
                headers[3] = stat_name
                #print('edge case 2')
                #print(headers)
                headers_nested_list.append(headers)
                #print(links[i])
                #print(stat_name)
                continue
                
        streak_stats = ['CONSECUTIVE CUTS', 'YTD PAR OR BETTER STREAK',
                       'BEST YTD 1-PUTT OR BETTER STREAK','BEST YTD STREAK W/O A 3-PUTT',
                       'CONSECUTIVE SAND SAVES', 'CONSECUTIVE FAIRWAYS HIT',
                       'CONSECUTIVE GIR', 'CONSECUTIVE HOLES BELOW PAR',
                       'CONSECUTIVE PAR 3 BIRDIES', 'CONSECUTIVE BIRDIES STREAK',
                       'CONSECUTIVE BIRDIES/EAGLES STREAK',
                       'CURRENT STREAK WITHOUT A 3-PUTT']

        if stat_name in streak_stats:
            #print('edge case 3')
            headers[3] = stat_name
            #print(headers)
            headers_nested_list.append(headers)
            #print(links[i])
            #print(stat_name)
            continue
            
        elif len(headers) > 6 and headers[5] == 'TOTAL ADJUSTMENT':
            #another edge case
            #print('edge case 4')
            headers.insert(3, 'ROUNDS')
            headers[4] = stat_name
            #print(stat_name)
            #print(headers)
            headers_nested_list.append(headers)
        elif len(headers) == 6 and headers[5] == 'TOTAL ROUNDS':
            if headers[4] == 'TOTAL STROKES':
                if stat_name == 'SCORING AVERAGE (ACTUAL)':
                    #print('edge case 5')
                    headers.insert(3, 'ROUNDS')
                    headers[4] = stat_name
                    #print(stat_name)
                    #print(headers[4])
                    headers_nested_list.append(headers)
                    #print(headers)
                    #print(links[i])
                    #print(stat_name)
                    continue
            if headers[4] == '# OF BIRDIES':
                #print('edge case 6')
                headers.insert(3, 'ROUNDS')
                headers[4] = stat_name
                #print(stat_name)
                #print(headers[4])
                headers_nested_list.append(headers)
                #print(headers)
                #print(links[i])
                #print(stat_name)
                continue
            #print(headers[3])
            #print('edge case 7')
            headers[3] = stat_name
            #print(headers[3])
            #print(stat_name)
            #print(headers)
            headers_nested_list.append(headers)

        else:
            #print(headers)
            if len(headers) == 3:
                #print("edge case 8")
                headers.insert(3, 'ROUNDS')
                headers.insert(4, stat_name)
                headers_nested_list.append(headers)
            
            else:
                #print("edge case 9")
                headers.insert(3, 'ROUNDS')
                headers[4] = stat_name
                headers_nested_list.append(headers)

        #print(headers)
        #print(links[i])
        #print(stat_name)
    return headers_nested_list

The commented out print statements are for debugging purposes. I mainly develop in a jupyter environment and thus far I have not found a debugger similar to what IDE's have. So for now, my code looks a bit cluttered.

In [75]:
years = [str(i) for i in range(2010, 2022)]

Create a list of year numbers from 2010 to 2021 for the seasons to extract.

In [90]:
for year in years:
    print(year)
    # create the list of links for each statistic with the current year.
    links = get_links(stat_ids, year)
    # create the associated dataframes with each link in the current year.
    data_frames = make_data_frames(links, year)

    # remove duplicate columns. Perhaps in the future I will 
    # change the name of each column slightly in the body of the
    # make_data_frames function but for now I don't see many
    # uses for these statistics.
    duplicate_columns = ['ROUNDS','ATTEMTPS','RELATIVE TO PAR',
                        'MEASURED ROUNDS','TOTAL DISTANCE (FEET)',
                        '# OF ATTEMPTS', 'SUCCESSES','TOTAL PUTTS',
                        'TOTAL ROUNDS','PUTTS MADE','TOTAL HOLES',
                        'ROUND','HOLE','CURRENT STREAK',
                        'RANK THIS WEEK','RANK LAST WEEK',
                        'ATTEMPTS', 'TOTAL STROKES', 'TOTAL DISTANCE',
                        'YEAR/TOURN#', 'TOURNAMENT', 'POSSIBLE FWYS', 'GREENS HIT',
                        '# HOLES', 'GREENS HIT', 'RTP-NOT GOING FOR THE GRN',
                        'RTP-SUCCESSES', 'GIR RANK', '# OF HOLES',
                        'TOTAL BIRDIES']
    columns_to_drop = []
    df_one = data_frames[0]
    for column in df_one.columns:
        if column in duplicate_columns:
            columns_to_drop.append(column)
    df_one = df_one.drop(columns_to_drop, axis = 1)
    
    # merge all Dataframes together
    for i in range(1, len(data_frames)):
        columns_to_drop = []
        for column in data_frames[i].columns:
            if column in duplicate_columns:
                columns_to_drop.append(column)
        df_one = pd.merge(df_one, data_frames[i].drop(columns_to_drop, axis = 1), on ='PLAYER NAME', how = 'left')

    # add year column
    df_one['YEAR'] = year

        #Concat dataframe to overall dataframe    
    if year == '2010':
        df_total = pd.DataFrame()
        df_total = pd.concat([df_total, df_one], axis=0)
    else:
        df_total = pd.concat([df_total, df_one], axis=0)


print('Finished Constructing Table')
df_total.shape

2010


  df_one['YEAR'] = year


2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
Finished Constructing Table


(10462, 191)

In [91]:
df_total.head()

Unnamed: 0,PLAYER NAME,SG: TEE-TO-GREEN,SG:OTT,SG:APR,SG:ARG,SG: OFF-THE-TEE,TOTAL SG:OTT,DRIVING DISTANCE,TOTAL DRIVES,DRIVING DISTANCE - ALL DRIVES,...,BEST YTD STREAK W/O A 3-PUTT,CURRENT STREAK WITHOUT A 3-PUTT,CONSECUTIVE SAND SAVES,CONSECUTIVE FAIRWAYS HIT,CONSECUTIVE GIR,CONSECUTIVE HOLES BELOW PAR,CONSECUTIVE PAR 3 BIRDIES,CONSECUTIVE BIRDIES STREAK,CONSECUTIVE BIRDIES/EAGLES STREAK,YEAR
0,Adam Scott,1.609,0.472,1.164,-0.027,0.472,24.523,294.4,136,285.7,...,116,26,4,19,15,3,4,3,3,2010
1,Vijay Singh,1.494,0.369,0.979,0.145,0.369,21.05,292.3,148,287.0,...,118,6,6,14,17,4,2,4,3,2010
2,Steve Stricker,1.383,0.191,0.773,0.419,0.191,10.669,282.9,144,279.1,...,223,58,8,24,11,4,2,4,2,2010
3,Jim Furyk,1.159,0.15,0.641,0.367,0.15,9.018,276.0,148,273.0,...,124,17,9,19,18,4,3,4,3,2010
4,Phil Mickelson,1.151,0.185,0.738,0.228,0.185,10.556,299.1,152,295.0,...,154,7,6,12,18,6,2,6,3,2010


Load the dataframe onto a database for use in other projects.

In [92]:
conn = db.connect('pga_database.db')
df_total.to_sql('pga_stats_table', conn, if_exists = 'replace')
conn.close()

  sql.to_sql(
  temp.reset_index(inplace=True)


Access the database and create a dataframe from it.

In [93]:
conn = db.connect('pga_database.db')
query = conn.execute("SELECT * From pga_stats_table")
cols = [column[0] for column in query.description]
df_total = pd.DataFrame.from_records(data = query.fetchall(), columns = cols)

In [94]:
df_total.loc[df_total['PLAYER NAME'] == 'Rory McIlroy']

Unnamed: 0,index,PLAYER NAME,SG: TEE-TO-GREEN,SG:OTT,SG:APR,SG:ARG,SG: OFF-THE-TEE,TOTAL SG:OTT,DRIVING DISTANCE,TOTAL DRIVES,...,BEST YTD STREAK W/O A 3-PUTT,CURRENT STREAK WITHOUT A 3-PUTT,CONSECUTIVE SAND SAVES,CONSECUTIVE FAIRWAYS HIT,CONSECUTIVE GIR,CONSECUTIVE HOLES BELOW PAR,CONSECUTIVE PAR 3 BIRDIES,CONSECUTIVE BIRDIES STREAK,CONSECUTIVE BIRDIES/EAGLES STREAK,YEAR
12,12,Rory McIlroy,0.944,0.698,0.271,-0.025,0.698,27.933,300.0,104,...,194,12,6,16,17,4,1,3,4,2010
378,0,Rory McIlroy,2.347,1.072,1.002,0.273,1.072,42.86,310.1,106,...,110,97,10,12,13,4,4,4,2,2012
579,10,Rory McIlroy,1.077,0.624,0.42,0.032,0.624,25.601,302.2,110,...,126,49,5,9,13,4,2,4,2,2013
749,0,Rory McIlroy,1.992,1.367,0.602,0.022,1.367,65.623,310.5,120,...,152,8,5,11,22,5,2,5,3,2014
1111,1,Rory McIlroy,1.813,1.192,0.428,0.193,1.192,52.438,306.2,108,...,106,64,4,15,21,4,2,4,4,2016
1305,7,Rory McIlroy,1.382,0.965,0.322,0.095,0.965,34.731,317.2,88,...,85,4,16,8,17,3,2,3,3,2017
1497,5,Rory McIlroy,1.351,0.761,0.269,0.322,0.761,38.798,319.7,122,...,184,50,7,14,26,6,2,6,1,2018
5780,0,Rory McIlroy,2.126,1.195,0.633,0.297,1.195,68.124,313.5,128,...,190,140,6,17,15,4,3,4,2,2019
5973,5,Rory McIlroy,1.314,0.702,0.504,0.108,0.702,35.796,314.0,104,...,245,18,5,11,13,4,3,4,3,2020
10266,10,Rory McIlroy,1.293,0.636,0.501,0.156,0.636,29.912,317.7,116,...,125,37,8,10,15,5,3,5,2,2021


Now I can access historical data for each player that the PGA Tour tracked from 2010 to 2021