# Exploratory Data Analysis

Import libraries

In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import re
from typing import Optional
import json
import os

In [2]:
# # Run once to write the stat URLs below to ../data/links.json
# variables_of_interest = {
#     "Scoring Average": "https://www.pgatour.com/stats/detail/120",
#     "Birdie Average": "https://www.pgatour.com/stats/detail/156",
#     "SG: Total": "https://www.pgatour.com/stats/detail/02675",
#     "Driving Distance": "https://www.pgatour.com/stats/detail/101",
#     "SG: Approach the Green": "https://www.pgatour.com/stats/detail/02568",
#     "Greens in Regulation Percentage": "https://www.pgatour.com/stats/detail/103",
#     "Scrambling": "https://www.pgatour.com/stats/detail/130",
#     "SG: Putting": "https://www.pgatour.com/stats/detail/02564",
#     "FedEx Cup Standings": "https://www.pgatour.com/fedexcup"
# }

# # Define the path to the data directory and file
# data_dir = os.path.join("..", "data")
# os.makedirs(data_dir, exist_ok=True)  # Ensure the directory exists

# file_path = os.path.join(data_dir, "links.json")

# # Save the dictionary as a JSON file
# with open(file_path, "w") as f:
#     json.dump(variables_of_interest, f, indent=4)

In [3]:
def get_status(endpoint: str, timeout: float = 10.0) -> Optional[BeautifulSoup]:
    """
    Fetch an endpoint and, on HTTP 200, return its parsed HTML.

    The response code is printed either way, so the call also serves as a
    quick status check.

    Args:
        endpoint (str): the url of the endpoint
        timeout (float): seconds to wait for the server before giving up.
            Passed straight to ``requests.get``; without a timeout a stalled
            server would block the cell indefinitely.
    Returns:
        Optional[BeautifulSoup]: the page parsed with "html.parser" when the
        status code is 200; None for any other status code or when the
        request or parsing raises.
    """
    try:
        response = requests.get(endpoint, timeout=timeout)
        status = response.status_code
        if status == 200:
            print(f"\nResponse code: {status}\nStatus OK")
            return BeautifulSoup(response.text, features="html.parser")
        print(f"Response code: {status}. Status not OK")
        return None
    except Exception as e:
        # Best-effort by design: report the failure (including timeouts) and
        # let the caller deal with the None result.
        print(f"Error getting endpoint: {e}")
        return None

## PGA Tour (Scoring Average)

In [4]:
endpoint = "https://www.pgatour.com/stats/detail/120"
soup = get_status(endpoint)
# get_status returns None on any failure, so a None check is the whole test.
# BeautifulSoup has no pandas-style `.empty` property: attribute access such as
# `soup.empty` looks up a child <empty> tag, so it must not be used as an
# emptiness check here.
if soup is not None:
    print("\nSoup found!")
else:
    print("\nNO SOUP FOR YOU!")


Response code: 200
Status OK

Soup found!


In [5]:
# Show only the start of the document: rendering the entire page floods the
# notebook output and bloats the saved file.
print(str(soup)[:1000])

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><title>Golf Stat and Records | PGA TOUR</title><meta content="#20487C" name="theme-color"/><meta content="Golf Stat and Records | PGA TOUR" name="title" property="og:title"/><meta content="website" name="type" property="og:type"/><meta content="PGA TOUR Stats" name="description" property="og:description"/><meta content="width=device-width, height=device-height, initial-scale=1, maximum-scale=1" name="viewport"/><meta content="Golf Stat and Records | PGA TOUR" name="parsely-title"/><meta content="PGA TOUR" name="parsely-author"/><link href="/apple-touch-icon.png?v=5" rel="apple-touch-icon" sizes="180x180"/><link href="/favicon-32x32.png?v=4" rel="icon" sizes="32x32" type="image/png"/><link href="/favicon-16x16.png?v=4" rel="icon" sizes="16x16" type="image/png"/><link href="/site.webmanifest?v=4" rel="manifest"/><link href="/safari-pinned-tab.svg" rel="mask-icon"/><link href="/favicon.ico?v=4" rel="shortcut icon"/><meta content

In [7]:
# get_status returns None when the request fails, so guard before calling
# .find to avoid an AttributeError on a failed fetch.
# NOTE(review): the recorded run found no <table> despite a 200 response —
# presumably the stats are rendered client-side; confirm before relying on
# this parser.
table = soup.find("table") if soup is not None else None

if table is not None:
    rows = table.find_all("tr")

    player_names = []
    for row in rows:
        cells = row.find_all("td")
        # Rows with fewer than three <td> cells (e.g. header rows) are skipped.
        if len(cells) <= 2:
            continue
        # The name is read from the <span> inside the third cell.
        player_name_span = cells[2].find("span")
        if player_name_span:
            player_names.append(player_name_span.get_text())

    print(player_names)

    # Grab just the first listed player for testing
    # (expected to be Scottie Scheffler — depends on the live rankings).
    if player_names:
        scottie = player_names[0]
        print(f"\nJust the first player: {scottie}")
else:
    print("No <table> element was found in the HTML.")

No <table> element was found in the HTML.


## FedExCup Standings

In [8]:
endpoint = "https://www.pgatour.com/fedexcup"
soup = get_status(endpoint)
# get_status returns None on any failure, so a None check is sufficient.
# BeautifulSoup has no pandas-style `.empty` property: `soup.empty` would look
# up a child <empty> tag rather than test for emptiness.
if soup is not None:
    print("\nSoup found!")
else:
    print("\nNO SOUP FOR YOU!")

# Show only the start of the document: rendering the entire page floods the
# notebook output and bloats the saved file.
print(str(soup)[:1000])


Response code: 200
Status OK

Soup found!


<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><title>PGA TOUR FedExCup Standings</title><meta content="#20487C" name="theme-color"/><meta content="PGA TOUR FedExCup Standings" name="title" property="og:title"/><meta content="website" name="type" property="og:type"/><meta content="FedExCup Standings" name="description" property="og:description"/><meta content="width=device-width, height=device-height, initial-scale=1, maximum-scale=1" name="viewport"/><meta content="PGA TOUR FedExCup Standings" name="parsely-title"/><meta content="PGA TOUR" name="parsely-author"/><link href="/apple-touch-icon.png?v=5" rel="apple-touch-icon" sizes="180x180"/><link href="/favicon-32x32.png?v=4" rel="icon" sizes="32x32" type="image/png"/><link href="/favicon-16x16.png?v=4" rel="icon" sizes="16x16" type="image/png"/><link href="/site.webmanifest?v=4" rel="manifest"/><link href="/safari-pinned-tab.svg" rel="mask-icon"/><link href="/favicon.ico?v=4" rel="shortcut icon"/><meta content="#da532c" 