# Explore here

It's recommended to use this notebook for exploration purposes.

In [2]:
import os
from bs4 import BeautifulSoup
import requests
import time
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
# Step 1: Install dependencies
!pip install requests beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
# Step 2: Download HTML

# Fetch the HTML content of a webpage
url = "https://www.mlb.com/stats/san-francisco-giants/all-time-by-season"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    print("Page fetched successfully!")
else:
    print(f"Failed with status code: {response.status_code}")

# Print the first 500 characters of the HTML content
print(response.text[:500])

Page fetched successfully!
<!DOCTYPE html>
<html lang="en">
<head>
        <script defer id="dd-rum">
          (() => {
            try {
              if (!window.DD_RUM) {
              (function(h, o, u, n, d) {
                h = h[d] = h[d] || {
                q: [],
                onReady: function(c) {
                  h.q.push(c)
                }
                };
                d = o.createElement(u);
                d.async = 1;
                d.src = n;
                n = o.getElementsByTagName(u)[0];


In [8]:
# Step 3: Transform the HTML
from bs4 import BeautifulSoup

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Pretty-print the first 500 characters of the parsed HTML
print(soup.prettify()[:500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <script defer="" id="dd-rum">
   (() => {
            try {
              if (!window.DD_RUM) {
              (function(h, o, u, n, d) {
                h = h[d] = h[d] || {
                q: [],
                onReady: function(c) {
                  h.q.push(c)
                }
                };
                d = o.createElement(u);
                d.async = 1;
                d.src = n;
                n = o.getElementsByTagName(u)[0];
        


In [9]:
# Step 4: Process the DataFrame

# Find the table by its class name or id
table = soup.find('table', {'class': 'bui-table'})

# Extract all rows in the table
rows = table.find_all('tr')

# Print the first row (header row) to inspect it
print(rows[0])

<tr><th class="pinned-col-T0Jkz2nh col-group-start-Gn6clGbi number-GoaicxKV first-col-hKqk_bGN header-_PPXDbaa is-table-pinned-lGP8KWTK" data-col="0" data-row="h1" id="tb-1232-header-col0" scope="col"><button aria-label="Player Column Sort" class="column-header-container-x7DJ3Fgp no-select-hDruNBDu"><div><abbr class="bui-text cellheader bui-text">PLAYER</abbr></div><div class="mobile-header-extra-AGgURvvT"><div></div><div></div></div></button><div class="column-header-container-x7DJ3Fgp no-select-hDruNBDu"><div><abbr class="bui-text cellheader bui-text">PLAYER</abbr></div><div class="mobile-header-extra-AGgURvvT"><div></div><div></div></div></div></th><th class="number-GoaicxKV header-_PPXDbaa align-left-L6MdxTlJ is-table-pinned-lGP8KWTK" data-col="1" data-row="h1" id="tb-1232-header-col1" scope="col"><button aria-label="Year Column Sort" class="column-header-container-x7DJ3Fgp no-select-hDruNBDu"><div><abbr class="bui-text cellheader bui-text">YEAR</abbr></div><div class="mobile-heade

In [10]:
# Step 4: Process the DataFrame

# Initialize an empty list to hold the rows of data
table_data = []

# Loop through the rows in the table
for row in rows:
    # Extract the cells from the row
    cells = row.find_all(['td', 'th'])
    
    # Extract the text from each cell and strip any surrounding whitespace
    row_data = [cell.text.strip() for cell in cells]
    
    # Append the row data to our list
    table_data.append(row_data)

# Print the first few rows of extracted data
print(table_data[:5])

[['PLAYERPLAYER', 'YEARYEAR', 'TEAMTEAM', 'GG', 'ABAB', 'RR', 'HH', '2B2B', '3B3B', 'HRHR', 'RBIRBI', 'BBBB', 'SOSO', 'SBSB', 'CSCS', 'AVGAVG', 'OBPOBP', 'SLGSLG', 'caret-upcaret-downOPScaret-upcaret-downOPS'], ['1BarryB BondsBondsLF1\u200c\u200c\u200c', '2004', 'SF', '147', '373', '129', '135', '27', '3', '45', '101', '232', '41', '6', '1', '.362', '.609', '.812', '1.421'], ['2BarryB BondsBondsLF2\u200c\u200c\u200c', '2002', 'SF', '143', '403', '117', '149', '31', '2', '46', '110', '198', '47', '9', '2', '.370', '.582', '.799', '1.381'], ['3BarryB BondsBondsLF3\u200c\u200c\u200c', '2001', 'SF', '153', '476', '129', '156', '32', '2', '73', '137', '177', '93', '13', '3', '.328', '.515', '.863', '1.378'], ['4BarryB BondsBondsLF4\u200c\u200c\u200c', '2003', 'SF', '130', '390', '111', '133', '22', '1', '45', '90', '148', '58', '7', '0', '.341', '.529', '.749', '1.278']]


In [None]:
# Step 5: Store the data in sqlite


In [None]:
# Step 6: Visualize the data
