# Basic Web Scraping with Python - NBA Player Data

### Importing Libraries

In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

### Getting raw HTML data

In [20]:
# Build the player's page URL from a typed name.
# The page key is the first five letters of the last word followed by the
# first two letters of the first word, all lower-case.
name = input("Type a player name: ")

# split() without an argument ignores leading/trailing whitespace and
# collapses repeated spaces, so "LeBron James " no longer produces an empty
# last-name token (which previously led to a wrong URL).
name = name.lower().split()
if not name:
    raise ValueError("Player name must not be empty.")

# Slicing already copes with words shorter than the cut-off, so no length
# check is needed.
last = name[-1][:5]
first = name[0][:2]
key = last + first

# NOTE(review): the "01" suffix is hard-coded; presumably it selects the first
# player registered under this key -- confirm for players sharing a key.
my_url = "https://www.basketball-reference.com/players/{}/{}01.html".format(key[0], key)
my_url


Type a player name: LeBron James


'https://www.basketball-reference.com/players/j/jamesle01.html'

In [21]:
# Download the player page. The context manager closes the connection even
# when read() raises, which the manual open/read/close sequence did not.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# Parse the downloaded markup with Python's built-in HTML parser.
page_soup = soup(page_html, 'html.parser')

### Finding and extracting relevant data

In [26]:
# Locate the <div id="meta"> element, which carries the player's bio details,
# and display it for inspection.
meta_selector = {'id': 'meta'}
player_data = page_soup.find('div', attrs=meta_selector)
player_data

<div id="meta">
<div class="media-item"><img alt="Photo of LeBron James" itemscope="image" src="https://www.basketball-reference.com/req/202106291/images/players/jamesle01.jpg"/>
</div><!-- div.media-item --><div>
<h1>
<span>LeBron James</span>
</h1>
<!-- all other pages -->
<p>
<strong>
<strong>LeBron Raymone James</strong>
</strong>
    
  ▪
  <a href="/friv/twitter.html">Twitter</a>:
  <a href="https://twitter.com/KingJames">KingJames</a>
  
  
  ▪
  Instagram:
  <a href="https://instagram.com/kingjames">kingjames</a>
</p>
<p>
(King James, LBJ, Chosen One, Bron-Bron, The Little Emperor, The Akron Hammer, L-Train, Benjamin Buckets)
</p>
<p>
<strong>
  Position:
  </strong>
  Small Forward, Power Forward, Point Guard, and Shooting Guard


  
  ▪
  
  <strong>
  Shoots:
  </strong>
  Right
</p>
<p><span>6-9</span>, <span>250lb</span> (206cm, 113kg) </p>
<p>
<strong>Born: </strong>
<span data-birth="1984-12-30" id="necro-birth">
<a href="/friv/birthdays.fcgi?month=12&amp;day=30">Decembe

In [75]:
# Full name: text of the first <strong> inside the bio block, minus its first
# and last character (the surrounding newlines in the markup).
name_tag = player_data.find("strong")
full_name = name_tag.text[1:-1]

# Birth date: the link texts inside the #necro-birth span. The first link is
# split on a space and its two words are swapped, then the second link's text
# is appended.
birth_span = player_data.find("span", {'id': 'necro-birth'})
birth_parts = [link.text for link in birth_span.findAll("a")]
month_day = birth_parts[0].split(" ")
birth_date = " ".join([month_day[1], month_day[0], birth_parts[1]])

# The <p> elements are fetched once and addressed by position below.
bio_paragraphs = player_data.findAll("p")

# Fourth paragraph, split on non-breaking spaces: the last token gives the
# weight (two trailing characters dropped) and the one before it the height
# (first and last character dropped).
size_tokens = bio_paragraphs[3].text.split("\xa0")
weight = size_tokens[-1][:-2]
height = size_tokens[-2][1:-1]

# Sixth paragraph: second-to-last line of its text, left-stripped.
school_lines = bio_paragraphs[5].text.split("\n")
school = school_lines[-2].lstrip()

# Second-to-last paragraph: text of its first link.
debut = bio_paragraphs[-2].find("a").text
debut

'October 29, 2003'

In [86]:
# Column labels and values for the biographical fields extracted above.
header = ["Full Name", "Birth Date", "Weight (kg)", "Height (cm)", "School", "Debut Date"]
data = [full_name, birth_date, weight, height, school, debut]

# The containers with classes p1, p2 and p3 are read the same way, so a single
# loop replaces three copy-pasted ones. For every inner <div>, the <strong>
# text becomes the column label and the second <p> the value; the append
# order is unchanged.
for panel_class in ("p1", "p2", "p3"):
    for div in page_soup.find('div', {'class': panel_class}).findAll("div"):
        header.append(div.find("strong").text)
        data.append(div.findAll("p")[1].text)

data

['LeBron Raymone James',
 '30 December 1984',
 '113kg',
 '206cm',
 'St. Vincent-St. Mary in Akron, Ohio',
 'October 29, 2003',
 '1411',
 '27.2',
 '7.5',
 '7.3',
 '50.5',
 '34.4',
 '73.5',
 '54.5',
 '27.3',
 '254.5']

In [88]:
# Wrap the collected values in an outer list so they form a single row under
# the collected column labels, then display the resulting table.
single_row = [data]
frame = pd.DataFrame(single_row, columns=header)
frame

Unnamed: 0,Full Name,Birth Date,Weight (kg),Height (cm),School,Debut Date,G,PTS,TRB,AST,FG%,FG3%,FT%,eFG%,PER,WS
0,LeBron Raymone James,30 December 1984,113kg,206cm,"St. Vincent-St. Mary in Akron, Ohio","October 29, 2003",1411,27.2,7.5,7.3,50.5,34.4,73.5,54.5,27.3,254.5
