## Scraping Table From Wikipedia

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np

In [3]:
# Download the Wikipedia article that holds the size table.
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Solar_System_objects_by_size',
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets Beautiful Soup
# pick whichever HTML parser happens to be installed, so the resulting tree
# could differ between environments. 'html.parser' ships with Python.
soup = BeautifulSoup(page.text, 'html.parser')

### Select the target \<table\> in the soup (the second one, index 1)

In [4]:
# Gather every <table> element in the document, then keep the one at
# index 1 (i.e. the second table on the page) as the table to scrape.
all_tables = soup.find_all('table')
table = all_tables[1]

### Parse Headers

In [5]:
# Every <th> cell of the table. Data rows also use <th> for some cells, so
# only the leading entries of this list are real column titles.
html_headers = table.find_all('th')

# Keep just the visible text of each cell, without surrounding whitespace.
all_headers = []
for header_cell in html_headers:
    all_headers.append(header_cell.text.strip())

all_headers

['Body[note 1]',
 'Image',
 'Radius[note 2]',
 'Volume',
 'Mass',
 'Surface area',
 'Density',
 'Gravity[note 3]',
 'Type',
 'Discovery',
 '(km)',
 '(R🜨)',
 '(109\xa0km3)',
 '(V🜨)',
 '(1021\xa0kg)',
 '(M🜨)',
 '(106 km2)',
 '🜨',
 '(g/cm3)',
 '(m/s2)',
 '(🜨)',
 '695508 ±\xa0?[13]',
 '1989100000[13]',
 '1.409[13]',
 '69911±6[14]',
 '1898187±88[14]',
 '1.3262±0.0003[14]',
 '58232±6[14](136775 for A Ring)',
 '568317±13[14]',
 '0.6871±0.0002[14]',
 '25362±7[14]',
 '86813±4[14]',
 '1.270±0.001[14]',
 '24622±19[14]',
 '102413±5[14]',
 '1.638±0.004[14]',
 '6371.0084±0.0001[14]',
 '5972.4±0.3[14]',
 '5.5136±0.0003[14]',
 '6052±1[14]',
 '4867.5±0.2[14]',
 '5.243±0.003[14]',
 '3389.5±0.2[14]',
 '641.71±0.03[14]',
 '3.9341±0.0007[14]',
 '2634.1±0.3',
 '148.2',
 '1.936',
 '2574.73±0.09[23]',
 '134.5',
 '1.880±0.004',
 '2439.4±0.1[14]',
 '330.11±0.02[14]',
 '5.4291±0.007[14]',
 '2410.3±1.5[23]',
 '107.6',
 '1.834±0.003',
 '1821.6±0.5[5]',
 '89.32',
 '3.528±0.006',
 '1737.5±0.1[28]',
 '73.46[29]',
 '3

I will remove the Earth-relative columns (the ones marked 🜨), since those values can be recalculated from Earth's own data.

In [6]:

# The first ten <th> texts are the column titles; everything after them is
# units and per-row cells.
table_headers = list(all_headers[:10])

# The image column carries no data, so its title is dropped.
table_headers.remove('Image')

# Replace the titles that still carry a "[note N]" footnote marker.
for position, clean_name in ((0, "Body"), (1, "Radius"), (6, "Gravity")):
    table_headers[position] = clean_name

table_headers

['Body',
 'Radius',
 'Volume',
 'Mass',
 'Surface area',
 'Density',
 'Gravity',
 'Type',
 'Discovery']

### Parse Data

Some data cells in the rows are marked up as \<th\> rather than \<td\>, so both tags are collected for each row.

In [42]:
# Skip the first two <tr> rows, keeping only the rows that follow them.
table_data_rows = table.find_all('tr')[2:]

# One list of stripped cell texts per row. Both <td> and <th> are collected
# because some data cells are marked up as <th>.
data = [
    [cell.text.strip() for cell in table_row.find_all(['td', 'th'])]
    for table_row in table_data_rows
]

data

[['Sun',
  '',
  '695508 ±\xa0?[13]',
  '109.2[13]',
  '1,409,300,000[13]',
  '1,301,000[13]',
  '1989100000[13]',
  '333,000[13]',
  '6,078,700[13]',
  '11,918[13]',
  '1.409[13]',
  '274.0[13]',
  '27.94[13]',
  'G2V-class star',
  'prehistoric'],
 ['Jupiter',
  '',
  '69911±6[14]',
  '10.97',
  '1,431,280',
  '1,321',
  '1898187±88[14]',
  '317.83',
  '61,419[15]',
  '120.41',
  '1.3262±0.0003[14]',
  '24.79[14]',
  '2.528',
  'gas giant planet; has rings',
  'prehistoric'],
 ['Saturn',
  '',
  '58232±6[14](136775 for A Ring)',
  '9.140',
  '827,130',
  '764',
  '568317±13[14]',
  '95.162',
  '42,612[16]',
  '83.54',
  '0.6871±0.0002[14]',
  '10.44[14]',
  '1.065',
  'gas giant planet; has rings',
  'prehistoric'],
 ['Uranus',
  '',
  '25362±7[14]',
  '3.981',
  '68,340',
  '63.1',
  '86813±4[14]',
  '14.536',
  '8083.1[17]',
  '15.85',
  '1.270±0.001[14]',
  '8.87[14]',
  '0.886',
  'ice giant planet; has rings',
  '1781'],
 ['Neptune',
  '',
  '24622±19[14]',
  '3.865',
  '62,540'

### Analyse Data

In [8]:
import pandas as pd

In [43]:
# Build the frame straight from the list of row lists. The former detour
# through np.matrix is dropped: NumPy no longer recommends that class and
# pandas accepts nested lists directly.
df = pd.DataFrame(data)

# Positional columns to discard: 1 is the empty "Image" cell, and 3, 5, 7, 9
# and 12 are the Earth-relative duplicates of radius, volume, mass, surface
# area and gravity. Nine columns remain, matching table_headers.
df = df.drop(columns=[1, 3, 5, 7, 9, 12])
df.columns = table_headers

In [44]:
# Preview the first rows of the raw frame (all values are still text here).
df.head()

Unnamed: 0,Body,Radius,Volume,Mass,Surface area,Density,Gravity,Type,Discovery
0,Sun,695508 ± ?[13],"1,409,300,000[13]",1989100000[13],"6,078,700[13]",1.409[13],274.0[13],G2V-class star,prehistoric
1,Jupiter,69911±6[14],1431280,1898187±88[14],"61,419[15]",1.3262±0.0003[14],24.79[14],gas giant planet; has rings,prehistoric
2,Saturn,58232±6[14](136775 for A Ring),827130,568317±13[14],"42,612[16]",0.6871±0.0002[14],10.44[14],gas giant planet; has rings,prehistoric
3,Uranus,25362±7[14],68340,86813±4[14],8083.1[17],1.270±0.001[14],8.87[14],ice giant planet; has rings,1781
4,Neptune,24622±19[14],62540,102413±5[14],7618.3[18],1.638±0.004[14],11.15[14],ice giant planet; has rings,1846


Data cleaning and parsing

In [45]:
def clean_strings(data):
    """Reduce one scraped table cell to its leading bare value.

    Everything from the first "±", "+" or "[" onwards is discarded (this
    removes uncertainties, footnote markers and trailing remarks), thousands
    separators and the "≈" sign are removed, and surrounding whitespace is
    trimmed.

    Parameters
    ----------
    data : object
        The cell content. Anything that is not a ``str`` (for example
        ``None``, NaN or an already converted number) is returned unchanged,
        so the function can be applied safely to a frame that has missing
        cells or has been cleaned before.

    Returns
    -------
    object
        The cleaned string, or ``data`` itself when it is not a string.
    """
    if not isinstance(data, str):
        return data

    cleaned = data.strip()
    # Cut at the first occurrence of each annotation marker, in this order.
    for marker in ("±", "+", "["):
        cleaned = cleaned.split(marker)[0]

    # A final strip removes whitespace exposed by the cuts above,
    # e.g. "695508 ±?" -> "695508 " -> "695508".
    return cleaned.replace(",", "").replace("≈", "").strip()

# Columns that hold measurements and should end up as numbers.
numeric_columns = ["Radius", "Volume", "Mass", "Surface area", "Density", "Gravity"]

# Clean every cell of the frame first ...
df = df.map(clean_strings)
# ... then convert the measurement columns; values that cannot be parsed
# become NaN because of errors='coerce'.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

df.head()

Unnamed: 0,Body,Radius,Volume,Mass,Surface area,Density,Gravity,Type,Discovery
0,Sun,695508.0,1409300000.0,1989100000.0,6078700.0,1.409,274.0,G2V-class star,prehistoric
1,Jupiter,69911.0,1431280.0,1898187.0,61419.0,1.3262,24.79,gas giant planet; has rings,prehistoric
2,Saturn,58232.0,827130.0,568317.0,42612.0,0.6871,10.44,gas giant planet; has rings,prehistoric
3,Uranus,25362.0,68340.0,86813.0,8083.1,1.27,8.87,ice giant planet; has rings,1781
4,Neptune,24622.0,62540.0,102413.0,7618.3,1.638,11.15,ice giant planet; has rings,1846


### Visualizing data

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [47]:
# Show the first rows of the cleaned frame again (same view as the previous cell's output).
df.head()

Unnamed: 0,Body,Radius,Volume,Mass,Surface area,Density,Gravity,Type,Discovery
0,Sun,695508.0,1409300000.0,1989100000.0,6078700.0,1.409,274.0,G2V-class star,prehistoric
1,Jupiter,69911.0,1431280.0,1898187.0,61419.0,1.3262,24.79,gas giant planet; has rings,prehistoric
2,Saturn,58232.0,827130.0,568317.0,42612.0,0.6871,10.44,gas giant planet; has rings,prehistoric
3,Uranus,25362.0,68340.0,86813.0,8083.1,1.27,8.87,ice giant planet; has rings,1781
4,Neptune,24622.0,62540.0,102413.0,7618.3,1.638,11.15,ice giant planet; has rings,1846


In [48]:
# Show the last rows of the frame.
df.tail()

Unnamed: 0,Body,Radius,Volume,Mass,Surface area,Density,Gravity,Type,Discovery
31,Sedna90377,498.0,0.516,,,,,sednoid; detached object,2003
32,Ceres1,469.7,0.433,0.938,2.85,2.17,0.28,dwarf planet; belt asteroid,1801
33,Orcus90482,455.0,0.404,0.548,,1.4,0.2,plutino; binary,2004
34,Salacia120347,423.0,0.3729,0.492,,1.5,0.165,cubewano; binary,2004
35,2002 MS4307261,400.0,0.2681,,,,,cubewano,2002


### Scales: 
- Radius: <b>km</b>
- Volume: <b>10<sup>9</sup> km<sup>3</sup></b>
- Mass: <b>10<sup>21</sup> kg</b>
- Surface Area: <b>10<sup>6</sup> km<sup>2</sup></b>
- Density: <b>g/cm<sup>3</sup></b>
- Gravity: <b>m/s<sup>2</sup></b>

In [50]:
# Column dtypes and non-null counts; reveals which measurement columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Body          36 non-null     object 
 1   Radius        36 non-null     float64
 2   Volume        36 non-null     float64
 3   Mass          33 non-null     float64
 4   Surface area  32 non-null     float64
 5   Density       33 non-null     float64
 6   Gravity       34 non-null     float64
 7   Type          36 non-null     object 
 8   Discovery     36 non-null     object 
dtypes: float64(6), object(3)
memory usage: 2.7+ KB


In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Radius,Volume,Mass,Surface area,Density,Gravity
count,36.0,36.0,33.0,32.0,33.0,34.0
mean,25532.2594,39213660.0,60356610.0,193753.4,2.185003,10.909236
std,115872.504906,234872100.0,346243500.0,1073959.0,1.252099,46.766561
min,400.0,0.2681,0.492,2.85,0.6871,0.145
25%,583.25,0.831,1.586,5.98825,1.409,0.282
50%,980.5,4.325,16.6,20.404,1.72,0.701
75%,2589.5725,72.7,330.11,101.3417,2.17,3.22425
max,695508.0,1409300000.0,1989100000.0,6078700.0,5.5136,274.0
