# Web Scraping Exercises:
1. Scrape the following website and store the data as a JSON file (url = 'http://www.bu.edu/president/boston-university-facts-stats/').

In [1]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the Boston University facts page and save title -> value pairs as JSON.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Bound the wait and fail loudly on an HTTP error so that an error page is
# never parsed as if it were the facts page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# NOTE(review): assumes each fact is a <div class="fact"> holding an <h3>
# (title) and a <p> (value) -- confirm against the live page markup. If
# nothing matches, an empty JSON object is written.
data = {}
for fact in soup.find_all('div', class_='fact'):
    heading = fact.find('h3')
    paragraph = fact.find('p')
    # find() returns None when the tag is absent; skip incomplete blocks
    # instead of raising AttributeError on `.text`.
    if heading is None or paragraph is None:
        continue
    title = heading.text.strip()
    value = paragraph.text.strip()
    data[title] = value

# Saving the data as a JSON file. UTF-8 is set explicitly so the output does
# not depend on the platform's default encoding; ensure_ascii=False keeps
# non-ASCII characters readable instead of \uXXXX escapes.
with open('bu_facts_stats.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)

2. Extract the table at this URL (https://archive.ics.uci.edu/ml/datasets.php) and convert it to a JSON file.

In [None]:
import json
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Download the UCI datasets page once and convert its first HTML table to JSON.
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# `requests` is imported in this cell so it runs on its own. The timeout
# bounds the wait, and raise_for_status() stops an HTTP error page from
# being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML that was already downloaded instead of letting pandas fetch
# the URL a second time. Wrapping it in StringIO passes read_html a file-like
# object, since passing literal HTML strings is deprecated in recent pandas.
# read_html raises ValueError when the page contains no <table>.
tables = pd.read_html(StringIO(response.text))

# NOTE(review): assumes the first table on the page is the dataset listing --
# confirm, since layout tables may come first.
df = tables[0]

# Converting the dataframe to a JSON file (one object per table row)
df.to_json('uci_datasets.json', orient='records', indent=4)

3. Scrape the presidents table and store the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very structured, and the scraping may take a very long time.

In [4]:
import requests
from bs4 import BeautifulSoup
import json

def _span(cell, attr):
    """Return the cell's rowspan/colspan attribute as an int >= 1.

    Missing, non-numeric, or non-positive values are treated as 1.
    """
    try:
        return max(int(cell.get(attr, 1)), 1)
    except (TypeError, ValueError):
        return 1


def _table_to_grid(table):
    """Expand a parsed <table> into a list of rows of stripped cell text.

    Cells with colspan are repeated across the columns they cover, and cells
    with rowspan are repeated down the rows they cover, so every value lands
    in the column it visually belongs to. Rows containing no cells at all
    are skipped.
    """
    grid = []
    # column index -> (rows still to fill, text) for cells spanning downward
    carry = {}
    for tr in table.find_all('tr'):
        cells = tr.find_all(['td', 'th'])
        row = []
        col = 0
        idx = 0
        # Keep going while this row has its own cells left, or while a cell
        # from an earlier row still has to be carried into a later column.
        while idx < len(cells) or any(c >= col for c in carry):
            if col in carry:
                rows_left, text = carry.pop(col)
                row.append(text)
                if rows_left > 1:
                    carry[col] = (rows_left - 1, text)
                col += 1
            elif idx < len(cells):
                cell = cells[idx]
                idx += 1
                text = cell.text.strip()
                rows_below = _span(cell, 'rowspan') - 1
                for _ in range(_span(cell, 'colspan')):
                    row.append(text)
                    if rows_below:
                        carry[col] = (rows_below, text)
                    col += 1
            else:
                # Gap before a carried cell further right: pad to stay aligned.
                row.append('')
                col += 1
        if row:
            grid.append(row)
    return grid


def _unique_keys(names):
    """Return names with repeats suffixed (_2, _3, ...) so no dict key collides."""
    seen = {}
    keys = []
    for name in names:
        seen[name] = seen.get(name, 0) + 1
        keys.append(name if seen[name] == 1 else f'{name}_{seen[name]}')
    return keys


# Scrape the first "wikitable" on the presidents page and save it as JSON.
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

# Bound the wait and fail loudly on HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # find() returns None when nothing matches; raise a clear error here
    # rather than an AttributeError further down.
    raise ValueError('No table with class "wikitable" found at ' + url)

grid = _table_to_grid(table)

# Extracting the headers: only the first row supplies column names. Collecting
# every <th> in the table would also pick up row-header cells from the body.
# NOTE(review): assumes the first <tr> is the header row -- confirm.
headers = _unique_keys(grid[0]) if grid else []

# Extracting the rows (everything after the header row)
rows = grid[1:]

# Convert to a list of dictionaries
presidents_data = [dict(zip(headers, row)) for row in rows]

# Saving the data as a JSON file. UTF-8 with ensure_ascii=False keeps
# non-ASCII characters readable instead of \uXXXX escapes.
with open('presidents.json', 'w', encoding='utf-8') as json_file:
    json.dump(presidents_data, json_file, indent=4, ensure_ascii=False)