# Day 22 Assignment: Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import json

## Task 1

### Scrape and store the data from a website as a JSON file

In [2]:
# Task 1: download the page, record the tag name and text of every element
# that has text, and save the records to 'scraped_data.json'.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Without a timeout requests waits indefinitely on an unresponsive server,
# which would leave the notebook cell hanging.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # find_all() with no arguments returns every tag in the document, so the
    # text of a parent element also contains the text of its descendants.
    all_elements = soup.find_all()

    # One {'tag', 'text'} record per element; elements whose stripped text is
    # empty are left out.
    data_list = []
    for element in all_elements:
        text = element.get_text(strip=True)
        if text:
            data_list.append({'tag': element.name, 'text': text})

    # Serialise the records as indented JSON
    data_json = json.dumps(data_list, indent=2)

    # Explicit encoding so the written file does not depend on the
    # platform's default text encoding.
    with open('scraped_data.json', 'w', encoding='utf-8') as json_file:
        json_file.write(data_json)

    print("Data has been scraped and saved as 'scraped_data.json'.")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Data has been scraped and saved as 'scraped_data.json'.


## Task 2

### Extract the table at the URL and save it as a JSON file

In [3]:

# Task 2: download the page, locate the datasets table, and save the text of
# each table row to 'datasets.json'.
# NOTE(review): the recorded run of this cell printed "Status code: 404" for
# this URL, so the page appears to have moved - confirm the current address
# and its table markup before relying on the selector below.
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the datasets
    dataset_table = soup.find('table', {'border': '1', 'cellpadding': '6'})

    if dataset_table:
        dataset_list = []

        # [1:] skips the first <tr> of the table
        for row in dataset_table.find_all('tr')[1:]:
            columns = row.find_all('td')

            # Rows without any <td> cells carry no data to record
            if not columns:
                continue

            # Store plain strings rather than the BeautifulSoup Tag/ResultSet
            # objects themselves: json.dumps cannot serialise those and
            # raises TypeError.
            dataset_info = {
                'columns': [cell.get_text(strip=True) for cell in columns],
                'row': row.get_text(' ', strip=True)
            }
            dataset_list.append(dataset_info)

        # Serialise the records as indented JSON
        dataset_json = json.dumps(dataset_list, indent=2)

        # Explicit encoding so the written file does not depend on the
        # platform's default text encoding.
        with open('datasets.json', 'w', encoding='utf-8') as json_file:
            json_file.write(dataset_json)

        print("Data has been scraped and saved as 'datasets.json'.")
    else:
        print("Table not found on the page.")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Failed to retrieve data. Status code: 404


## Task 3

### Scrape the presidents table from Wikipedia and store the data as JSON

In [4]:
# Task 3: download the Wikipedia list of presidents, read the first
# 'wikitable' on the page, and save selected cells of each row to
# 'presidents.json'.
url = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"

# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the list of presidents
    presidents_table = soup.find('table', {'class': 'wikitable'})

    # soup.find returns None when nothing matches; guard so the cell reports
    # the problem instead of failing with AttributeError.
    if presidents_table:
        # Output key -> index of the cell (th or td) within the row that
        # holds the value. Indices 1 and 4 are not exported.
        field_columns = {
            'number': 0,
            'name_Birth_and_death': 2,
            'term': 3,
            'party': 5,
            'election': 6,
            'vice_president': 7
        }
        min_cells = max(field_columns.values()) + 1

        presidents_data = []

        # [1:] skips the first <tr> of the table
        for row in presidents_table.find_all('tr')[1:]:
            columns = row.find_all(['th', 'td'])

            # A row with fewer cells than the highest index used would raise
            # IndexError; skip it instead of aborting the whole scrape.
            if len(columns) < min_cells:
                continue

            president_info = {
                field: columns[index].text.strip()
                for field, index in field_columns.items()
            }
            presidents_data.append(president_info)

        # Serialise the records as indented JSON
        presidents_json = json.dumps(presidents_data, indent=2)

        # Explicit encoding so the written file does not depend on the
        # platform's default text encoding.
        with open('presidents.json', 'w', encoding='utf-8') as json_file:
            json_file.write(presidents_json)

        print("Data has been scraped and saved as 'presidents.json'.")
    else:
        print("Table not found on the page.")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Data has been scraped and saved as 'presidents.json'.
