<a href="https://colab.research.google.com/github/Atharshoyeb/Internship/blob/main/BeautifulSoupAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup
import re

def display_header_tags(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all header tags (h1, h2, h3, etc.)
        header_tags = soup.find_all(re.compile('^h[1-6]$'))

        # Display the text content of each header tag
        for tag in header_tags:
            print(tag.text.strip())
    else:
        print("Failed to retrieve the page.")


url = "https://en.wikipedia.org/wiki/Main_Page"

display_header_tags(url)


Main Page
Welcome to Wikipedia
From today's featured article
Did you know ...
In the news
On this day
Today's featured picture
Other areas of Wikipedia
Wikipedia's sister projects
Wikipedia languages


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_top_rated_movies(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the list containing the top-rated movies
        movie_list = soup.find('div', class_='lister-list')

        # Check if the list was found
        if movie_list:
            # Initialize lists to store movie data
            names = []
            ratings = []
            years = []

            # Extract data for each movie in the list
            for movie in movie_list.find_all('div', class_='lister-item-content'):
                # Extract movie name
                name = movie.find('h3', class_='lister-item-header').a.text.strip()
                names.append(name)

                # Extract movie rating
                rating = movie.find('span', class_='ipl-rating-star__rating').text.strip()
                ratings.append(float(rating))

                # Extract year of release
                year = movie.find('span', class_='lister-item-year').text.strip('()')
                years.append(year)

            # Create a DataFrame from the extracted data
            df = pd.DataFrame({'Name': names, 'Rating': ratings, 'Year': years})

            return df
        else:
            print("List containing top-rated movies not found.")
            return None
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return None

# URL of IMDb's top-rated movies
url = "https://www.imdb.com/list/ls091520106/"

# Call the function to scrape top-rated movies and create a DataFrame
top_rated_movies_df = scrape_top_rated_movies(url)

# Display the DataFrame
if top_rated_movies_df is not None:
    print(top_rated_movies_df)
else:
    print("Failed to scrape top-rated movies.")


                        Name  Rating  Year
0   The Shawshank Redemption     9.3  1994
1              The Godfather     9.2  1972
2      The Godfather Part II     9.0  1974
3            The Dark Knight     9.0  2008
4               12 Angry Men     9.0  1957
..                       ...     ...   ...
95        North by Northwest     8.3  1959
96        A Clockwork Orange     8.3  1971
97                    Snatch     8.2  2000
98                    Amélie     8.3  2001
99                   The Kid     8.2  1921

[100 rows x 3 columns]


In [28]:
import requests
from bs4 import BeautifulSoup

def scrape_dineout_details(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all restaurant cards
        restaurant_cards = soup.find_all('div', class_='restnt-card')

        # Initialize lists to store details
        names = []
        cuisines = []
        locations = []
        ratings = []
        image_urls = []

        # Extract details for each restaurant
        for card in restaurant_cards:
            # Extract restaurant name
            name_elem = card.find('div', class_='restnt-info cursor')
            name = name_elem.text.strip() if name_elem else 'N/A'
            names.append(name)

            # Extract cuisine
            cuisine_elem = card.find('div', class_='restnt-info').find('span', class_='double-line-ellipsis')
            cuisine = cuisine_elem.text.strip() if cuisine_elem else 'N/A'
            cuisines.append(cuisine)

            # Extract location
            location_elem = card.find('div', class_='restnt-loc')
            location = location_elem.text.strip() if location_elem else 'N/A'
            locations.append(location)

            # Extract rating
            rating_elem = card.find('div', class_='restnt-rating')
            rating = rating_elem.text.strip() if rating_elem else 'N/A'
            ratings.append(rating)

            # Extract image URL
            image_url_elem = card.find('div', class_='img-cursor')
            image_url = image_url_elem.img['data-src'] if image_url_elem and image_url_elem.img else 'N/A'
            image_urls.append(image_url)

            # Print details for debugging
            print(f"Name: {name}")
            print(f"Cuisine: {cuisine}")
            print(f"Location: {location}")
            print(f"Rating: {rating}")
            print(f"Image URL: {image_url}")
            print()

        # Print scraped details
        for i in range(len(names)):
            print(f"Restaurant Name: {names[i]}")
            print(f"Cuisine: {cuisines[i]}")
            print(f"Location: {locations[i]}")
            print(f"Rating: {ratings[i]}")
            print(f"Image URL: {image_urls[i]}")
            print()
    else:
        print("Failed to retrieve the page.")

# URL of dineout.co.in
url = "https://www.dineout.co.in/delhi-restaurants/buffet-special"

# Call the function to scrape details and display
scrape_dineout_details(url)



Name: Castle's BarbequeConnaught Place, Central Delhi
Cuisine: N/A
Location: Connaught Place, Central Delhi
Rating: 4
Image URL: N/A

Name: Cafe KnoshThe Leela Ambience Convention Hotel,Shahdara, East Delhi
Cuisine: N/A
Location: The Leela Ambience Convention Hotel,Shahdara, East Delhi
Rating: 4.3
Image URL: N/A

Name: India GrillHilton Garden Inn,Saket, South Delhi
Cuisine: N/A
Location: Hilton Garden Inn,Saket, South Delhi
Rating: 3.9
Image URL: N/A

Name: The Barbeque CompanyGardens Galleria,Sector 38A, Noida
Cuisine: N/A
Location: Gardens Galleria,Sector 38A, Noida
Rating: 3.9
Image URL: N/A

Name: Delhi BarbequeTaurus Sarovar Portico,Mahipalpur, South Delhi
Cuisine: N/A
Location: Taurus Sarovar Portico,Mahipalpur, South Delhi
Rating: 3.7
Image URL: N/A

Name: The Monarch - Bar Be Que VillageIndirapuram Habitat Centre,Indirapuram, Ghaziabad
Cuisine: N/A
Location: Indirapuram Habitat Centre,Indirapuram, Ghaziabad
Rating: 3.8
Image URL: N/A

Name: The Barbeque TimesM2K Corporate Park

In [39]:
import requests
from bs4 import BeautifulSoup

def scrape_former_presidents(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the section containing former presidents
        former_presidents_section = soup.find('div', class_='block block-system block-system-main-block')

        # Check if the section exists
        if former_presidents_section:
            # Initialize lists to store names and tenures
            names = []
            tenures = []

            # Extract details for each former president
            presidents = former_presidents_section.find_all('div', class_='desc-sec')
            for president in presidents:
                # Extract name
                name = president.find('h3').text.strip()
                names.append(name)

                # Extract tenure
                tenure = president.find('h5').text.strip()
                tenures.append(tenure)

            # Print the list of former presidents and their tenures
            for i in range(len(names)):
                print(f"Name of President: {names[i]}")
                print(f"Tenure of President: {tenures[i]}")
                print()
        else:
            print("Former presidents section not found on the page.")
    else:
        print("Failed to retrieve the page.")

# URL of the website containing former presidents
url = "https://presidentofindia.nic.in/former-presidents"

# Call the function to scrape details and display
scrape_former_presidents(url)


Name of President: Shri Ram Nath Kovind
Tenure of President: 14th President of India

Name of President: Shri Pranab Mukherjee
Tenure of President: 13th President of India

Name of President: Smt Pratibha Devisingh Patil
Tenure of President: 12th President of India

Name of President: DR. A.P.J. Abdul Kalam
Tenure of President: 11th President of India

Name of President: Shri K. R. Narayanan
Tenure of President: 10th President of India

Name of President: Dr Shankar Dayal Sharma
Tenure of President: 9th  President of India

Name of President: Shri R Venkataraman
Tenure of President: 8th President of India

Name of President: Giani Zail Singh
Tenure of President: 7th President of India

Name of President: Shri Neelam Sanjiva Reddy
Tenure of President: 6th President of India

Name of President: Dr. Fakhruddin Ali Ahmed
Tenure of President: 5th President of India

Name of President: Shri Varahagiri Venkata Giri
Tenure of President: 4th President of India

Name of President: Dr. Zakir Husa