# Web Scraping Assignment-1

#### import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## 1) Write a python program to display all the header tags from wikipedia.org and make data frame.

In [2]:
def wikipedia_header(url):
    """Collect header texts from a Wikipedia page into a DataFrame.

    The text of the first ``<span class="mw-headline">`` (when the page has
    one) comes first, followed by the text of every ``<h2 class="mp-h2">``
    in document order.

    Args:
        url: Address of the page to download.

    Returns:
        A DataFrame with a single ``HEADER`` column, indexed from 1.
    """
    markup = requests.get(url).content
    soup = BeautifulSoup(markup, 'html.parser')

    # Seed the list with the lead headline only if the page actually has one.
    lead_headline = soup.find('span', class_='mw-headline')
    titles = [] if lead_headline is None else [lead_headline.text]
    titles.extend(tag.text for tag in soup.find_all('h2', class_='mp-h2'))

    df = pd.DataFrame({'HEADER': titles})
    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [3]:
# Scrape the Wikipedia main page; the bare name on the last line echoes the
# resulting DataFrame when this runs as a notebook cell.
df = wikipedia_header("https://en.wikipedia.org/wiki/Main_Page")
df

Unnamed: 0,HEADER
1,Welcome to Wikipedia
2,From today's featured article
3,Did you know ...
4,In the news
5,On this day
6,Today's featured picture
7,Other areas of Wikipedia
8,Wikipedia's sister projects
9,Wikipedia languages


## 2) Write a python program to display IMDB’s Top rated 50 movies’ data (i.e. name, rating, year of release) and make data frame.

In [4]:
def imdb_top50(url):
    """Scrape name, rating and release year for the first 50 chart entries.

    Args:
        url: Address of an IMDb chart page whose entries are rendered as
            ``<td class="titleColumn">`` cells.

    Returns:
        A DataFrame with ``Name``, ``Rating`` and ``Year`` columns holding
        the scraped text, indexed from 1.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    records = []
    for title_cell in soup.find_all('td', class_='titleColumn')[:50]:
        title = title_cell.a.text
        # The rating is not inside the title cell: it sits in another cell
        # under the same parent element, so search from that parent.
        rating_cell = title_cell.parent.find("td", class_="ratingColumn imdbRating")
        score = rating_cell.strong.text
        # Drop the parentheses that wrap the year text.
        released = title_cell.span.text.strip("()")
        records.append((title, score, released))

    df = pd.DataFrame({
        'Name': [title for title, _, _ in records],
        'Rating': [score for _, score, _ in records],
        'Year': [released for _, _, released in records],
    })
    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [5]:
# Scrape the IMDb top-rated chart; the bare name on the last line echoes the
# resulting DataFrame when this runs as a notebook cell.
df=imdb_top50('https://www.imdb.com/chart/top/?ref_=nv_mv_250')
df

Unnamed: 0,Name,Rating,Year
1,The Shawshank Redemption,9.2,1994
2,The Godfather,9.2,1972
3,The Dark Knight,9.0,2008
4,The Godfather Part II,9.0,1974
5,12 Angry Men,9.0,1957
6,Schindler's List,8.9,1993
7,The Lord of the Rings: The Return of the King,8.9,2003
8,Pulp Fiction,8.8,1994
9,The Lord of the Rings: The Fellowship of the Ring,8.8,2001
10,"Il buono, il brutto, il cattivo",8.8,1966


## 3)Write a python program to display IMDB’s Top rated 50 Indian movies’ data (i.e. name, rating, year of release) and make data frame.

In [6]:
def top_indian_movies(url):
    """Scrape name, rating and release year for the first 50 listed movies.

    Args:
        url: Address of an IMDb chart page whose entries are rendered as
            ``<td class="titleColumn">`` cells.

    Returns:
        A DataFrame with ``Name``, ``Rating`` and ``Year`` columns holding
        the scraped text, indexed from 1.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    names, ratings, years = [], [], []
    for cell in soup.find_all('td', class_='titleColumn')[:50]:
        names.append(cell.a.text)

        # The rating cell hangs off the same parent element as the title cell.
        rating_cell = cell.parent.find("td", class_="ratingColumn imdbRating")
        ratings.append(rating_cell.strong.text)

        # Strip the parentheses that wrap the year text.
        years.append(cell.span.text.strip("()"))

    df = pd.DataFrame({'Name': names, 'Rating': ratings, 'Year': years})
    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [7]:
# Scrape the IMDb top-rated Indian movies chart; the bare name on the last
# line echoes the resulting DataFrame when this runs as a notebook cell.
df=top_indian_movies("https://www.imdb.com/india/top-rated-indian-movies/")
df

Unnamed: 0,Name,Rating,Year
1,Ramayana: The Legend of Prince Rama,8.6,1993
2,Rocketry: The Nambi Effect,8.4,2022
3,Nayakan,8.4,1987
4,Gol Maal,8.4,1979
5,777 Charlie,8.4,2022
6,Anbe Sivam,8.4,2003
7,Pariyerum Perumal,8.4,2018
8,Apur Sansar,8.4,1959
9,3 Idiots,8.4,2009
10,Manichitrathazhu,8.3,1993


## 4) Write a python program to display list of respected former presidents of India (i.e. Name, Term of office) from https://presidentofindia.nic.in/former-presidents.htm and make data frame.

In [8]:
def get_former_presidents(url):
    """Scrape former presidents' names and terms of office into a DataFrame.

    Every ``<div class="presidentListing">`` contributes one row: the
    stripped text of its ``<h3>`` and the text of its ``<p>``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A DataFrame with ``NAME`` and ``TERM OF OFFICE`` columns, indexed
        from 1.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Fetch the listing blocks once and read both fields from the same set.
    listings = soup.find_all('div', class_="presidentListing")
    df = pd.DataFrame({
        'NAME': [entry.h3.text.strip() for entry in listings],
        'TERM OF OFFICE': [entry.p.text for entry in listings],
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [9]:
# Scrape the former-presidents listing; the bare name on the last line echoes
# the resulting DataFrame when this runs as a notebook cell.
df = get_former_presidents("https://presidentofindia.nic.in/former-presidents.htm")
df


Unnamed: 0,NAME,TERM OF OFFICE
1,Shri Ram Nath Kovind (birth - 1945),"Term of Office: 25 July, 2017 to 25 July, 2022"
2,Shri Pranab Mukherjee (1935-2020),"Term of Office: 25 July, 2012 to 25 July, 2017"
3,Smt Pratibha Devisingh Patil (birth - 1934),"Term of Office: 25 July, 2007 to 25 July, 2012"
4,DR. A.P.J. Abdul Kalam (1931-2015),"Term of Office: 25 July, 2002 to 25 July, 2007"
5,Shri K. R. Narayanan (1920 - 2005),"Term of Office: 25 July, 1997 to 25 July, 2002"
6,Dr Shankar Dayal Sharma (1918-1999),"Term of Office: 25 July, 1992 to 25 July, 1997"
7,Shri R Venkataraman (1910-2009),"Term of Office: 25 July, 1987 to 25 July, 1992"
8,Giani Zail Singh (1916-1994),"Term of Office: 25 July, 1982 to 25 July, 1987"
9,Shri Neelam Sanjiva Reddy (1913-1996),"Term of Office: 25 July, 1977 to 25 July, 1982"
10,Dr. Fakhruddin Ali Ahmed (1905-1977),"Term of Office: 24 August, 1974 to 11 February..."


## 5) Write a python program to scrape cricket rankings from icc-cricket.com. You have to scrape and make data frame
### a) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.
### b) Top 10 ODI Batsmen along with the records of their team and rating.
### c) Top 10 ODI bowlers along with the records of their team and rating.

..........................................................................................................................................................................................

#### a)Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [10]:
def odi_teams(url):
    """Scrape the first ten rows of a team-rankings table.

    Args:
        url: Address of a rankings page containing a ``<table class="table">``
            whose body rows hold the team name in a
            ``<span class="u-hide-phablet">`` and matches, points and rating
            in the third, fourth and fifth ``<td>`` cells.

    Returns:
        A DataFrame with ``Team``, ``Matches``, ``Points`` and ``Rating``
        columns holding the scraped text, indexed from 1, with at most ten
        rows.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no rankings table.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    table = soup.find("table", class_="table")
    if table is None or table.tbody is None:
        raise ValueError("no rankings table found at {!r}".format(url))

    teams, matches, points, ratings = [], [], [], []
    # Only the first ten rows are reported, so only those are parsed.
    for row in table.tbody.find_all("tr")[:10]:
        teams.append(row.find("span", class_="u-hide-phablet").text.strip())

        # Look the cells up once per row and index into the result.
        cells = row.find_all("td")
        matches.append(cells[2].text.strip())
        points.append(cells[3].text.strip())
        ratings.append(cells[4].text.strip())

    df = pd.DataFrame({
        "Team": teams,
        "Matches": matches,
        "Points": points,
        "Rating": ratings,
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [11]:
# Scrape the men's ODI team rankings; the bare name on the last line echoes
# the resulting DataFrame when this runs as a notebook cell.
df = odi_teams("https://www.icc-cricket.com/rankings/mens/team-rankings/odi")
df

Unnamed: 0,Team,Matches,Points,Rating
1,Australia,35,3965,113
2,New Zealand,31,3504,113
3,India,47,5294,113
4,England,36,3988,111
5,Pakistan,25,2649,106
6,South Africa,31,3141,101
7,Bangladesh,38,3625,95
8,Sri Lanka,36,3099,86
9,West Indies,43,3105,72
10,Afghanistan,20,1419,71


#### b) Top 10 ODI Batsmen along with the records of their team and rating.

In [12]:
def top_batsmenss(url):
    """Scrape the first ten players of a player-rankings page.

    The leading player is read from the ``rankings-block__banner--*`` divs;
    the remaining players come from the ``table-body__cell`` table cells.

    Args:
        url: Address of the rankings page to download.

    Returns:
        A DataFrame with ``Batsmen``, ``Team`` and ``Rating`` columns holding
        the scraped text, indexed from 1, with at most ten rows.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    def cell_texts(css_class):
        # Stripped text of every <td> carrying exactly this class string.
        return [cell.text.strip() for cell in soup.find_all('td', css_class)]

    # Banner entry for the top-ranked player.
    banner_name = soup.find('div', 'rankings-block__banner--name').text
    # Keeps characters 2-4 of the nationality text.
    # NOTE(review): presumably skips leading whitespace to reach the team
    # code -- confirm against the live markup.
    banner_team = soup.find('div', 'rankings-block__banner--nationality').text[2:5]
    banner_rating = soup.find('div', 'rankings-block__banner--rating').text

    # Banner entry first, then the table rows in document order.
    df = pd.DataFrame({
        'Batsmen': [banner_name] + cell_texts('table-body__cell name'),
        'Team': [banner_team] + cell_texts('table-body__cell nationality-logo'),
        'Rating': [banner_rating] + cell_texts('table-body__cell u-text-right rating'),
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df.iloc[:10]

In [13]:
# Scrape the men's ODI player rankings; the bare name on the last line echoes
# the resulting DataFrame when this runs as a notebook cell.
name = top_batsmenss('https://www.icc-cricket.com/rankings/mens/player-rankings/odi')
name

Unnamed: 0,Batsmen,Team,Rating
1,Babar Azam,PAK,887
2,Rassie van der Dussen,SA,777
3,Imam-ul-Haq,PAK,740
4,Shubman Gill,IND,738
5,David Warner,AUS,726
6,Virat Kohli,IND,719
7,Quinton de Kock,SA,718
8,Rohit Sharma,IND,707
9,Steve Smith,AUS,702
10,Fakhar Zaman,PAK,699


#### c) Top 10 ODI bowlers along with the records of their team and rating.

In [14]:
def get_top_10_ODI_bowlers(url):
    """Scrape ten player rows from a rankings table.

    Args:
        url: Address of a rankings page containing a ``<table class="table">``
            whose rows hold player name, team and rating in the second,
            third and fourth ``<td>`` cells.

    Returns:
        A DataFrame with ``Player Name``, ``Team`` and ``Rating`` columns
        holding the scraped text, indexed from 1, with at most ten rows.
        The columns are present even when no rows were found.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no rankings table.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    table = soup.find('table', attrs={'class': 'table'})
    if table is None:
        raise ValueError("no rankings table found at {!r}".format(url))

    data = []
    # Row 0 is skipped (presumably the header row -- confirm against the
    # live markup); the following ten rows are the ones reported.
    for row in table.find_all('tr')[1:11]:
        cells = row.find_all('td')
        data.append({
            'Player Name': cells[1].text.strip(),
            'Team': cells[2].text.strip(),
            'Rating': cells[3].text.strip(),
        })

    # Naming the columns keeps the frame's schema stable for an empty result.
    df = pd.DataFrame(data, columns=['Player Name', 'Team', 'Rating'])

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df



In [15]:
# Scrape the men's ODI bowling rankings; the bare name on the last line
# echoes the resulting DataFrame when this runs as a notebook cell.
top = get_top_10_ODI_bowlers('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling')
top

Unnamed: 0,Player Name,Team,Rating
1,Josh Hazlewood,AUS,705
2,Trent Boult,NZ,694
3,Mohammed Siraj,IND,691
4,Mitchell Starc,AUS,686
5,Matt Henry,NZ,676
6,Rashid Khan,AFG,659
7,Adam Zampa,AUS,652
8,Shaheen Afridi,PAK,641
9,Mujeeb Ur Rahman,AFG,637
10,Shakib Al Hasan,BAN,636


## 6) Write a python program to scrape cricket rankings from icc-cricket.com. You have to scrape and make data frame
#### a) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.
#### b) Top 10 women’s ODI Batting players along with the records of their team and rating.
#### c) Top 10 women’s ODI all-rounder along with the records of their team and rating.

..........................................................................................................................................

#### a) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating

In [16]:
def icc_womens_team(url):
    """Scrape the first ten rows of a team-rankings table.

    Args:
        url: Address of a rankings page containing a ``<table class="table">``
            whose body rows hold the team name in a
            ``<span class="u-hide-phablet">`` and matches, points and rating
            in the third, fourth and fifth ``<td>`` cells.

    Returns:
        A DataFrame with ``Team``, ``Matches``, ``Points`` and ``Rating``
        columns holding the scraped text, indexed from 1, with at most ten
        rows.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no rankings table.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    table = soup.find("table", class_="table")
    if table is None or table.tbody is None:
        raise ValueError("no rankings table found at {!r}".format(url))

    teams, matches, points, ratings = [], [], [], []
    # Only the first ten rows are reported, so only those are parsed.
    for row in table.tbody.find_all("tr")[:10]:
        teams.append(row.find("span", class_="u-hide-phablet").text.strip())

        # Look the cells up once per row and index into the result.
        cells = row.find_all("td")
        matches.append(cells[2].text.strip())
        points.append(cells[3].text.strip())
        ratings.append(cells[4].text.strip())

    df = pd.DataFrame({
        "Team": teams,
        "Matches": matches,
        "Points": points,
        "Rating": ratings,
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [17]:
# Scrape the women's ODI team rankings; the bare name on the last line echoes
# the resulting DataFrame when this runs as a notebook cell.
top_odi_wo = icc_womens_team('https://www.icc-cricket.com/rankings/womens/team-rankings/odi')
top_odi_wo

Unnamed: 0,Team,Matches,Points,Rating
1,Australia,21,3603,172
2,England,28,3342,119
3,South Africa,26,3098,119
4,India,27,2820,104
5,New Zealand,25,2553,102
6,West Indies,27,2535,94
7,Bangladesh,13,983,76
8,Thailand,11,821,75
9,Pakistan,27,1678,62
10,Sri Lanka,8,353,44


#### b) Top 10 women’s ODI Batting players along with the records of their team and rating.

In [18]:
def top_10_women_batting_players(url):
    """Scrape ten player rows from a player-rankings table.

    Args:
        url: Address of a rankings page containing a
            ``<table class="table rankings-table">`` whose rows hold player
            name, team and rating in the second, third and fourth ``<td>``
            cells.

    Returns:
        A DataFrame with ``Player``, ``Team`` and ``Rating`` columns holding
        the scraped text, indexed from 1, with at most ten rows.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no rankings table.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    table = soup.find('table', {'class': 'table rankings-table'})
    if table is None:
        raise ValueError("no rankings table found at {!r}".format(url))

    data = []
    # Row 0 is skipped (presumably the header row -- confirm against the
    # live markup); the following ten rows are the ones reported.
    for row in table.find_all('tr')[1:11]:
        cols = [col.text.strip() for col in row.find_all('td')]
        # Columns 1-3 are player, team and rating.
        data.append([cols[1], cols[2], cols[3]])

    df = pd.DataFrame(data, columns=['Player', 'Team', 'Rating'])

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df

In [19]:
# Scrape the women's ODI batting rankings; the bare name on the last line
# echoes the resulting DataFrame when this runs as a notebook cell.
wo_batsmen = top_10_women_batting_players('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting')
wo_batsmen

Unnamed: 0,Player,Team,Rating
1,Alyssa Healy,AUS,762
2,Beth Mooney,AUS,754
3,Laura Wolvaardt,SA,732
4,Natalie Sciver,ENG,731
5,Meg Lanning,AUS,717
6,Harmanpreet Kaur,IND,716
7,Smriti Mandhana,IND,714
8,Chamari Athapaththu,SL,655
9,Amy Satterthwaite,NZ,641
10,Ellyse Perry,AUS,626


#### c) Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [20]:
def top_10_women_allrounders(url):
    """Scrape ten player rows from a player-rankings table.

    Args:
        url: Address of a rankings page containing a
            ``<table class="table rankings-table">`` whose rows hold player
            name, team and rating in the second, third and fourth ``<td>``
            cells.

    Returns:
        A DataFrame with ``Player``, ``Team`` and ``Rating`` columns holding
        the scraped text, indexed from 1, with at most ten rows.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no rankings table.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None further down.
    table = soup.find('table', {'class': 'table rankings-table'})
    if table is None:
        raise ValueError("no rankings table found at {!r}".format(url))

    data = []
    # Row 0 is skipped (presumably the header row -- confirm against the
    # live markup); the following ten rows are the ones reported.
    for row in table.find_all('tr')[1:11]:
        cols = [col.text.strip() for col in row.find_all('td')]
        # Columns 1-3 are player, team and rating.
        data.append([cols[1], cols[2], cols[3]])

    df = pd.DataFrame(data, columns=['Player', 'Team', 'Rating'])

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df

In [21]:
# Scrape the women's ODI all-rounder rankings; the bare name on the last line
# echoes the resulting DataFrame when this runs as a notebook cell.
wo_all = top_10_women_allrounders('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder')
wo_all

Unnamed: 0,Player,Team,Rating
1,Hayley Matthews,WI,373
2,Natalie Sciver,ENG,371
3,Ellyse Perry,AUS,366
4,Marizanne Kapp,SA,349
5,Amelia Kerr,NZ,336
6,Deepti Sharma,IND,322
7,Ashleigh Gardner,AUS,292
8,Jess Jonassen,AUS,250
9,Nida Dar,PAK,232
10,Sophie Ecclestone,ENG,205


## 7) Write a python program to scrape mentioned news details from https://www.cnbc.com/world/?region=world and make data frame-
- i) Headline
- ii) Time
- iii) News Link

In [22]:
def scrape_cnbc_world_news(url):
    """Scrape headline, timestamp and link of the latest-news entries.

    Headlines and links both come from the ``<a class="LatestNews-headline">``
    anchors; times come from the ``<time class="LatestNews-timestamp">``
    elements. The two sets are paired purely by position.

    Args:
        url: Address of the news page to download.

    Returns:
        A DataFrame with ``Headline``, ``Time`` and ``News_Link`` columns,
        indexed from 1.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # One pass over the anchors yields both the headline text and its link.
    anchors = soup.find_all('a', class_="LatestNews-headline")
    stamps = soup.find_all('time', class_="LatestNews-timestamp")

    df = pd.DataFrame({
        'Headline': [anchor.text for anchor in anchors],
        'Time': [stamp.text for stamp in stamps],
        'News_Link': [anchor.get('href') for anchor in anchors],
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [23]:
# Scrape the CNBC world news page; the bare name on the last line echoes the
# resulting DataFrame when this runs as a notebook cell.
news = scrape_cnbc_world_news('https://www.cnbc.com/world/?region=world')
news

Unnamed: 0,Headline,Time,News_Link
1,Electric car sales in 2022 hit over 10 million...,35 Min Ago,https://www.cnbc.com/2023/04/26/electric-car-s...
2,CNBC Daily Open: Big Tech surpasses expectations,42 Min Ago,https://www.cnbc.com/2023/04/26/stock-markets-...
3,With Russia's role in the global energy system...,50 Min Ago,https://www.cnbc.com/2023/04/26/russias-role-i...
4,SK Hynix reports record quarterly operating lo...,56 Min Ago,https://www.cnbc.com/2023/04/26/sk-hynix-repor...
5,European markets head for negative open as ban...,2 Hours Ago,https://www.cnbc.com/2023/04/26/european-marke...
6,Here's how to spend King Charles III's coronat...,4 Hours Ago,https://www.cnbc.com/2023/04/26/how-to-spend-k...
7,Here are the top 25 companies to work for in I...,5 Hours Ago,https://www.cnbc.com/2023/04/26/here-are-the-t...
8,Alphabet's quarter delivers but doesn't dazzle...,6 Hours Ago,https://www.cnbc.com/2023/04/25/alphabets-quar...
9,"'The hard money is now,' Stifel's chief strate...",6 Hours Ago,https://www.cnbc.com/2023/04/25/easy-money-is-...
10,China's EV players are starting to compete on ...,7 Hours Ago,https://www.cnbc.com/2023/04/26/chinas-ev-play...


## 8) Write a python program to scrape the details of most downloaded articles from AI in last 90 days. https://www.journals.elsevier.com/artificial-intelligence/most-downloaded-articles Scrape below mentioned details and make data frame
- i) Paper Title
- ii) Authors
- iii) Published Date
- iv) Paper URL


In [24]:
def most_downloaded_ai_articles(url):
    """Scrape title, authors, date and link of the listed articles.

    Each field is located by an exact CSS class string, and the four
    resulting lists are paired purely by position.

    Args:
        url: Address of the article listing page to download.

    Returns:
        A DataFrame with ``' Paper Title'`` (note the leading space in the
        column name), ``Authors``, ``Published Date`` and ``Paper URL``
        columns, indexed from 1.
    """
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    def texts(tag_name, css_class):
        # Text of every <tag_name> whose class attribute matches css_class.
        return [node.text for node in soup.find_all(tag_name, class_=css_class)]

    titles = texts('h2', "sc-1qrq3sd-1 gRGSUS sc-1nmom32-0 sc-1nmom32-1 btcbYu goSKRg")
    authors = texts('span', "sc-1w3fpd7-0 dnCnAO")
    dates = texts('span', 'sc-1thf9ly-2 dvggWt')
    links = [anchor.get('href') for anchor in soup.find_all('a', class_="sc-5smygv-0 fIXTHm")]

    df = pd.DataFrame({
        ' Paper Title': titles,
        'Authors': authors,
        'Published Date': dates,
        'Paper URL': links,
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [25]:
# Scrape the journal's most-downloaded list; the bare name on the last line
# echoes the resulting DataFrame when this runs as a notebook cell.
articles = most_downloaded_ai_articles('https://www.journals.elsevier.com/artificial-intelligence/most-downloaded-articles')
articles 

Unnamed: 0,Paper Title,Authors,Published Date,Paper URL
1,Reward is enough,"Silver, David, Singh, Satinder, Precup, Doina,...",October 2021,https://www.sciencedirect.com/science/article/...
2,Making sense of raw input,"Evans, Richard, Bošnjak, Matko and 5 more",October 2021,https://www.sciencedirect.com/science/article/...
3,Law and logic: A review from an argumentation ...,"Prakken, Henry, Sartor, Giovanni",October 2015,https://www.sciencedirect.com/science/article/...
4,Creativity and artificial intelligence,"Boden, Margaret A.",August 1998,https://www.sciencedirect.com/science/article/...
5,Artificial cognition for social human–robot in...,"Lemaignan, Séverin, Warnier, Mathieu and 3 more",June 2017,https://www.sciencedirect.com/science/article/...
6,Explanation in artificial intelligence: Insigh...,"Miller, Tim",February 2019,https://www.sciencedirect.com/science/article/...
7,Making sense of sensory input,"Evans, Richard, Hernández-Orallo, José and 3 more",April 2021,https://www.sciencedirect.com/science/article/...
8,Conflict-based search for optimal multi-agent ...,"Sharon, Guni, Stern, Roni, Felner, Ariel, Stur...",February 2015,https://www.sciencedirect.com/science/article/...
9,Between MDPs and semi-MDPs: A framework for te...,"Sutton, Richard S., Precup, Doina, Singh, Sati...",August 1999,https://www.sciencedirect.com/science/article/...
10,The Hanabi challenge: A new frontier for AI re...,"Bard, Nolan, Foerster, Jakob N. and 13 more",March 2020,https://www.sciencedirect.com/science/article/...


## 9) Write a python program to scrape mentioned details from dineout.co.in and make data frame-
- i) Restaurant name
- ii) Cuisine
- iii) Location
- iv) Ratings
- v) Image URL

In [26]:
def dineout_restaurants(url):
    """Scrape name, cuisine, location, rating and image URL of listed restaurants.

    Each field is collected from its own set of elements and the five lists
    are paired purely by position, so the page must provide every field for
    every restaurant.

    Args:
        url: Address of the restaurant listing page to download.

    Returns:
        A DataFrame with ``Restaurant Name``, ``Cuisine``, ``Location``,
        ``Ratings`` and ``Image_URL`` columns, indexed from 1. ``Cuisine``
        is ``None`` for an entry whose text has no ``|`` separator.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: Raised by pandas when the five lists differ in length.
    """
    # Without a timeout a stalled server would hang the call indefinitely;
    # raise_for_status keeps an HTTP error page from being parsed as data.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    names = [tag.text for tag in soup.find_all('a', class_="restnt-name ellipsis")]
    locations = [tag.text for tag in soup.find_all('div', class_="restnt-loc ellipsis")]
    image_urls = [tag.get('data-src') for tag in soup.find_all('img', class_="no-img")]

    # The cuisine list is the part after the first "|" in the span text; a
    # span without the separator yields None instead of an IndexError.
    cuisines = []
    for tag in soup.find_all('span', 'double-line-ellipsis'):
        parts = tag.text.split("|")
        cuisines.append(parts[1] if len(parts) > 1 else None)

    # Match on the "restnt-rating" class alone. The previous exact string
    # "restnt-rating rating-4" only matched one rating bucket, so a
    # restaurant rendered with a different bucket class (presumably
    # rating-3 / rating-5 -- confirm against the live markup) was dropped
    # and the columns ended up with different lengths.
    ratings = [tag.text for tag in soup.find_all('div', class_="restnt-rating")]

    df = pd.DataFrame({
        'Restaurant Name': names,
        'Cuisine': cuisines,
        'Location': locations,
        'Ratings': ratings,
        'Image_URL': image_urls,
    })

    # Number the rows 1..n rather than pandas' default 0..n-1.
    df.index = range(1, len(df) + 1)
    return df


In [27]:
# Scrape the Delhi buffet-special listing; the bare name on the last line
# echoes the resulting DataFrame when this runs as a notebook cell.
restaurant = dineout_restaurants('https://www.dineout.co.in/delhi-restaurants/buffet-special')
restaurant

Unnamed: 0,Restaurant Name,Cuisine,Location,Ratings,Image_URL
1,Castle Barbeque,"Chinese, North Indian","Connaught Place, Central Delhi",4.0,https://im1.dineout.co.in/images/uploads/resta...
2,Jungle Jamboree,"North Indian, Asian, Italian","3CS Mall,Lajpat Nagar - 3, South Delhi",3.9,https://im1.dineout.co.in/images/uploads/resta...
3,Cafe Knosh,"Italian, Continental","The Leela Ambience Convention Hotel,Shahdara, ...",4.3,https://im1.dineout.co.in/images/uploads/resta...
4,Castle Barbeque,"Chinese, North Indian","Pacific Mall,Tagore Garden, West Delhi",3.9,https://im1.dineout.co.in/images/uploads/resta...
5,The Barbeque Company,"North Indian, Chinese","Gardens Galleria,Sector 38A, Noida",3.9,https://im1.dineout.co.in/images/uploads/resta...
6,India Grill,"North Indian, Italian","Hilton Garden Inn,Saket, South Delhi",3.9,https://im1.dineout.co.in/images/uploads/resta...
7,Delhi Barbeque,North Indian,"Taurus Sarovar Portico,Mahipalpur, South Delhi",3.7,https://im1.dineout.co.in/images/uploads/resta...
8,The Monarch - Bar Be Que Village,North Indian,"Indirapuram Habitat Centre,Indirapuram, Ghaziabad",3.8,https://im1.dineout.co.in/images/uploads/resta...
9,Indian Grill Room,"North Indian, Mughlai","Suncity Business Tower,Golf Course Road, Gurgaon",4.3,https://im1.dineout.co.in/images/uploads/resta...
