# Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">figure go enter</h2>
<div class="grid grid-cols-2 italic">
<p> 1989-04-02 </p>
<p class="text-right">By Steven Martinez </p>
</div>
<p>At middle want happy. Kid least career. Group office film. Art outside teacher well yet sure tend.
Whether training senior low. Former voice sea your among. Eight lawyer mother use. During interesting set prepare it almost light.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    output = {}
    output['headline'] = article.find('h2').text
    output['date'], output['byline'], output['description'] = [p.text for p in article.find_all('p')]
    return output

In [8]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,figure go enter,1989-04-02,By Steven Martinez,At middle want happy. Kid least career. Group ...
1,travel kind body,2020-09-20,By Michelle Thompson,Best responsibility claim main simply rate hug...
2,TV religious thought,2006-07-29,By Michelle Roberts,Kid relationship style thought focus. Role min...
3,ok environmental always,1984-08-08,By John Sullivan,Oil draw sport picture star who. Sometimes thi...
4,soldier instead blue,1994-04-29,By Robert Johnson,Teach collection you nothing space discover ro...
5,stuff because describe,2020-06-03,By Tasha Duncan,West perform lose find quickly father. Probabl...
6,from center choice,2004-03-30,By Danielle Perez,Around see audience decide onto respond west. ...
7,relate former let,1999-07-06,By Terri Walker,Discussion computer room old tough. Memory sta...
8,rise low gas,2007-01-11,By John Wilkins,Wrong choice phone. Culture discussion detail....
9,become successful mind,1991-01-14,By Douglas Good,Outside price road piece vote oil art positive...


People exercise 

In [9]:
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [10]:
response.text[:400]

'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>Example People Page</title>\n    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr'

In [11]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
people = soup.select('div.person')
people

[<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">Nicole Jones</h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
             "Expanded disintermediate support"
         </p>
 <div class="grid grid-cols-9">
 <i class="bi bi-envelope-fill text-purple-800"></i>
 <p class="email col-span-8">jameshuff@wallace.com</p>
 <i class="bi bi-telephone-fill text-purple-800"></i>
 <p class="phone col-span-8">779-322-6932x015</p>
 </div>
 <div class="address grid grid-cols-9">
 <i class="bi bi-geo-fill text-purple-800"></i>
 <p class="col-span-8">
                 47191 Chloe Brook <br/>
                 New Jessicamouth, CO 65277
             </p>
 </div>
 </div>,
 <div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-

In [13]:
person = people[0]
person

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Nicole Jones</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Expanded disintermediate support"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">jameshuff@wallace.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">779-322-6932x015</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                47191 Chloe Brook <br/>
                New Jessicamouth, CO 65277
            </p>
</div>
</div>

In [14]:
person.find_all('p')

[<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
             "Expanded disintermediate support"
         </p>,
 <p class="email col-span-8">jameshuff@wallace.com</p>,
 <p class="phone col-span-8">779-322-6932x015</p>,
 <p class="col-span-8">
                 47191 Chloe Brook <br/>
                 New Jessicamouth, CO 65277
             </p>]

In [15]:
def parse_people(person):
    output = {}
    output['name'] = person.find('h2').text
    output['quote'], output['email'], output['phone'], output['address'] = [p.text.strip() for p in person.find_all('p')]
    return output

In [16]:
parse_people(person)

{'name': 'Nicole Jones',
 'quote': '"Expanded disintermediate support"',
 'email': 'jameshuff@wallace.com',
 'phone': '779-322-6932x015',
 'address': '47191 Chloe Brook \n                New Jessicamouth, CO 65277'}

In [17]:
pd.DataFrame([parse_people(person) for person in people])

Unnamed: 0,name,quote,email,phone,address
0,Nicole Jones,"""Expanded disintermediate support""",jameshuff@wallace.com,779-322-6932x015,47191 Chloe Brook \n New Jessic...
1,Rodney Smith,"""Multi-channeled even-keeled system engine""",teresajones@robertson.com,+1-090-865-5580x228,"8052 Brandy Dam \n Tanyaview, I..."
2,Marc Henderson,"""Reverse-engineered reciprocal strategy""",brookestevens@yahoo.com,892-930-7916x37592,38044 Hodges Shore Apt. 603 \n ...
3,Sean Douglas,"""Persistent client-driven utilization""",simpsonmegan@ramirez-middleton.org,001-656-735-2498,7283 Church Isle Apt. 716 \n La...
4,Randy King,"""Reduced hybrid database""",allenjulie@herrera.com,(908)168-3987x739,98408 Melissa Wells Apt. 428 \n ...
5,Wesley Key,"""Digitized asynchronous flexibility""",shelby77@simmons-herman.com,001-801-452-1353,"71687 Kevin Center \n East Ann,..."
6,Candice Johnson,"""Universal dynamic protocol""",davissusan@yahoo.com,656.123.7921,017 Jonathan Underpass Suite 021 \n ...
7,Elizabeth Michael,"""Secured needs-based ability""",rdoyle@gilmore.com,(507)671-9491x84533,730 Fernando Expressway Suite 545 \n ...
8,Shannon Chavez,"""Up-sized hybrid archive""",maria61@gmail.com,283.322.0363,359 Walker Estates \n West Bran...
9,Christopher Munoz,"""Cross-platform client-driven open system""",csantana@robbins.com,381-252-9663,7443 Mcintosh Plains \n Davidbe...


# Exercises
---

1. Codeup Blog Articles
 - Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. 
 - For each post: 
    - title
    - content
 - Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries:
    - each dictionary representing one article
    - dictionary should look like this:
        
        {
        'title': 'the title of the article',
        'content': 'the full text content of the article'
        }


In [18]:
headers = {'user-agent': 'Innis Data Science Cohort'}
url = 'https://codeup.com/blog/'
response = get(url, headers=headers)
response

<Response [200]>

In [19]:
response.text[:400]

'<!DOCTYPE html>\n<html lang="en-US">\n<head>\n\t<meta charset="UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n\t<link rel="pingback" href="https://codeup.com/xmlrpc.php" />\n\n\t<script type="text/javascript">\n\t\tdocument.documentElement.className = \'js\';\n\t</script>\n\t\n\t<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin /><script id="diviarea-loader">window.DiviPopupData=wi'

In [20]:
soup = BeautifulSoup(response.text, 'html.parser')

In [21]:
blogs = soup.find_all('article')[:16]

In [22]:
blog = blogs[0]

In [23]:
blog.find('h2').text

'From Bootcamp to Bootcamp | A Military Appreciation Panel'

In [24]:
blog.find('p').text[:12]

'Apr 27, 2022'

In [25]:
blog.find('p').text[14:]

' Alumni Stories, Dallas, Events, Featured, Military, San Antonio, Veterans, Virtual, Workshops'

In [26]:
blog.find_all('p')[1].text

'In honor of Military Appreciation Month, join us for a discussion with Codeup Alumni who are also Military Veterans!...'

In [27]:
def parse_blog(blog):
    '''
    This function takes the information scraped from codeup blog page and return a dataframe with:
    title, date, tags and content of each blog 
    '''
    output = {}
    output['title'] = blog.find('h2').text
    output['date'] = blog.find('p').text[:12]
    output['tags'] = blog.find('p').text[14:]
    output['content'] = blog.find_all('p')[1].text
    return output

In [28]:
parse_blog(blog)

{'title': 'From Bootcamp to Bootcamp | A Military Appreciation Panel',
 'date': 'Apr 27, 2022',
 'tags': ' Alumni Stories, Dallas, Events, Featured, Military, San Antonio, Veterans, Virtual, Workshops',
 'content': 'In honor of Military Appreciation Month, join us for a discussion with Codeup Alumni who are also Military Veterans!...'}

In [29]:
blogs[5].find_all('p')[1].text

"On this International Women's Day 2022 we wanted to tell stories about women in tech. What better way to do that than..."

In [30]:
pd.DataFrame([parse_blog(blog) for blog in blogs])

Unnamed: 0,title,date,tags,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"Apr 27, 2022","Alumni Stories, Dallas, Events, Featured, Mil...","In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Apr 14, 2022","Codeup News, Featured, IT Training","Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,"Apr 1, 2022","Virtual, Workshops",HTML & CSS are the design building blocks of a...
3,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","Events, Virtual, Workshops","According to LinkedIn, the ""#1 Most Promising ..."
4,Coming Soon: Cloud Administration,"Mar 17, 2022",Codeup News,We're launching a new program out of San Anton...
5,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",Featured,On this International Women's Day 2022 we want...
6,Codeup Start Dates for March 2022,"Jan 26, 2022",Codeup News,As we approach the end of January we wanted to...
7,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022","Codeup News, Dallas Newsletter, Featured, Tips...",We are so happy to announce that VET TEC benef...
8,Dallas Campus Re-opens With New Grant Partner,"Dec 30, 2021","Codeup News, Featured",We are happy to announce that our Dallas campu...
9,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021","Codeup News, Employers",Our Placement Team is simply defined as a grou...


Make the function to scrape code up blog

In [31]:
def scrape_codeup():
    '''
    This function will scrape 16 blogs from codeup blog page.
    '''
    headers = {'user-agent': 'Innis Data Science Cohort'}
    url = 'https://codeup.com/blog/'
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    blogs = soup.find_all('article')[:15]
    df = pd.DataFrame([parse_blog(blog) for blog in blogs])
    return df

In [32]:
df = scrape_codeup()
df

Unnamed: 0,title,date,tags,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"Apr 27, 2022","Alumni Stories, Dallas, Events, Featured, Mil...","In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Apr 14, 2022","Codeup News, Featured, IT Training","Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,"Apr 1, 2022","Virtual, Workshops",HTML & CSS are the design building blocks of a...
3,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","Events, Virtual, Workshops","According to LinkedIn, the ""#1 Most Promising ..."
4,Coming Soon: Cloud Administration,"Mar 17, 2022",Codeup News,We're launching a new program out of San Anton...
5,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",Featured,On this International Women's Day 2022 we want...
6,Codeup Start Dates for March 2022,"Jan 26, 2022",Codeup News,As we approach the end of January we wanted to...
7,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022","Codeup News, Dallas Newsletter, Featured, Tips...",We are so happy to announce that VET TEC benef...
8,Dallas Campus Re-opens With New Grant Partner,"Dec 30, 2021","Codeup News, Featured",We are happy to announce that our Dallas campu...
9,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021","Codeup News, Employers",Our Placement Team is simply defined as a grou...


---
2. News Articles
 - We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.
 - Write a function that scrapes the news articles for the following topics:
     - Business
     - Sports
     - Technology
     - Entertainment
 - The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:
 
    {
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
    }

In [33]:
url = 'https://inshorts.com/en/read/business'
response = get(url)
response

<Response [200]>

In [34]:
response.text[:500]

'<!doctype html>\n<html lang="en">\n\n<head>\n  <meta charset="utf-8" />\n  <style>\n    /* The Modal (background) */\n    .modal_contact {\n        display: none; /* Hidden by default */\n        position: fixed; /* Stay in place */\n        z-index: 8; /* Sit on top */\n        left: 0;\n        top: 0;\n        width: 100%; /* Full width */\n        height: 100%;\n        overflow: auto; /* Enable scroll if needed */\n        background-color: rgb(0,0,0); /* Fallback color */\n        background-color: rgba(0,'

In [35]:
soup = BeautifulSoup(response.text, 'html.parser')

In [36]:
news = soup.select('.news-card')

In [37]:
short = news[0]

In [38]:
short.find('span', itemprop= 'headline').text

'Rupee closes at all-time low of 77.50 against US dollar'

In [39]:
short.find('span', class_= 'author').text

'Pragya Swastik'

In [40]:
short.find('span', class_= 'date').text

'09 May'

In [41]:
short.find('span', class_= 'time').text

'08:57 pm'

In [42]:
short.find('div', itemprop= 'articleBody').text

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paise over its previous close. During the trading session, the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit.'

In [44]:
def parse_shorts(short):
     '''
    This function takes the information scraped from shorts page and return a dataframe with:
    title, date, tags and content of each article 
    '''
    output = {}
    output['title'] = short.find('span', itemprop= 'headline').text
    output['date'] = short.find('span', class_= 'date').text, short.find('span', class_= 'time').text
    output['author'] = short.find('span', class_= 'author').text
    output['content'] = short.find('div', itemprop= 'articleBody').text
    return output

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [None]:
parse_shorts(short)

{'title': 'Rupee hits all-time low of 77.42 against US dollar',
 'date': ('09 May', '10:35 am'),
 'author': 'Apaar Sharma',
 'content': 'The Indian rupee fell to an all-time low of 77.42 against the US dollar on Monday, Reuters reported. Asian markets were lower on Monday as US stock futures fell on fears of more policy tightening from the Federal Reserve and strict lockdown in Shanghai impacting global growth, according to Reuters.'}

In [None]:
pd.DataFrame([parse_shorts(short) for short in news])

Unnamed: 0,title,date,author,content
0,Rupee hits all-time low of 77.42 against US do...,"(09 May, 10:35 am)",Apaar Sharma,The Indian rupee fell to an all-time low of 77...
1,Bitcoin falls to the lowest level since Januar...,"(09 May, 02:50 pm)",Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in..."
2,Made best possible decision: IndiGo on barring...,"(09 May, 03:20 pm)",Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...
3,I will do my best to stay alive: Musk to his m...,"(09 May, 09:51 am)",Ridham Gambhir,Soon after Tesla CEO Elon Musk shared a tweet ...
4,"If I die under mysterious circumstances, nice ...","(09 May, 08:50 am)",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und..."
5,"Japan will cease to exist, Venice dying due to...","(09 May, 06:47 pm)",Pragya Swastik,The world's richest man Elon Musk recently twe...
6,Uber will treat hiring as a 'privilege' to cut...,"(09 May, 04:54 pm)",Pragya Swastik,Uber CEO Dara Khosrowshahi told employees in a...
7,GST Council may consider highest 28% tax on cr...,"(09 May, 10:22 am)",Ridham Gambhir,The GST Council is considering levying a 28% t...
8,Investigating it myself: Scindia after IndiGo ...,"(09 May, 09:40 am)",Ridham Gambhir,Union Civil Aviation Minister Jyotiraditya Sci...
9,FM Sitharaman gets water for NSDL MD during he...,"(08 May, 08:05 pm)",Ridham Gambhir,A video has surfaced on social media wherein F...


In [None]:
def scrape_busines_shorts():
    '''
    This function will scrape business articles from inshorts webpage
    '''
    url = 'https://inshorts.com/en/read/business'
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    shorts = soup.select('.news-card')
    df = pd.DataFrame([parse_shorts(short) for short in news])
    return df

In [None]:
business = scrape_busines_shorts()
business

Unnamed: 0,title,date,author,content
0,Rupee hits all-time low of 77.42 against US do...,"(09 May, 10:35 am)",Apaar Sharma,The Indian rupee fell to an all-time low of 77...
1,Bitcoin falls to the lowest level since Januar...,"(09 May, 02:50 pm)",Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in..."
2,Made best possible decision: IndiGo on barring...,"(09 May, 03:20 pm)",Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...
3,I will do my best to stay alive: Musk to his m...,"(09 May, 09:51 am)",Ridham Gambhir,Soon after Tesla CEO Elon Musk shared a tweet ...
4,"If I die under mysterious circumstances, nice ...","(09 May, 08:50 am)",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und..."
5,"Japan will cease to exist, Venice dying due to...","(09 May, 06:47 pm)",Pragya Swastik,The world's richest man Elon Musk recently twe...
6,Uber will treat hiring as a 'privilege' to cut...,"(09 May, 04:54 pm)",Pragya Swastik,Uber CEO Dara Khosrowshahi told employees in a...
7,GST Council may consider highest 28% tax on cr...,"(09 May, 10:22 am)",Ridham Gambhir,The GST Council is considering levying a 28% t...
8,Investigating it myself: Scindia after IndiGo ...,"(09 May, 09:40 am)",Ridham Gambhir,Union Civil Aviation Minister Jyotiraditya Sci...
9,FM Sitharaman gets water for NSDL MD during he...,"(08 May, 08:05 pm)",Ridham Gambhir,A video has surfaced on social media wherein F...


In [None]:
def scrape_sports_shorts():
    '''
    This function will scrape sports articles from inshorts webpage
    '''
    url = 'https://inshorts.com/en/read/sports'
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    shorts = soup.select('.news-card')
    df = pd.DataFrame([parse_shorts(short) for short in news])
    return df

In [None]:
sports = scrape_sports_shorts()
sports

Unnamed: 0,title,date,author,content
0,Rupee hits all-time low of 77.42 against US do...,"(09 May, 10:35 am)",Apaar Sharma,The Indian rupee fell to an all-time low of 77...
1,Bitcoin falls to the lowest level since Januar...,"(09 May, 02:50 pm)",Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in..."
2,Made best possible decision: IndiGo on barring...,"(09 May, 03:20 pm)",Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...
3,I will do my best to stay alive: Musk to his m...,"(09 May, 09:51 am)",Ridham Gambhir,Soon after Tesla CEO Elon Musk shared a tweet ...
4,"If I die under mysterious circumstances, nice ...","(09 May, 08:50 am)",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und..."
5,"Japan will cease to exist, Venice dying due to...","(09 May, 06:47 pm)",Pragya Swastik,The world's richest man Elon Musk recently twe...
6,Uber will treat hiring as a 'privilege' to cut...,"(09 May, 04:54 pm)",Pragya Swastik,Uber CEO Dara Khosrowshahi told employees in a...
7,GST Council may consider highest 28% tax on cr...,"(09 May, 10:22 am)",Ridham Gambhir,The GST Council is considering levying a 28% t...
8,Investigating it myself: Scindia after IndiGo ...,"(09 May, 09:40 am)",Ridham Gambhir,Union Civil Aviation Minister Jyotiraditya Sci...
9,FM Sitharaman gets water for NSDL MD during he...,"(08 May, 08:05 pm)",Ridham Gambhir,A video has surfaced on social media wherein F...


In [None]:
def scrape_tech_shorts():
    '''
    This function will scrape technology articles from inshorts webpage
    '''
    url = 'https://inshorts.com/en/read/technology'
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    shorts = soup.select('.news-card')
    df = pd.DataFrame([parse_shorts(short) for short in news])
    return df

In [None]:
tech = scrape_tech_shorts()
tech

Unnamed: 0,title,date,author,content
0,Rupee hits all-time low of 77.42 against US do...,"(09 May, 10:35 am)",Apaar Sharma,The Indian rupee fell to an all-time low of 77...
1,Bitcoin falls to the lowest level since Januar...,"(09 May, 02:50 pm)",Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in..."
2,Made best possible decision: IndiGo on barring...,"(09 May, 03:20 pm)",Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...
3,I will do my best to stay alive: Musk to his m...,"(09 May, 09:51 am)",Ridham Gambhir,Soon after Tesla CEO Elon Musk shared a tweet ...
4,"If I die under mysterious circumstances, nice ...","(09 May, 08:50 am)",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und..."
5,"Japan will cease to exist, Venice dying due to...","(09 May, 06:47 pm)",Pragya Swastik,The world's richest man Elon Musk recently twe...
6,Uber will treat hiring as a 'privilege' to cut...,"(09 May, 04:54 pm)",Pragya Swastik,Uber CEO Dara Khosrowshahi told employees in a...
7,GST Council may consider highest 28% tax on cr...,"(09 May, 10:22 am)",Ridham Gambhir,The GST Council is considering levying a 28% t...
8,Investigating it myself: Scindia after IndiGo ...,"(09 May, 09:40 am)",Ridham Gambhir,Union Civil Aviation Minister Jyotiraditya Sci...
9,FM Sitharaman gets water for NSDL MD during he...,"(08 May, 08:05 pm)",Ridham Gambhir,A video has surfaced on social media wherein F...


In [None]:
def scrape_entertainment_shorts():
    '''
    This function will scrape entertainment articles from inshorts webpage
    '''
    url = 'https://inshorts.com/en/read/entertainment'
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    shorts = soup.select('.news-card')
    df = pd.DataFrame([parse_shorts(short) for short in news])
    return df

In [None]:
entertainment = scrape_entertainment_shorts()
entertainment

Unnamed: 0,title,date,author,content
0,Rupee hits all-time low of 77.42 against US do...,"(09 May, 10:35 am)",Apaar Sharma,The Indian rupee fell to an all-time low of 77...
1,Bitcoin falls to the lowest level since Januar...,"(09 May, 02:50 pm)",Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in..."
2,Made best possible decision: IndiGo on barring...,"(09 May, 03:20 pm)",Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...
3,I will do my best to stay alive: Musk to his m...,"(09 May, 09:51 am)",Ridham Gambhir,Soon after Tesla CEO Elon Musk shared a tweet ...
4,"If I die under mysterious circumstances, nice ...","(09 May, 08:50 am)",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und..."
5,"Japan will cease to exist, Venice dying due to...","(09 May, 06:47 pm)",Pragya Swastik,The world's richest man Elon Musk recently twe...
6,Uber will treat hiring as a 'privilege' to cut...,"(09 May, 04:54 pm)",Pragya Swastik,Uber CEO Dara Khosrowshahi told employees in a...
7,GST Council may consider highest 28% tax on cr...,"(09 May, 10:22 am)",Ridham Gambhir,The GST Council is considering levying a 28% t...
8,Investigating it myself: Scindia after IndiGo ...,"(09 May, 09:40 am)",Ridham Gambhir,Union Civil Aviation Minister Jyotiraditya Sci...
9,FM Sitharaman gets water for NSDL MD during he...,"(08 May, 08:05 pm)",Ridham Gambhir,A video has surfaced on social media wherein F...
