In [275]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import csv
import re

In [117]:
def get_blog_links(url, headers):
    '''
    Collect the url of every blog article listed on the paginated blog index.

    Pages are requested as f'{url}{page}', starting at page 1 and counting up
    until a page does not return status 200 or contains no article links.

    Parameters
    ----------
    url : str
        Base url of the paginated index; the page number is appended directly
        to it (e.g. 'https://codeup.com/blog/page/').
    headers : dict
        HTTP headers sent with every request (e.g. a custom User-Agent).

    Returns
    -------
    list of str
        The href of every <a class="entry-featured-image-url"> tag found, in
        page order. Empty if the first page does not return status 200.
    '''
    # accumulated article urls across all pages
    link_list = []
    # start at page 1
    page = 1

    while True:
        # a timeout keeps a stalled connection from hanging the notebook
        response = requests.get(f'{url}{page}', headers=headers, timeout=30)
        # any status other than 200 (e.g. 404) marks the end of the pages
        if response.status_code != 200:
            break

        # convert our received content into soup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # href=True skips anchors without an href so the lookup below cannot fail
        article_list = soup.find_all('a', class_='entry-featured-image-url',
                                     href=True)
        # the parsed soup object is truthy even for an empty document, so test
        # the extracted links instead; this also stops a site that answers 200
        # for out-of-range pages from looping forever
        if not article_list:
            print('page url has not returned any content')
            break

        # add each href url link to the link_list
        link_list.extend(article['href'] for article in article_list)
        # move on to the next page
        page += 1

    # return the list of blog url links
    return link_list

In [118]:
def get_blog_content(url, headers):
    '''
    Gather the title and body text from a single blog page.

    Parameters
    ----------
    url : str
        Url of the blog article to scrape.
    headers : dict
        HTTP headers sent with the request (e.g. a custom User-Agent).

    Returns
    -------
    tuple
        (title, content). title is the text of the <h1 class="entry-title">
        tag and content is the text of the <div class="entry-content"> tag
        with every case-insensitive occurrence of the title removed and
        surrounding whitespace stripped. Either value is None when its tag is
        missing; both are None when the request does not return status 200.
    '''
    # defaults so that every exit path returns a well-defined pair
    title, content = None, None

    # get the content from this blog
    response = requests.get(url, headers=headers, timeout=30)
    # if there is a page error
    if response.status_code != 200:
        # display an error message
        print('page url has returned an error')
        return title, content

    # convert our received content into soup object
    soup = BeautifulSoup(response.content, 'html.parser')
    # find() returns None when the tag is absent, so look up before using .text
    title_tag = soup.find('h1', class_='entry-title')
    content_tag = soup.find('div', class_='entry-content')

    # gather the title from the <h1> tag
    if title_tag is not None:
        title = title_tag.text

    # if there is no blog body on the page
    if content_tag is None:
        # display an error message
        print('page has no content')
        return title, content

    # get the blog content text, strip extra chars
    content = content_tag.text.strip()
    if title:
        # create a regex to search for the title, while ignoring case
        compiled = re.compile(re.escape(title), re.IGNORECASE)
        # remove the title from the content; strip again because removing a
        # leading title leaves the newline that followed it
        content = compiled.sub('', content).strip()

    # return the scraped title and content for this blog
    return title, content

In [220]:
def get_blog_articles(fresh=False):
    '''
    Gather the title, url and content for every blog on the codeup website.

    Results are cached in 'codeup_blogs.csv' in the working directory.

    Parameters
    ----------
    fresh : bool, default False
        When False, a cached csv is used if it exists. When True, or when no
        cached file is found, the data is downloaded and the cache rewritten.

    Returns
    -------
    dict
        {row_number: {'title': ..., 'url': ..., 'content': ...}}. The same
        shape is returned whether the data came from the cache or from a
        fresh download.
    '''
    # assign a filename for cached data
    filename = 'codeup_blogs.csv'
    # if we dont need fresh data then check for a cached version
    if not fresh:
        # check for cached file
        if os.path.exists(filename):
            # if cached file exists, display a status message
            print(f'Opening file {filename} from local directory')
            # then open it
            cached = pd.read_csv(filename, index_col=0)
            # convert back to the {row_number: record} dictionary that the
            # download path returns, so callers always get the same type
            return cached.to_dict(orient='index')
        # if cached file not found, display a status message and fall through
        # to the download
        print(f'local file {filename} not found')

    print('Downloading data from Codeup website')
    # set the url to the codeup blog pages
    url = 'https://codeup.com/blog/page/'
    # Some websites don't accept the python-requests default user-agent
    headers = {'User-Agent': 'Codeup Data Science'}
    # display a status message
    print('Gathering blog links')
    # get a list of blog page urls
    link_list = get_blog_links(url, headers)
    # create an empty dictionary to store the results
    content_dict = {}
    # display a status message
    print('Gathering blog content')
    # cycle through all the gathered blog links
    for i, blog in enumerate(link_list):
        # gather the title and content for each page
        title, content = get_blog_content(blog, headers)
        # add a record to the dictionary with the title, url and content for the blog
        content_dict[i] = {'title': title, 'url': blog, 'content': content}

    # cache the dictionary data to a csv; an empty result is not cached so a
    # failed scrape is not served from the cache on the next call
    if content_dict:
        pd.DataFrame(content_dict).T.to_csv(filename)
    # return the dictionary of blogs
    return content_dict

In [2]:
url = 'https://codeup.com/blog/page/'
# Some websites don't accept the python-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science'} 
# response = requests.get(url, headers=headers)

In [221]:
blogs = get_blog_articles()

local file codeup_blogs.csv not found
Downloading data from Codeup website
Gathering blog links
Gathering blog content


In [222]:
blogs

{0: {'title': 'Women in tech: Panelist Spotlight – Magdalena Rahn',
  'url': 'https://codeup.com/featured/women-in-tech-panelist-spotlight/',
  'content': ['Mar 28, 2023 | Events, Featured',
   'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!',
   '',
   'Meet Magdalena!',
   'Magdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.',
   'We asked Mag

In [280]:
pd.DataFrame(blogs).T

Unnamed: 0,title,url,content
0,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/featured/women-in-tech-pane...,"[Mar 28, 2023 | Events, Featured, Codeup is ho..."
1,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/featured/women-in-tech-rach...,"[Mar 20, 2023 | Events, Featured, Codeup is ho..."
2,Women in tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/women-in-tech-p...,"[Mar 13, 2023 | Codeup News, Featured, Codeup ..."
3,Women in tech: Panelist Spotlight – Madeleine ...,https://codeup.com/events/women-in-tech-madele...,"[Mar 6, 2023 | Events, Featured, Codeup is hos..."
4,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/panelist-spotli...,"[Feb 16, 2023 | Codeup News, Events, Featured,..."
...,...,...,...
153,1. Collaboration,https://codeup.com/alumni-stories/codeup-prepa...,"[Nov 28, 2018 | Alumni Stories, By Joyce Yueh ..."
154,Latest Blog Articles,https://codeup.com/alumni-stories/why-im-thank...,"[Nov 21, 2018 | Alumni Stories, By Amy Yanaway..."
155,Latest Blog Articles,https://codeup.com/alumni-stories/path-codeup-...,"[Nov 14, 2018 | Alumni Stories, , \nBy Marcel..."
156,Latest Blog Articles,https://codeup.com/codeup-news/codeup-student-...,"[Nov 12, 2018 | Codeup News, Codeup welcomed t..."


In [218]:
pd.DataFrame(blogs).T.to_csv('test.csv')

In [219]:
# field_names = ['title', 'url', 'content']

# with open('test.csv', 'w') as csvfile:
#     writer = csv.DictWriter(csvfile, fieldnames = field_names)
#     writer.writeheader()
#     for i in blogs:
#         writer.writerows(blogs[i])

In [7]:
# NOTE(review): no cell above this one assigns `response` at notebook level
# (the request in the setup cell is commented out), so this relies on state
# left in the kernel by a cell further down.
response.status_code

200

In [124]:
if not True:
    print('y')
else:
    print('n')

n


In [9]:
response.content

b'<!DOCTYPE html>\n<html lang="en-US">\n<head>\n\t<meta charset="UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n\t<link rel="pingback" href="https://codeup.com/xmlrpc.php" />\n\n\t<script type="text/javascript">\n\t\tdocument.documentElement.className = \'js\';\n\t</script>\n\t\n\t<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin /><script id="diviarea-loader">window.DiviPopupData=window.DiviAreaConfig={"zIndex":1000000,"animateSpeed":400,"triggerClassPrefix":"show-popup-","idAttrib":"data-popup","modalIndicatorClass":"is-modal","blockingIndicatorClass":"is-blocking","defaultShowCloseButton":true,"withCloseClass":"with-close","noCloseClass":"no-close","triggerCloseClass":"close","singletonClass":"single","darkModeClass":"dark","noShadowClass":"no-shadow","altCloseClass":"close-alt","popupSelector":".et_pb_section.popup","initializeOnEvent":"et_pb_after_init_modules","popupWrapperClass":"area-outer-wrap","fullHeightClass":"full-height","openPopupClas

In [13]:
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    if soup:
        print('y')

y


In [15]:
soup.find_all('a', class_='entry-featured-image-url')

[<a class="entry-featured-image-url" href="https://codeup.com/featured/women-in-tech-panelist-spotlight/"><img alt="Women in tech: Panelist Spotlight – Magdalena Rahn" class="" decoding="async" height="250" loading="lazy" sizes="(max-width:479px) 479px, 100vw " src="https://tribucodeup.wpenginepowered.com/wp-content/uploads/2023/03/WOMEN-IN-TECH-1920-×-1080-px-3-1-400x250.png" srcset="https://tribucodeup.wpenginepowered.com/wp-content/uploads/2023/03/WOMEN-IN-TECH-1920-×-1080-px-3-1.png 479w, https://tribucodeup.wpenginepowered.com/wp-content/uploads/2023/03/WOMEN-IN-TECH-1920-×-1080-px-3-1-400x250.png 480w " width="400"/></a>,
 <a class="entry-featured-image-url" href="https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/"><img alt="Women in tech: Panelist Spotlight – Rachel Robbins-Mayhill" class="" decoding="async" height="250" loading="lazy" sizes="(max-width:479px) 479px, 100vw " src="https://tribucodeup.wpenginepowered.com/wp-content/uploads/2023/03/WOMEN-IN-TECH-1920

In [27]:
article_list = soup.find_all('a', class_='entry-featured-image-url')

In [28]:
article_list[0]['href']

'https://codeup.com/featured/women-in-tech-panelist-spotlight/'

In [29]:
link_list = [article['href'] for article in article_list]

In [30]:
link_list[0]

'https://codeup.com/featured/women-in-tech-panelist-spotlight/'

In [223]:
url = 'https://codeup.com/featured/women-in-tech-panelist-spotlight/'
# Some websites don't accept the python-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science'} 
response = requests.get(url, headers=headers)

In [224]:
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    if soup:
        print('y')

y


In [272]:
title = soup.find('h1', class_='entry-title').text

In [273]:
title

'Women in tech: Panelist Spotlight – Magdalena Rahn'

In [260]:
content = soup.find('div', class_='entry-content').text.strip()

In [261]:
content

'Women in tech: Panelist Spotlight – Magdalena Rahn\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\n\nMeet Magdalena!\nMagdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.\nWe asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data science methods, and 

In [277]:
compiled = re.compile(re.escape(title), re.IGNORECASE)
content = compiled.sub('', content)

In [278]:
content

'\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\n\nMeet Magdalena!\nMagdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.\nWe asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data science methods, and it’s been an encouragement to have such supportive

In [243]:
content_list = content.text

AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [232]:
content_list

[]

In [46]:
text_list = [i.text for i in content_list]

In [47]:
text_list

['Mar 28, 2023 | Events, Featured',
 'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!',
 '',
 'Meet Magdalena!',
 'Magdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.',
 'We asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data science methods, and it’

In [51]:
# Some websites don't accept the python-requests default user-agent so change the headers
headers = {'User-Agent': 'Codeup Data Science'} 
# set the base url that we are looking at
url = 'https://codeup.com/blog/'

In [49]:
# def get_blog_urls(url, headers):
#     # get a list of blog urls from the blog main page
#     response = requests.get(url, headers=headers)
    
#     # check if we have correct response and if there is content in our response
#     if response.status_code == 200:
#         # convert our received content into soup object
#         soup = BeautifulSoup(response.content, 'html.parser')
#         # have content, will travel
#         if soup:
#             article_list = soup.find_all('a', class_='entry-featured-image-url')
#             link_list = [article['href'] for article in article_list]
#     return link_list

In [52]:
# NOTE(review): get_blog_urls only exists as commented-out code in the cell
# above, so this call fails on a fresh kernel unless that definition is restored.
blog_links = get_blog_urls(url, headers)

In [53]:
blog_links

['https://codeup.com/featured/women-in-tech-panelist-spotlight/',
 'https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/',
 'https://codeup.com/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/',
 'https://codeup.com/events/women-in-tech-madeleine/',
 'https://codeup.com/codeup-news/panelist-spotlight-4/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/']

In [58]:
url = 'https://codeup.com/featured/women-in-tech-panelist-spotlight/'
content_dict['title'], content_dict['content'] = get_blog_content(url, headers)

In [60]:
content_dict

{'title': 'Women in tech: Panelist Spotlight – Magdalena Rahn',
 'content': ['Mar 28, 2023 | Events, Featured',
  'Codeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!',
  '',
  'Meet Magdalena!',
  'Magdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.',
  'We asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid 

In [73]:
content_dict = {}
for i, blog in enumerate(blog_links):
    title, content = get_blog_content(blog, headers)
    content_dict[i] = title, blog, content

AttributeError: 'dict' object has no attribute 'T'

In [76]:
blog_content = pd.DataFrame(content_dict).T.\
    rename(columns={0:'title', 1:'url', 2:'content'})

In [77]:
blog_content

Unnamed: 0,title,url,content
0,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/featured/women-in-tech-pane...,"[Mar 28, 2023 | Events, Featured, Codeup is ho..."
1,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/featured/women-in-tech-rach...,"[Mar 20, 2023 | Events, Featured, Codeup is ho..."
2,Women in tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/women-in-tech-p...,"[Mar 13, 2023 | Codeup News, Featured, Codeup ..."
3,Women in tech: Panelist Spotlight – Madeleine ...,https://codeup.com/events/women-in-tech-madele...,"[Mar 6, 2023 | Events, Featured, Codeup is hos..."
4,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/codeup-news/panelist-spotli...,"[Feb 16, 2023 | Codeup News, Events, Featured,..."
5,Black excellence in tech: Panelist Spotlight –...,https://codeup.com/events/black-excellence-in-...,"[Feb 13, 2023 | Codeup News, Events, Featured,..."


In [98]:
url = 'https://codeup.com/blog/page/'
# Some websites don't accept the python-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science'} 
# request the base url with no page number appended; the status recorded in
# the next cell is 404
response = requests.get(url, headers=headers)

In [79]:
response.status_code

404

In [99]:
page=1

In [100]:
f'{url}{page}'

'https://codeup.com/blog/page/1'

In [96]:
link_list

['https://codeup.com/featured/women-in-tech-panelist-spotlight/',
 'https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/',
 'https://codeup.com/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/',
 'https://codeup.com/events/women-in-tech-madeleine/',
 'https://codeup.com/codeup-news/panelist-spotlight-4/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-stephanie-jones/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight-james-cooper/',
 'https://codeup.com/events/black-excellence-in-tech-panelist-spotlight/',
 'https://codeup.com/tips-for-prospective-students/coding-bootcamp-or-self-learning/',
 'https://codeup.com/codeup-news/codeup-best-bootcamps/',
 'https://codeup.com/data-science/become-a-data-scientist/',
 'https://codeup.com/employers/hiring-tech-talent/',
 'https://codeup.com/cloud-administration/cap-funding-options/',
 'https://codeup.com/dallas-info/it-professionals-dallas/',
 'https://codeup.com/codeup-news

# 2. Gathering from https://inshorts.com/en/read
Write a function that scrapes the news articles for the following topics:
- Business
- Sports
- Technology
- Entertainment

Each scraped article should be represented as a dictionary of the following form:

{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}


In [281]:
url = 'https://inshorts.com/en/read/'

In [206]:
def get_news_articles_1():
    '''
    Gather news articles from https://inshorts.com in the categories
    business, sports, technology and entertainment.

    Returns
    -------
    dict
        {item_number: {'title': ..., 'content': ..., 'category': ...}} with
        item numbers counting up from 0 across all categories. 'title' is the
        text of a <span itemprop="headline"> tag and 'content' is the text of
        the matching <div itemprop="articleBody"> tag. A category whose page
        does not return status 200 contributes no items.
    '''
    # every category page lives directly under this url
    base_url = 'https://inshorts.com/en/read/'
    # the categories to scrape
    categories = ['business', 'sports', 'technology', 'entertainment']
    # create an empty dictionary for the results
    content_dict = {}
    # cycle through the categories
    for category in categories:
        # gather the data from this category's url
        response = requests.get(base_url + category, timeout=30)
        # skip a category whose page could not be fetched
        if response.status_code != 200:
            print('error code from request')
            continue
        # parse the html using beautiful soup
        soup = BeautifulSoup(response.content, 'html.parser')
        # the headline span holds only the article title; the
        # 'news-card-title' div used previously also contains the byline and
        # timestamp text
        headlines = soup.find_all('span', itemprop='headline')
        # the article text itself
        bodies = soup.find_all('div', itemprop='articleBody')
        # zip pairs titles with bodies and stops at the shorter list, so a
        # count mismatch cannot raise an IndexError
        for headline, body in zip(headlines, bodies):
            # the next free item number is the current size of the dictionary
            content_dict[len(content_dict)] = {'title': headline.text,
                                               'content': body.text,
                                               'category': category}
    # return the article content dictionary
    return content_dict

In [183]:
category_urls = {
    0: {'category':'business', 'url':'https://inshorts.com/en/read/business'},
    1: {'category':'sports', 'url':'https://inshorts.com/en/read/sports'},
    2: {'category':'technology', 'url':'https://inshorts.com/en/read/technology'},
    3: {'category':'entertainment', 'url':'https://inshorts.com/en/read/entertainment'}}

In [184]:
category_urls[0]

{'category': 'business', 'url': 'https://inshorts.com/en/read/business'}

In [282]:
response = requests.get(url)
response

<Response [200]>

In [283]:
soup = BeautifulSoup(response.content, 'html.parser')

In [285]:
soup.find_all('span', itemprop='headline')[0].text

'Kriti Sanon wears saree with 24-carat gold print, Abu Jani & Sandeep Khosla share pics'

In [151]:
# soup

In [149]:
titles = soup.find_all('div', class_='news-card-title news-right-box')

In [150]:
content = soup.find_all('div', class_='news-card-content news-right-box')

In [172]:
titles[2].text

"\n\nIndia's high streets for shopping ranked, Bengaluru's MG Road bags top spot\n\n\nshort by Pragya Swastik / \n      07:46 pm on 10 May 2023,Wednesday\n\n"

In [176]:
# scratch version of the per-category scrape, hard-coded to 'business'
content_dict = {}
# for i in range(len(category_urls)):
# NOTE(review): `url` is assigned here but never requested in this cell; the
# soup below is parsed from whatever `response` already holds in the kernel
url = 'https://inshorts.com/en/read/business'
soup = BeautifulSoup(response.content, 'html.parser')
# the headline spans (stored under the 'content' key below)
content = soup.find_all('span', itemprop='headline')
# the title card divs; their text also includes the byline and timestamp, as
# the recorded output of content_dict[0]['title'] shows
titles = soup.find_all('div', class_='news-card-title news-right-box')

# build one record per title card, indexed by position on the page
for i in range(len(titles)):
    content_dict[i] = {'title':titles[i].text,
                       'content':content[i].text,
                       'category':'business'}

In [189]:
content_dict[0]['title']

"\n\nMicrosoft won't give salary hikes to full-time employees this year\n\n\nshort by Pragya Swastik / \n      10:23 pm on 10 May 2023,Wednesday\n\n"

In [194]:
[category_urls[i]['category'] for i in category_urls]

['business', 'sports', 'technology', 'entertainment']

In [288]:
# select the category links
soup.select('li')

[<li class="active-category selected">All News</li>,
 <li class="active-category">India</li>,
 <li class="active-category">Business</li>,
 <li class="active-category">Sports</li>,
 <li class="active-category">World</li>,
 <li class="active-category">Politics</li>,
 <li class="active-category">Technology</li>,
 <li class="active-category">Startup</li>,
 <li class="active-category">Entertainment</li>,
 <li class="active-category">Miscellaneous</li>,
 <li class="active-category">Hatke</li>,
 <li class="active-category">Science</li>,
 <li class="active-category">Automobile</li>]

In [290]:
[li.text.lower() for li in soup.select('li')]

['all news',
 'india',
 'business',
 'sports',
 'world',
 'politics',
 'technology',
 'startup',
 'entertainment',
 'miscellaneous',
 'hatke',
 'science',
 'automobile']

In [293]:
# get rid of the all news
categories = [li.text.lower() for li in soup.select('li')][1:]
# the india category link is actually 'national'
categories[0] = 'national'

In [294]:
categories

['national',
 'business',
 'sports',
 'world',
 'politics',
 'technology',
 'startup',
 'entertainment',
 'miscellaneous',
 'hatke',
 'science',
 'automobile']

In [301]:
def get_news_articles():
    '''
    Gather news articles from https://inshorts.com for every category listed
    on the site's landing page (all <li> items except the first, 'all news').

    Returns
    -------
    list of dict
        One {'title': ..., 'content': ..., 'category': ...} dictionary per
        article, in category order. The list is empty when the landing page
        does not return status 200 or lists no categories.
    '''
    # results are collected here; defined first so every exit path can return it
    inshorts = []
    # assign the url
    url = 'https://inshorts.com/en/read/'
    # request the data from the url
    response = requests.get(url, timeout=30)
    # check if good status code
    if response.status_code != 200:
        print('error code from request')
        return inshorts

    # turn data into soup
    soup = BeautifulSoup(response.content, 'html.parser')
    # get rid of the all news category
    categories = [li.text.lower() for li in soup.select('li')][1:]
    # if the page lists no categories, give an error
    if not categories:
        print('no content found on page')
        return inshorts
    # the india category link is actually 'national'
    categories[0] = 'national'

    for category in categories:
        # each category has its own page under the base url
        response = requests.get(url + category, timeout=30)
        # skip a category whose page could not be fetched
        if response.status_code != 200:
            print('error code from request')
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        titles = [span.text for span in soup.find_all('span',
                                                      itemprop='headline')]
        content = [div.text for div in soup.find_all('div',
                                                     itemprop='articleBody')]

        # zip stops at the shorter list, so a mismatch between the number of
        # headlines and bodies cannot raise an IndexError
        for title, body in zip(titles, content):
            inshorts.append({
                'title': title,
                'content': body,
                'category': category
            })
    return inshorts

In [300]:
inshorts

[{'title': "Petition filed in HC challenging 'The Kerala Story' ban in Bengal",
  'content': "After West Bengal CM Mamata Banerjee banned the movie 'The Kerala Story' on Monday, a public interest litigation (PIL) has been filed against the same in Calcutta High Court. The petitioner on Wednesday told the court that the state government's decision is against the right to freedom of speech. The case will be heard by the court on May 15.",
  'category': 'national'},
 {'title': "Haryana CM makes 'The Kerala Story' tax-free in the state ",
  'content': "Haryana CM Manohar Lal Khattar on Wednesday said that 'The Kerala Story' has been made tax-free in the state. This comes after the movie was made tax-free in other BJP ruled states, including UP, MP and Uttarakhand. However, the movie was banned in West Bengal. It was done to avoid incidents of hatred in the state, CM Mamata Banerjee said. ",
  'category': 'national'},
 {'title': "Exit polls are done in hurry, there will be many errors: K'ta

In [297]:
titles

['Musk to pay $10,000 to Indian-American critic to settle defamation case',
 'Ola Electric to refund ₹130 crore to customers for its charger: Report',
 "Cutting ties with China 'unthinkable': Mercedes-Benz CEO",
 'Ather, TVS, Hero to refund EV charger cost to customers: Report',
 'Ban diesel 4-wheelers in big cities and towns by 2027, suggests govt panel',
 'Maruti Suzuki marks longest winning run since going public in 2003',
 'Lordstown shares fall 25% as Foxconn alleges $170 mn deal breach',
 "GM cuts hundreds of full-time contract workers' jobs in US",
 'Volvo Cars to slash 1,300 white-collar jobs to cut costs in Sweden',
 'EV ecosystem still not mature in India: Renault India MD & CEO',
 'Stellantis offers voluntary buyouts to 33,500 US employees',
 'Toyota subsidiary admits rigging safety test of 88,000 cars',
 'Passenger vehicle retail sales drop 4% in April 2023: FADA data',
 'Honda Cars records 33% Y-o-Y dip in domestic sales in April',
 'Tesla broke US labour law by silencing 