In [None]:
# Modified by Bravo based on https://github.com/iAhsanJaved/FetchGoogleNews/blob/master/main.py
pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/211.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m174.1/211.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m12.5 MB/s[0m 

In [None]:
# import the needed libraries
import requests
import pandas as pd
import time # for timing script
import xml.etree.ElementTree as ET # built in library
from datetime import datetime, timedelta
from newspaper import Article

def clean_url(search_term, data_filter):
    """
    Build the Google News RSS search URL for a search term and a date filter.
     ---------------------------------------------------
    Parameters:
      search_term : text to search for. It is URL-encoded before being placed
                    in the query string, so spaces and characters such as
                    '&' or '#' are safe to pass.
      data_filter : 'today'      - news released after yesterday's date
                    'this_week'  - news released in the last 7 days
                    'this_month' - news released in the last 30 days
                    'this_year'  - news released after the previous calendar year
                    number       - int/str number of days to look back
                    anything else (e.g. '' or None) - no date restriction
    Returns:
      str : the URL to be fetched
    """
    # Local import so the block does not depend on the file's import section.
    from urllib.parse import quote_plus

    now = datetime.today()
    today = now.date().isoformat()

    def days_ago(days):
        # ISO date (YYYY-MM-DD) of the day `days` days before now.
        return (now - timedelta(days=days)).date().isoformat()

    # 'after%3A' / 'before%3A' are the URL-encoded 'after:' / 'before:'
    # search operators appended to the query.
    if data_filter == 'today':
        date_clause = 'after%3A' + days_ago(1)
    elif data_filter == 'this_week':
        date_clause = 'after%3A' + days_ago(7) + '+before%3A' + today
    elif data_filter in ('this_month', 'this month'):
        # Both spellings are accepted because earlier documentation of this
        # function used the one with a space.
        date_clause = 'after%3A' + days_ago(30) + '+before%3A' + today
    elif data_filter == 'this_year':
        date_clause = 'after%3A' + str(now.year - 1)
    elif str(data_filter).isdigit():
        date_clause = 'after%3A' + days_ago(int(data_filter)) + '+before%3A' + today
    else:
        date_clause = ''

    query = quote_plus(str(search_term))
    return f'https://news.google.com/rss/search?q={query}+{date_clause}&hl=en-US&gl=US&ceid=US%3Aen'

# clear the description
def get_text(x):
    """
    Return the text inside the first ``<p>...</p>`` pair found in ``x``.

    Parameters:
      x : str or None - raw description markup.
    Returns:
      str : the text between the first '<p>' and the next '</p>'.
            If there is no '<p>' the input is returned unchanged, if the
            closing tag is missing everything after '<p>' is returned, and
            an empty or None input yields ''.
    """
    if not x:
        return ''
    opening = x.find('<p>')
    if opening == -1:
        # No paragraph markup: slicing with find()'s -1 would chop characters
        # off both ends, so hand the text back untouched.
        return x
    start = opening + len('<p>')
    end = x.find('</p>', start)
    if end == -1:
        return x[start:]
    return x[start:end]

def get_content(url):
    """
    Download the article at ``url`` and return its parsed text.

    Any exception raised while creating, downloading, or parsing the article
    is caught and its message is returned in place of the text.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = page.text
    except Exception as err:
        # Best effort: the caller gets the error message instead of a crash.
        result = str(err)
    return result

def get_news(search_term, data_filter=None):
    """
    Search Google News for "search_term" and collect headline metadata and
    article text for every item in the resulting RSS feed.

    Parameters:
      search_term : text to search for.
      data_filter : date restriction forwarded to clean_url (for example
                    'today', 'this_week', 'this_year', or a number of days);
                    None applies no restriction.
    Returns:
      pandas.DataFrame with the columns title, link, description, date,
      source and content, one row per feed item.
    Raises:
      requests.HTTPError if the feed request returns an error status.
    Side effects:
      Writes the result to '<search_term>_news.csv' (relative path).
    """
    feed_url = clean_url(search_term, data_filter)
    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(feed_url, timeout=30)
    response.raise_for_status()
    # The response body is an XML string, so parse the root directly from it.
    root = ET.fromstring(response.text)

    # Read every field from its own <item>, so a missing child element
    # cannot shift one column relative to the others.
    rows = []
    for item in root.findall('.//channel/item'):
        link = item.findtext('link', default='')
        rows.append({
            'title': item.findtext('title', default=''),
            'link': link,
            'description': get_text(item.findtext('description', default='')),
            'date': item.findtext('pubDate'),
            'source': item.findtext('source', default=''),
            # Only fetch the article when the item actually carries a link.
            'content': get_content(link) if link else '',
        })

    columns = ['title', 'link', 'description', 'date', 'source', 'content']
    df = pd.DataFrame(rows, columns=columns)
    # Convert the publication date strings to datetimes.
    df['date'] = pd.to_datetime(df['date'])
    # Save the result; the file name is derived from the search term.
    df.to_csv(f'{search_term}_news.csv', encoding='utf-8-sig', index=False)
    return df

if __name__ == "__main__":
    # Time the whole interactive run, including the wait for user input.
    started_at = time.time()
    search_term = input('Enter your search term here: ')
    # Look back a fixed five days for the entered term.
    data = get_news(search_term, data_filter=5)
    elapsed = time.time() - started_at
    print("Execution time", elapsed)


Enter your search term here: Apple
Execution time 71.39073753356934
