In [17]:
import datetime
import io
import os
import time

import numpy as np
import pandas as pd
import requests
import transformers
from bs4 import BeautifulSoup
from newspaper import Article

# HTTP headers attached to the Google search request; the User-Agent
# imitates a desktop Safari browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/15.4 Safari/605.1.15"
    ),
    "Content-Type": "text/html",
}

# strptime/strftime pattern (month/day/year) used for every date string
# handled in this notebook.
fmt = "%m/%d/%Y"

In [32]:
def run_google_news_scrapper(**params):
    """Scrape one page of Google News results and append the articles to a CSV.

    Every outbound ``https://`` link on the result page (excluding Google,
    YouTube and Blogger links) is downloaded and parsed with ``newspaper``;
    links that fail to download or parse are skipped.

    Keyword Args:
        min_date (str): Required. Value placed in ``cd_min`` of the ``tbs``
            search parameter; also stored as ``find_date`` on every row.
        output_file (str): Required. CSV path. Created when missing,
            otherwise the new rows are appended to its current content.
        news (int): Required. Maximum number of articles to collect.
        max_date (str): Optional. Value placed in ``cd_max``; defaults to
            ``min_date`` (a single-day window).
        query (str): Optional. Search query; defaults to
            ``"oil and gas price"``.

    Returns:
        None. When the search request does not return HTTP 200 a failure
        banner is printed and nothing is written.

    Raises:
        TypeError: If a required keyword argument is missing.
    """
    # Validate up front so a missing argument fails before any network
    # traffic instead of surfacing later as an UnboundLocalError.
    missing = [key for key in ('min_date', 'output_file', 'news') if key not in params]
    if missing:
        raise TypeError(
            "run_google_news_scrapper() missing required keyword argument(s): "
            + ", ".join(missing)
        )

    min_date = params['min_date']
    max_date = params.get('max_date', min_date)
    output_file = params['output_file']
    news = params['news']
    query = params.get('query', "oil and gas price")

    # Kept under its own name so the caller's keyword arguments are not shadowed.
    query_params = {
        "q": query,              # search query
        "hl": "en",              # language of the search
        "gl": "us",              # country of the search
        "num": "100",            # number of search results per page
        "tbm": "nws",            # news results
        "tbs": "cdr:1,cd_min:{},cd_max:{}".format(min_date, max_date),  # date window
    }

    response = requests.get("https://www.google.com/search", headers=headers,
                            params=query_params, timeout=30)

    # Check the status before spending time parsing an error page.
    if response.status_code != 200:
        print("******** fail ********** ")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    news_data_dict = dict(url=[], text=[], publish_date=[], find_date=[])

    # Substrings identifying links that are not news articles.
    excluded = ('google.com', "https://www.youtube.com/", "https://www.blogger.com/")

    for link in soup.find_all('a'):
        # Stop once exactly `news` articles have been collected.
        if len(news_data_dict['url']) >= news:
            break

        link_str = str(link.get('href'))
        if not link_str.startswith("https://") or any(part in link_str for part in excluded):
            continue

        try:
            article = Article(link_str)
            article.download()
            article.parse()
        except Exception as exc:
            # Best effort: one unreachable or unparsable article must not
            # abort the whole day. KeyboardInterrupt still propagates.
            print("skipping {}: {}".format(link_str, exc))
            continue

        news_data_dict['url'].append(link_str)
        news_data_dict['text'].append(article.text)
        news_data_dict['publish_date'].append(article.publish_date)
        news_data_dict['find_date'].append(min_date)

    master_df = pd.DataFrame(news_data_dict)

    if os.path.exists(output_file):
        # Round-trip the new rows through CSV text (in memory, no scratch
        # file on disk) so they are typed and formatted exactly like the
        # rows already read back from `output_file`.
        new_rows = pd.read_csv(io.StringIO(master_df.to_csv(index=False)))
        combined = pd.concat([pd.read_csv(output_file), new_rows], ignore_index=True)
        combined.to_csv(output_file, index=False)
    else:
        master_df.to_csv(output_file, index=False)

In [33]:
def google_news_scrapper(start_date, end_date, news_count, output_file_name):
    """Scrape Google News one day at a time over an inclusive date range.

    Args:
        start_date (str): First day to scrape, formatted per module-level ``fmt``.
        end_date (str): Last day to scrape (inclusive), formatted per ``fmt``.
        news_count (int): Passed to ``run_google_news_scrapper`` as ``news``
            for each day.
        output_file_name (str): CSV path that each day's results are
            appended to and that is rewritten with a row index at the end.

    Returns:
        None. The result is the CSV at ``output_file_name``.

    Raises:
        ValueError: If a date string does not match ``fmt``.
        FileNotFoundError: If no day produced an output file.
    """
    one_day = datetime.timedelta(days=1)
    current_day = datetime.datetime.strptime(start_date, fmt)
    last_day = datetime.datetime.strptime(end_date, fmt)

    while current_day <= last_day:
        day_str = current_day.strftime(fmt)
        print(day_str)
        run_google_news_scrapper(min_date=day_str, max_date=day_str,
                                 news=news_count, output_file=output_file_name)
        # Random 2-4 second pause between daily requests.
        time.sleep(np.random.randint(2, 5))
        current_day += one_day

    master_df = pd.read_csv(output_file_name)

    # A file left by a previous run already carries a saved index column,
    # which pandas reads back as "Unnamed: N". Drop those so re-running does
    # not add one more stale index column each time.
    stale_index_cols = [col for col in master_df.columns if str(col).startswith('Unnamed:')]
    master_df = master_df.drop(columns=stale_index_cols).reset_index(drop=True)

    master_df.to_csv(output_file_name, index=True)

In [20]:
def clean_news_report(input_file_name, cleaned_output_file_name):
    """Reduce the raw scrape CSV to a date-indexed table of article texts.

    Steps: drop rows with a duplicate ``url`` (keeping the last occurrence),
    drop rows whose ``text`` is null, keep only ``find_date`` and ``text``,
    index by ``find_date`` parsed with module-level ``fmt``, and prepend a
    sequential ``text_ID`` column starting at 0.

    Args:
        input_file_name (str): CSV with at least ``url``, ``text`` and
            ``find_date`` columns.
        cleaned_output_file_name (str): Path the cleaned CSV is written to.

    Returns:
        None. The result is the CSV at ``cleaned_output_file_name``.
    """
    raw_df = pd.read_csv(input_file_name)

    deduped_df = raw_df.sort_index().drop_duplicates(subset=['url'], keep='last')

    # One vectorised pass instead of dropping null-text rows one label at a
    # time, which copied the whole frame on every drop.
    with_text_df = deduped_df.dropna(subset=['text'])

    cleaned_df = with_text_df[['find_date', 'text']].set_index('find_date', drop=True)
    cleaned_df.index = pd.to_datetime(cleaned_df.index, format=fmt)
    cleaned_df.insert(0, 'text_ID', range(0, len(cleaned_df)))

    cleaned_df.to_csv(cleaned_output_file_name)

In [19]:
if __name__ == "__main__":

    # Run configuration. Dates are month/day/year strings matching `fmt`.
    news_count = 50  # per-day article cap handed to the scraper
    start_date, end_date = '10/01/2022', '10/21/2022'
    news_raw_filename = 'google_news_data.csv'

    # Stage 1: scrape every day in the range into the raw CSV.
    google_news_scrapper(start_date, end_date, news_count, news_raw_filename)

    # Stage 2: derive "<name>_cleaned.csv" from "<name>.csv" and clean into it.
    news_cleaned_filename = news_raw_filename[:-len('.csv')] + '_cleaned.csv'
    clean_news_report(news_raw_filename, news_cleaned_filename)