In [1]:
import sys
import pandas as pd
sys.path.append("../utils/")
from fetchgooglenews import GoogleNews
from fetchcontextweb import fetch_context_web
import datetime
import math
import time

In [2]:
# Overall date range of news to collect (both ends inclusive).
startdate = datetime.date(year=2016, month=3, day=31)
enddate = datetime.date(year=2021, month=4, day=16)

In [3]:
def generate_periods(startdate, enddate, length=30, shift=-1):
    """Split the range [startdate, enddate] into consecutive date windows.

    Args:
        startdate: First day of the range (``datetime.date``).
        enddate: Last day of the range (``datetime.date``), inclusive.
        length: Number of days between the starts of two consecutive windows.
        shift: Offset in days added to the end of every window except the
            last one. With ``-1`` a window ends the day before the next one
            starts; with ``0`` it ends on the day the next one starts.

    Returns:
        A list of ``(left, right)`` tuples of ``"%m/%d/%Y"`` strings. The last
        window always ends on ``enddate``. The list is empty when ``enddate``
        is earlier than ``startdate``.
    """
    date_format = "%m/%d/%Y"
    span_days = (enddate - startdate).days + 1
    window_count = math.ceil(span_days / length)
    final_index = window_count - 1

    def _window(index):
        # Every window starts a whole multiple of `length` days after startdate.
        window_start = startdate + datetime.timedelta(days=index * length)
        if index < final_index:
            window_end = startdate + datetime.timedelta(days=(index + 1) * length + shift)
        else:
            # The final window is cut off at the requested end date.
            window_end = enddate
        return window_start.strftime(date_format), window_end.strftime(date_format)

    return [_window(index) for index in range(window_count)]

In [11]:
def get_news(keywords, time_periods, outname, MAX_PAGE=5, duplicate=True, pause=1,
             period_pause=10, batch_size=10, batch_pause=2000):
    """Download news by scraping the Google search news section, period by period.

    Args:
        keywords: Search query passed to ``GoogleNews.search``.
        time_periods: Iterable of ``(leftdate, rightdate)`` pairs; each pair is
            passed to ``GoogleNews`` as ``start`` / ``end``.
        outname: Checkpoint path template; ``outname.format(batch_number)`` is
            the file written after every ``batch_size`` periods.
        MAX_PAGE: Highest page number requested per period. Pages 2..MAX_PAGE
            are requested with ``get_page`` after the initial ``search`` call.
        duplicate: Forwarded to ``GoogleNews``. Per the original author's note,
            it copies each article twice, one stamped 4am and one 12pm.
        pause: Seconds to sleep before each additional page request.
        period_pause: Seconds to sleep between two consecutive periods.
        batch_size: Number of periods per batch. After each full batch, all
            results gathered so far are written to a checkpoint CSV. A falsy
            value disables checkpoints and batch cool-downs.
        batch_pause: Seconds to sleep after a full batch, before the next one.

    Returns:
        A list with the results of every period, in request order.

    Notes:
        No sleep is performed after the final period: there is no further
        request to throttle, so the function returns as soon as the last
        checkpoint (if any) has been written.
    """
    # Materialize the periods so the final one can be recognised (generators work too).
    time_periods = list(time_periods)
    total_periods = len(time_periods)
    results_all = []
    for count, (leftdate, rightdate) in enumerate(time_periods, start=1):
        googlenews = GoogleNews(lang='en', start=leftdate, end=rightdate, numperpage=50, duplicate=duplicate)
        googlenews.search(keywords)
        for page in range(2, MAX_PAGE + 1):
            time.sleep(pause)
            googlenews.get_page(page)
            # Stop paging for this period as soon as the scraper reports a failure.
            if googlenews.failflag() == 1:
                break
        period_results = googlenews.results()
        results_all.extend(period_results)
        print("Finish Request from {} to {}, Get {} articles".format(leftdate, rightdate, len(period_results)))

        is_last_period = count == total_periods
        if not is_last_period:
            time.sleep(period_pause)
        if batch_size and count % batch_size == 0:
            batch_number = count // batch_size
            # The checkpoint holds everything downloaded so far, not only the latest batch.
            pd.DataFrame(results_all).to_csv(outname.format(batch_number))
            if is_last_period:
                print("Reach {} requests capacity. Last batch has been saved as period {}".format(batch_size, batch_number))
            else:
                print("Reach {} requests capacity, will sleep for {}s. Last batch has been saved as period {}".format(batch_size, batch_pause, batch_number))
                time.sleep(batch_pause)
    return results_all

In [5]:
def get_news_cw(keywords, time_periods, MAX_PAGE=5):
    """Download news for each time period through ``fetch_context_web``.

    Args:
        keywords: Search query forwarded to ``fetch_context_web``.
        time_periods: Iterable of ``(leftdate, rightdate)`` pairs.
        MAX_PAGE: Highest page number requested per period (pages start at 1).

    Returns:
        A list with the articles of every period, in request order.
    """
    collected = []
    for leftdate, rightdate in time_periods:
        period_articles = []
        page = 1
        while page <= MAX_PAGE:
            articles, failed = fetch_context_web(keywords, leftdate, rightdate, page=page)
            # Articles returned alongside a failure flag are still kept.
            period_articles.extend(articles)
            if failed == 1:
                break
            page += 1
        collected.extend(period_articles)
        print("Finish Request from {} to {}, Get {} articles".format(leftdate, rightdate, len(period_articles)))
    return collected

In [6]:
# 30-day windows over the whole range; shift=-1 ends each window the day before the next one starts.
timeperiods = generate_periods(startdate, enddate, length=30, shift=-1)

#### Note

Scraping Google search carries some risk of being blocked. **The best practice is to download 10 time periods at a time with `get_news`, wait about half an hour, and then download the next 10 time periods.**

At most, in my experience, about 14 time periods can be downloaded in a row before the IP is blocked for about 2 hours.

### Get Energy news

In [7]:
# Search query used for every news request.
keyword = 'oil gas energy'
# Checkpoint path template; get_news fills "{}" with the batch number.
output_file_name= "../data/news_Energy/data/GoogleNews_Energy_large_period_{}.csv"
# Highest result page requested per time period.
MAX_PAGE = 5

In [9]:
# Inspect the second-to-last period: the scrape below stops right before it.
timeperiods[-2]

('03/05/2021', '04/03/2021')

In [12]:
# Scrape every period except the last two; the most recent weeks are fetched separately
# because Google shows relative dates ("3 weeks ago") for recent articles.
Energy_news = get_news(keyword,timeperiods[:-2], outname=output_file_name, MAX_PAGE=MAX_PAGE, duplicate=True)

Finish Request from 03/31/2016 to 04/29/2016, Get 496 articles
Finish Request from 04/30/2016 to 05/29/2016, Get 430 articles
Finish Request from 05/30/2016 to 06/28/2016, Get 492 articles
Finish Request from 06/29/2016 to 07/28/2016, Get 496 articles
Finish Request from 07/29/2016 to 08/27/2016, Get 492 articles
Finish Request from 08/28/2016 to 09/26/2016, Get 456 articles
Finish Request from 09/27/2016 to 10/26/2016, Get 478 articles
Finish Request from 10/27/2016 to 11/25/2016, Get 426 articles
'NoneType' object is not iterable
Finish Request from 11/26/2016 to 12/25/2016, Get 382 articles
Finish Request from 12/26/2016 to 01/24/2017, Get 498 articles
Reach 10 requests capacity, will sleep for 2000s. Last batch has been saved as period 1
Finish Request from 01/25/2017 to 02/23/2017, Get 496 articles
Finish Request from 02/24/2017 to 03/25/2017, Get 492 articles
Finish Request from 03/26/2017 to 04/24/2017, Get 474 articles
Finish Request from 04/25/2017 to 05/24/2017, Get 500 artic

In [14]:
# Collect the scraped articles into a DataFrame.
df1 = pd.DataFrame(Energy_news)

In [15]:
# Preview the scraped articles.
df1

Unnamed: 0,title,source,date,datetime,desc,link
0,Senate Passes Legislation Tailored to a Modern...,The New York Times,"Apr 20, 2016",2016-04-20 04:00:00,The Senate on Wednesday passed the first broad...,https://www.nytimes.com/2016/04/21/us/politics...
1,Senate Passes Legislation Tailored to a Modern...,The New York Times,"Apr 20, 2016",2016-04-20 12:00:00,The Senate on Wednesday passed the first broad...,https://www.nytimes.com/2016/04/21/us/politics...
2,The surprising things Democrats and Republican...,The Washington Post,"th · Apr 19, 2016",2016-04-19 04:00:00,Democrats want to fight climate change and fun...,https://www.washingtonpost.com/news/energy-env...
3,The surprising things Democrats and Republican...,The Washington Post,"th · Apr 19, 2016",2016-04-19 12:00:00,Democrats want to fight climate change and fun...,https://www.washingtonpost.com/news/energy-env...
4,How Murkowski crafted an energy bill that 80 s...,Alaska Public Media,"Apr 20, 2016",2016-04-20 04:00:00,It allows more money for renewable energy rese...,https://www.alaskapublic.org/2016/04/20/how-mu...
...,...,...,...,...,...,...
28901,Soil Revolution Conference Virtual Presentatio...,Boulder County,"Feb 4, 2021",2021-02-04 12:00:00,Housing & Energy · Affordable ... Healthy Home...,https://www.bouldercounty.org/news/soil-revolu...
28902,Boulder County Health Coverage Guides are Here...,,"Feb 4, 2021",2021-02-04 04:00:00,Housing & Energy · Affordable ... Healthy Home...,https://www.bouldercounty.org/news/boulder-cou...
28903,Boulder County Health Coverage Guides are Here...,,"Feb 4, 2021",2021-02-04 12:00:00,Housing & Energy · Affordable ... Healthy Home...,https://www.bouldercounty.org/news/boulder-cou...
28904,KRBN ETF Update: Nobel-Prize Winning Economist...,,"Feb 11, 2021",2021-02-11 04:00:00,... Regional Greenhouse Gas Initiative (RGGI)....,https://www.prnewswire.com/news-releases/krbn-...


In [17]:
# Latest timestamp among the scraped articles.
df1.datetime.max()

Timestamp('2021-03-04 12:00:00')

In [16]:
# Save the scraped section to its own CSV file (the path contains no placeholder to fill).
df1.to_csv("../data/news_Energy/data/GoogleNews_Energy_large_section1.csv")

#### Note

It turns out that, in the Google search news section, when the publish date is too close to the current date (e.g. 1-2 months ago), the date is shown in relative form, such as "1 month ago" or "3 weeks ago". So it is hard to get the exact publish date with the current scraping tools. **The best practice is to stop at an earlier date and use another news API to download the recent articles.** 

So I will drop articles later than 03/04/2021 and download the articles from 03/05/2021 to 04/16/2021 with a news API instead. 

#### Use context web search API to download recent News

In [18]:
# 10-day windows over the recent span. With shift=0 each window ends on the day the next
# one starts, so boundary days are requested twice.
# NOTE(review): confirm whether the search API treats the end date as exclusive.
recent_periods = generate_periods(datetime.date(2021,3,5), datetime.date(2021,4,17), length=10, shift=0)

In [19]:
# Show the generated windows.
recent_periods

[('03/05/2021', '03/15/2021'),
 ('03/15/2021', '03/25/2021'),
 ('03/25/2021', '04/04/2021'),
 ('04/04/2021', '04/14/2021'),
 ('04/14/2021', '04/17/2021')]

In [20]:
# Fetch the recent periods through the context web search helper instead of the Google scraper.
Energy_news2 = get_news_cw(keyword, recent_periods, MAX_PAGE=5)

Finish Request from 03/05/2021 to 03/15/2021, Get 200 articles
Finish Request from 03/15/2021 to 03/25/2021, Get 147 articles
Finish Request from 03/25/2021 to 04/04/2021, Get 235 articles
Finish Request from 04/04/2021 to 04/14/2021, Get 250 articles
Finish Request from 04/14/2021 to 04/17/2021, Get 243 articles


In [21]:
# Collect the recent articles into a DataFrame.
df2 = pd.DataFrame(Energy_news2)

In [22]:
# Preview the recent articles.
df2

Unnamed: 0,title,source,date,datetime,desc,link
0,Guest Opinion: An update on the oil and gas in...,fortmorgantimes,"Mar 14, 2021",2021-03-14 21:41:11,"Crude oil prices are rising again, as OPEC+ ha...",https://www.fortmorgantimes.com/2021/03/14/gue...
1,Work continues to preserve NM's oil and gas in...,currentargus,"Mar 14, 2021",2021-03-14 15:42:01,Sen. Martin Heinrich and Ben Ray Lujn both urg...,https://www.currentargus.com/story/opinion/col...
2,Oil and gas industry touts value of Permian Ba...,currentargus,"Mar 14, 2021",2021-03-14 12:21:00,Activists worried all that economic opportunit...,https://www.currentargus.com/story/news/local/...
3,"ONGC's share in India's oil, gas production ju...",indiatimes,"Mar 14, 2021",2021-03-14 11:14:00,While Oil and Natural Gas Corporation (ONGC) m...,https://energy.economictimes.indiatimes.com/ne...
4,Your turn: Oil and gas production ban could hu...,demingheadlight,"Mar 14, 2021",2021-03-14 11:02:00,"""Your turn"" is a guest column format View Comm...",https://www.demingheadlight.com/story/opinion/...
...,...,...,...,...,...,...
1070,Oil posts highest finish since mid-March on a ...,marketwatch,"Apr 14, 2021",2021-04-14 11:30:00,"Oil futures end sharply higher on Wednesday, a...",https://www.marketwatch.com/story/oil-prices-j...
1071,3 Energy Stocks Set to Soar When American Oil ...,fool,"Apr 14, 2021",2021-04-14 10:00:00,Learn about 3 undervalued oil stocks that are ...,https://www.fool.com/investing/general/2015/05...
1072,Oil Prices: Is Another Tumble in the Forecast?...,fool,"Apr 14, 2021",2021-04-14 08:09:00,The International Energy Agencys latest foreca...,https://www.fool.com/investing/2017/11/19/oil-...
1073,Oil climbs nearly 5% on signs of increasing cr...,reuters,"Apr 14, 2021",2021-04-14 08:09:00,"[ ""Oil prices surged almost 5% on Wednesday,...",https://www.reuters.com/business/energy/oil-ri...


In [23]:
# Keep only the articles published before 2021-04-17 00:00.
df2_short = df2.loc[df2["datetime"] < datetime.datetime(2021, 4, 17)]

In [24]:
# Latest timestamp that survived the cutoff filter.
df2_short.datetime.max()

Timestamp('2021-04-16 23:28:49')

In [25]:
# Save the recent-news section to its own CSV file.
df2_short.to_csv("../data/news_Energy/data/GoogleNews_Energy_large_section2.csv")

#### Combine the news from different sections 

In [26]:
# Stack the scraped section and the recent section into one frame (each keeps its own row index).
dateframe_all = pd.concat([df1,df2_short])

In [27]:
# Order all articles chronologically.
dateframe_all = dateframe_all.sort_values(by="datetime")

In [28]:
# Remove rows that are identical in every column.
# NOTE(review): the boundary days shared by consecutive recent periods are a likely source of such rows — confirm.
dateframe_test = dateframe_all.drop_duplicates()

In [29]:
# Preview the combined, de-duplicated articles.
dateframe_test

Unnamed: 0,title,source,date,datetime,desc,link
136,Deep in the Heart of Texas… Methane is Leaking...,News and blogs | Environmental Defense Fund,"Mar 31, 2016",2016-03-31 04:00:00,"And, methane pollution, at its core, is wasted...",https://blogs.edf.org/energyexchange/2016/03/3...
214,Black Thursday: Layoffs hit 500 Wyoming coal m...,,"Mar 31, 2016",2016-03-31 04:00:00,"Peabody Energy cut 235 miners, and Arch Coal c...",https://www.wyofile.com/black-thursday-layoffs...
32,"As U.S. shale drillers suffer, even the bankru...",Reuters,"Mar 31, 2016",2016-03-31 04:00:00,More than 50 North American oil and gas produc...,https://www.reuters.com/article/us-usa-energy-...
480,DTE Energy's Appliance Recycling Program puts ...,,"Mar 31, 2016",2016-03-31 04:00:00,"DETROIT, March 31, 2016 /PRNewswire/ -- DTE En...",https://www.prnewswire.com/news-releases/dte-e...
170,"BP, China's Top Producer to Develop Unconventi...",Natural Gas Intelligence,"Mar 31, 2016",2016-03-31 04:00:00,The companies also agreed to share knowledge o...,https://www.naturalgasintel.com/bp-chinas-top-...
...,...,...,...,...,...,...
833,EXCLUSIVE Nigeria refines energy reform bill t...,oilandgas360,"Apr 16, 2021",2021-04-16 22:30:36,"The latest oil and gas news, dedicated to all ...",https://www.oilandgas360.com/exclusive-nigeria...
973,The Market Bloodbath: A Boon for Oil and Gas -...,fool,"Apr 16, 2021",2021-04-16 23:00:00,Following Benjamin Graham's ideas is all that ...,https://www.fool.com/investing/general/2011/08...
832,Kodiak Oil & Gas' Year in Review -- The Motley...,fool,"Apr 16, 2021",2021-04-16 23:00:00,A look back at a strong year for the Denver oi...,https://www.fool.com/investing/general/2011/12...
972,Qatar- QFRDI develops new method to fight corr...,menafn,"Apr 16, 2021",2021-04-16 23:04:25,"Qatar Foundation Research, Development, and In...",https://menafn.com/1101931382/Qatar-QFRDI-deve...


In [30]:
# Persist the combined articles; index=False leaves the row index out of the file.
dateframe_test.to_csv("../data/news_Energy/data/GoogleNews_Energy_large_all.csv", index=False)