In [1]:
import sys
import pandas as pd
sys.path.append("../../../utils/")
from fetchgooglenews import GoogleNews
from fetchcontextweb import fetch_context_web
import datetime
import math
import time

In [2]:
# Crawl window for the first download run. generate_periods treats both
# dates as inclusive (the last window always ends exactly on enddate).
startdate = datetime.date(2010,1,1)
enddate = datetime.date(2016,3,30)

In [3]:
def generate_periods(startdate, enddate, length=30, shift=-1):
    """Cut the inclusive range [startdate, enddate] into consecutive windows.

    Window ``k`` starts ``k * length`` days after ``startdate``. Every window
    except the last ends ``(k + 1) * length + shift`` days after ``startdate``
    (with the default ``shift=-1`` that is the day before the next window
    starts); the last window always ends on ``enddate``.

    Args:
        startdate: First day of the range (``datetime.date``).
        enddate: Last day of the range, inclusive (``datetime.date``).
        length: Number of days between the starts of two successive windows.
        shift: Day offset applied to the end of every non-final window.

    Returns:
        A list of ``(start, end)`` tuples, both formatted as ``MM/DD/YYYY``
        strings. The list is empty when ``enddate`` is before ``startdate``.
    """
    date_format = "%m/%d/%Y"
    span_days = (enddate - startdate).days + 1
    window_count = math.ceil(span_days/length)
    final_index = window_count - 1

    windows = []
    for index in range(window_count):
        window_start = startdate + datetime.timedelta(days=index * length)
        if index == final_index:
            # The final window is clipped to the requested end date.
            window_end = enddate
        else:
            window_end = startdate + datetime.timedelta(days=(index + 1) * length + shift)
        windows.append((window_start.strftime(date_format), window_end.strftime(date_format)))
    return windows

In [4]:
def get_news(keywords, time_periods, outname, MAX_PAGE=5, duplicate=True, pause=1,
             period_pause=10, batch_size=10, batch_pause=2000):
    """Download news by scraping the Google search news section, period by period.

    For every ``(start, end)`` period a ``GoogleNews`` session is created, the
    first page is fetched with ``search`` and pages ``2..MAX_PAGE`` with
    ``get_page``; paging stops early once ``failflag()`` returns 1. The
    cumulative results are checkpointed to CSV after every ``batch_size``
    periods, and the function throttles itself between requests.

    No throttling sleep is performed after the final period: there is nothing
    left to download, so the function returns as soon as the last results
    (and, if due, the last checkpoint) are stored.

    Args:
        keywords: Search query string.
        time_periods: Iterable of ``(start, end)`` date strings, e.g. the
            output of ``generate_periods``.
        outname: Path template with one ``{}`` placeholder; it is filled with
            the checkpoint number (``periods_done // batch_size``).
        MAX_PAGE: Highest result page requested per period.
        duplicate: Forwarded to ``GoogleNews``.
        pause: Seconds to sleep before each additional page request.
        period_pause: Seconds to sleep between two periods.
        batch_size: Number of periods between checkpoints / long pauses.
            A falsy value (0 or None) disables checkpointing and long pauses.
        batch_pause: Seconds to sleep after a checkpoint when more periods
            remain.

    Returns:
        A list with the results of all periods, in request order.
    """
    # Materialise the periods so the last one can be recognised.
    time_periods = list(time_periods)
    total_periods = len(time_periods)
    results_all = []

    for count, (leftdate, rightdate) in enumerate(time_periods, start=1):
        # Duplicate will copy same article twice, and set one at 4am one at 12pm
        googlenews = GoogleNews(lang='en', start=leftdate, end=rightdate, numperpage=50, duplicate=duplicate)
        googlenews.search(keywords)
        for i in range(2, MAX_PAGE + 1):
            time.sleep(pause)
            googlenews.get_page(i)
            if googlenews.failflag() == 1:
                break

        # Fetch the period's results once and reuse them for storing and reporting.
        period_results = googlenews.results()
        results_all.extend(period_results)
        print("Finish Request from {} to {}, Get {} articles".format(leftdate, rightdate, len(period_results)))

        is_last = count == total_periods
        if not is_last:
            time.sleep(period_pause)

        if batch_size and count % batch_size == 0:
            checkpoint = count // batch_size
            # The checkpoint holds everything downloaded so far, not only the last batch.
            pd.DataFrame(results_all).to_csv(outname.format(checkpoint))
            if is_last:
                print("All periods finished. Results have been saved as period {}".format(checkpoint))
            else:
                print("Reach {} requests capacity, will sleep for {}s. Last batch has been saved as period {}".format(batch_size, batch_pause, checkpoint))
                time.sleep(batch_pause)
    return results_all

In [5]:
def get_news_cw(keywords, time_periods, MAX_PAGE=5):
    """Collect news for every period through ``fetch_context_web``.

    Pages ``1..MAX_PAGE`` are requested for each ``(start, end)`` period. The
    results of a page are always kept, and paging for the period stops once
    ``fetch_context_web`` reports a fail flag of 1.

    Args:
        keywords: Search query passed to ``fetch_context_web``.
        time_periods: Iterable of ``(start, end)`` pairs.
        MAX_PAGE: Highest page number requested per period.

    Returns:
        A list with the results of all periods, in request order.
    """
    collected = []

    for period_start, period_end in time_periods:
        period_hits = []
        page = 1
        while page <= MAX_PAGE:
            page_hits, failed = fetch_context_web(keywords, period_start, period_end, page=page)
            # Keep whatever came back, even from a page that reported failure.
            period_hits.extend(page_hits)
            if failed == 1:
                break
            page += 1

        collected.extend(period_hits)
        print("Finish Request from {} to {}, Get {} articles".format(period_start, period_end, len(period_hits)))
    return collected

In [6]:
# Consecutive 30-day windows; shift=-1 makes each window end the day before
# the next one starts, so the windows do not overlap.
timeperiods = generate_periods(startdate, enddate, length=30, shift=-1)

#### Note

Scraping Google search carries some risk of being blocked. **The best practice is to download 10 time periods at a time with `get_news`, wait for about half an hour, then download the next 10.**

At most, in my experience, about 14 time periods can be downloaded back to back; after that the IP is blocked for about 2 hours.

### Get Energy news

In [7]:
# Search query used for the energy-sector crawl.
keyword = 'oil gas energy'
# Checkpoint path template; get_news fills "{}" with the checkpoint number.
output_file_name= "../data/energy/GoogleNews_Energy_Mega_period_{}.csv"
# Highest result page requested per time period.
MAX_PAGE = 5

In [8]:
timeperiods[-2]

('02/29/2016', '03/29/2016')

In [9]:
# Long-running cell: get_news pauses between periods and sleeps 2000s after
# each block of 10 periods, writing a checkpoint CSV at every block.
Energy_news = get_news(keyword,timeperiods, outname=output_file_name, MAX_PAGE=MAX_PAGE, duplicate=True)

Finish Request from 01/01/2010 to 01/30/2010, Get 482 articles
Finish Request from 01/31/2010 to 03/01/2010, Get 484 articles
Finish Request from 03/02/2010 to 03/31/2010, Get 470 articles
Finish Request from 04/01/2010 to 04/30/2010, Get 468 articles
Finish Request from 05/01/2010 to 05/30/2010, Get 480 articles
Finish Request from 05/31/2010 to 06/29/2010, Get 490 articles
Finish Request from 06/30/2010 to 07/29/2010, Get 484 articles
Finish Request from 07/30/2010 to 08/28/2010, Get 478 articles
Finish Request from 08/29/2010 to 09/27/2010, Get 480 articles
Finish Request from 09/28/2010 to 10/27/2010, Get 478 articles
Reach 10 requests capacity, will sleep for 2000s. Last batch has been saved as period 1
Finish Request from 10/28/2010 to 11/26/2010, Get 490 articles
Finish Request from 11/27/2010 to 12/26/2010, Get 488 articles
Finish Request from 12/27/2010 to 01/25/2011, Get 488 articles
Finish Request from 01/26/2011 to 02/24/2011, Get 500 articles
Finish Request from 02/25/2011

In [10]:
df1 = pd.DataFrame(Energy_news)

In [12]:
len(df1)

37322

In [13]:
df1.datetime.max()

Timestamp('2016-03-30 12:00:00')

In [14]:
# Saved with the default index column, so this file has to be read back
# with index_col=0.
df1.to_csv("../data/energy/GoogleNews_Energy_Mega_1.csv")

In [17]:
# Second run: back-fill 2009-01-01 .. 2009-10-27 (300 days, i.e. ten 30-day
# windows). This rebinds startdate/enddate/timeperiods from the first run.
startdate = datetime.date(2009,1,1)
enddate = datetime.date(2009,10,27)
timeperiods = generate_periods(startdate, enddate, length=30, shift=-1)

In [None]:
# NOTE: this reuses output_file_name, and checkpoint numbering restarts at 1,
# so the checkpoint written here overwrites the first run's "period_1" file.
# TODO(review): the cell that saves Energy_news2 to disk is not in this notebook.
Energy_news2 = get_news(keyword,timeperiods, outname=output_file_name, MAX_PAGE=MAX_PAGE, duplicate=True)

In [22]:
# Third run: the remainder of 2009 (65 days, i.e. three windows, the last one
# clipped to enddate). Rebinds startdate/enddate/timeperiods again.
startdate = datetime.date(2009,10,28)
enddate = datetime.date(2009,12,31)
timeperiods = generate_periods(startdate, enddate, length=30, shift=-1)

In [None]:
# Fewer than 10 periods, so get_news writes no checkpoint file here; the
# results exist only in energy_news3 until they are saved explicitly.
energy_news3 = get_news(keyword,timeperiods, outname=output_file_name, MAX_PAGE=MAX_PAGE, duplicate=True)

#### Combine the news from different sections 

In [30]:
# Reload every partial download from disk. This rebinds df1, which earlier
# held the in-memory frame built from Energy_news.
# NOTE(review): the first file is read without index_col=0 -- presumably it
# was saved with index=False; confirm, otherwise it adds an "Unnamed: 0" column.
df1 = pd.read_csv('../data/energy/GoogleNews_Energy_large_all.csv')
# These files carry a saved index column, which index_col=0 restores
# (Mega_1 is written above with the default index; Mega_2/Mega_3 are assumed
# to follow the same layout -- their writing cells are not in this notebook).
df2 = pd.read_csv('../data/energy/GoogleNews_Energy_Mega_1.csv', index_col=0)
df3 = pd.read_csv('../data/energy/GoogleNews_Energy_Mega_2.csv', index_col=0)
df4 = pd.read_csv('../data/energy/GoogleNews_Energy_Mega_3.csv', index_col=0)

In [31]:
# Stack the four partial frames row-wise. Original index labels are kept, so
# they may repeat; the final CSV is written without the index anyway.
dateframe_all = pd.concat([df1,df2,df3,df4])

In [32]:
# Sorts in place. The datetime column was loaded from CSV without date
# parsing (min/max below are plain strings), so the order is lexicographic --
# equivalent to chronological only while every value is 'YYYY-MM-DD HH:MM:SS'.
dateframe_all.sort_values(by=['datetime'], inplace=True)

In [33]:
# Drop rows that are identical in every column (overlap between the source
# files). Rows that differ only in their timestamp are kept.
dateframe_test = dateframe_all.drop_duplicates()

In [35]:
dateframe_test.datetime.min()

'2009-01-01 04:00:00'

In [36]:
dateframe_test.datetime.max()

'2021-04-16 23:28:49'

In [37]:
# Final combined dataset; index=False because the row labels carry no
# information after concatenation.
dateframe_test.to_csv("../data/energy/GoogleNews_Energy_Mega_all.csv", index=False)