In [1]:
import sys
import pandas as pd
sys.path.append("../utils/")
from fetchgooglenews import GoogleNews
from fetchcontextweb import fetch_context_web
import datetime
import math
import time

In [2]:
startdate = datetime.date(2016,3,31)
enddate = datetime.date(2021,4,16)

In [3]:
def generate_periods(startdate, enddate, length=30, shift=-1):
    totaldays = (enddate - startdate).days + 1
    num_periods = math.ceil(totaldays/length)
    output_dates = []
    for i in range(0, num_periods):
        if i < num_periods-1:
            leftdate = startdate + datetime.timedelta(days=i*length)
            rightdate = startdate + datetime.timedelta(days=((i+1)*length+shift))
        else:
            leftdate = startdate + datetime.timedelta(days=i*length)
            rightdate = enddate
        leftdate_str = datetime.date.strftime(leftdate, "%m/%d/%Y")
        rightdate_str = datetime.date.strftime(rightdate, "%m/%d/%Y")
        output_dates.append((leftdate_str, rightdate_str))
    return output_dates

In [4]:
def get_news(keywords, time_periods, outname, MAX_PAGE=5, duplicate=True, pause=1):
    # Download news by scrapping google search news section
    results_all = []
    count = 0
    for leftdate, rightdate in time_periods:
        # Duplicate will copy same article twice, and set one at 4am one at 12pm
        googlenews = GoogleNews(lang='en', start=leftdate,end=rightdate, numperpage=50, duplicate=duplicate)
        googlenews.search(keywords)
        for i in range(2,MAX_PAGE+1):
            time.sleep(pause)
            googlenews.get_page(i)
            if googlenews.failflag() == 1:
                break
        results_all.extend(googlenews.results())
        print("Finish Request from {} to {}, Get {} articles".format(leftdate, rightdate, len(googlenews.results())))
        time.sleep(10)
        count += 1
        if count % 10 == 0:
            pd.DataFrame(results_all).to_csv(outname.format(keywords,count//10))
            print("Reach 10 requests capacity, will sleep for 2000s. Last batch has been saved as period {}".format(count//10))
            time.sleep(2000)
    return results_all

In [5]:
def get_news_cw(keywords, time_periods, MAX_PAGE=5):
    # Download news by scrapping google search news section
    results_all = []
    
    for leftdate, rightdate in time_periods:
        results_period = []
        
        for i in range(1,MAX_PAGE+1):
            tmp_results, fail_flag = fetch_context_web(keywords, leftdate, rightdate)
            results_period.extend(tmp_results)
            if fail_flag == 1:
                break
        results_all.extend(results_period)
        print("Finish Request from {} to {}, Get {} articles".format(leftdate, rightdate, len(results_period)))
    return results_all

In [6]:
timeperiods = generate_periods(startdate, enddate, length=30, shift=-1)

#### Note

Scrapping google search has some risks. **The best practice is downloading 10 timeperiods at one time with `get_news`, wait for about half an hour, do another 10 timeperiods.**

At maximum, as I tried, we can keep downloading 14 timepriods, then the IP will be blocked for about 2 hour.

### Get Bitcoin news

In [7]:
keyword = 'Bitcoin'
output_file_name= "../data/news/data/GoogleNews_{}_large_period_{}.csv"
MAX_PAGE = 5

In [None]:
# Danger!!! This cell will make Google Search block your IP for about 2 hours
Bicoin_news = get_news(keyword,timeperiods[:16], MAX_PAGE=MAX_PAGE, duplicate=True)

In [11]:
df1 = pd.DataFrame(Bicoin_news)

In [15]:
df1.to_csv("../data/news/data/GoogleNews_{}_large_section1.csv".format(keyword))

### Now we have reached 06/24/2017 to 07/23/2017, should start from 07/24/2017

In [12]:
timeperiods[16]

('07/24/2017', '08/22/2017')

In [8]:
# This is a safe cell with "appropriate" wait time
time.sleep(2000)
Bitcoin_news2 = get_news(keyword,timeperiods[16:], outname=output_file_name, MAX_PAGE=MAX_PAGE, duplicate=True)

'NoneType' object is not iterable
Finish Request from 07/24/2017 to 08/22/2017, Get 356 articles
'NoneType' object is not iterable
Finish Request from 08/23/2017 to 09/21/2017, Get 390 articles
Finish Request from 09/22/2017 to 10/21/2017, Get 448 articles
'NoneType' object is not iterable
Finish Request from 10/22/2017 to 11/20/2017, Get 290 articles
'NoneType' object is not iterable
Finish Request from 11/21/2017 to 12/20/2017, Get 250 articles
Finish Request from 12/21/2017 to 01/19/2018, Get 416 articles
'NoneType' object is not iterable
Finish Request from 01/20/2018 to 02/18/2018, Get 338 articles
Finish Request from 02/19/2018 to 03/20/2018, Get 442 articles
Finish Request from 03/21/2018 to 04/19/2018, Get 492 articles
Finish Request from 04/20/2018 to 05/19/2018, Get 448 articles
Reach 10 requests capacity, will sleep for 2000s. Last batch has been saved as period 1
'NoneType' object is not iterable
Finish Request from 05/20/2018 to 06/18/2018, Get 344 articles
Finish Request 

#### Note

It turns out that, on Google search news section, when the publish date is too close to the current date (like 1-2 months ago), the date will show as something like "1 month ago", "3 weeks ago", et al. So it is hard to get exact publish date through current scrap tools. **The best practice is to stop at some time earlier and use another News API to download recent articles.** 

So I will drop articles later than 03/04/2021 and download articles from 03/05/2021 to 04/16/2021 by News APIs. 

In [10]:
df_all = pd.DataFrame(Bitcoin_news2)
df_all_short = df_all[df_all.datetime<=datetime.datetime(2021,3,5,0,0)]
df_all_short.datetime.max()

Timestamp('2021-03-04 12:00:00')

In [11]:
df_all_short.to_csv("../data/news/data/GoogleNews_Bitcoin_large_section_left.csv")

#### Use context web search API to download recent News

In [50]:
recent_periods = generate_periods(datetime.date(2021,3,5), datetime.date(2021,4,17), length=10, shift=0)

In [51]:
recent_periods

[('03/05/2021', '03/15/2021'),
 ('03/15/2021', '03/25/2021'),
 ('03/25/2021', '04/04/2021'),
 ('04/04/2021', '04/14/2021'),
 ('04/14/2021', '04/17/2021')]

In [57]:
Bitcoin_news6 = get_news_cw(keyword, recent_periods, MAX_PAGE=5)

Finish Request from 03/05/2021 to 03/15/2021, Get 42 articles
Finish Request from 03/15/2021 to 03/25/2021, Get 36 articles
Finish Request from 03/25/2021 to 04/04/2021, Get 250 articles
Finish Request from 04/04/2021 to 04/14/2021, Get 250 articles
Finish Request from 04/14/2021 to 04/17/2021, Get 250 articles


In [58]:
df6 = pd.DataFrame(Bitcoin_news6)

In [62]:
df6_short = df6[df6.datetime < datetime.datetime(2021,4,17,0,0)]

In [63]:
df6_short.datetime.max()

Timestamp('2021-04-16 20:37:04')

In [64]:
df6_short.to_csv("../data/news/data/GoogleNews_{}_large_section6.csv".format(keyword))

#### Combine the news from different sections 

In [12]:
import glob

In [13]:
files = [file for file in glob.glob("../data/news/data/GoogleNews_{}_large_section*".format(keyword))]

In [14]:
files

['../data/news/data/GoogleNews_Bitcoin_large_section6.csv',
 '../data/news/data/GoogleNews_Bitcoin_large_section1.csv',
 '../data/news/data/GoogleNews_Bitcoin_large_section_left.csv']

In [15]:
dateframe_all = pd.read_csv(files[0], index_col=0)
for file in files[1:]:
    dateframe_all = pd.concat([dateframe_all,pd.read_csv(file, index_col=0)])

In [16]:
dateframe_all.sort_values(by=['datetime'], inplace=True)

In [17]:
dateframe_all.to_csv("../data/news/data/GoogleNews_Bitcoin_large_all.csv", index=False)