Newsfeed based on GDELT Project
conda create -n newsfeed python=3.7
python setup.py install
Based on the gdelt-doc-api, we use a continuous querying mechanism that splits the time range into multiple sub-ranges (by default, 60 minutes each).
- FIPS 2-letter countries list: please check: LOOK-UP COUNTRIES
- GKG Themes list: please check: LOOK-UP THEMES
The URL encoding reference: url encode
- GDELT DOC 2.0 API
- GDELT GEO 2.0 API # BETA VERSION
- GDELT TV 2.0 API # NOT YET
- GDELT Global Entity Graph
- GDELT Visual Global Entity Graph
- GDELT Global Difference Graph
- GDELT Global Frontpage Graph
from newsfeed.news.apis.filters import *
from newsfeed.news.apis.query import *
f = Art_Filter(
keyword = ["Exchange Rate", "World"],
start_date = "20211231000000",
end_date = "20211231010000",
country = ["China", "US"]
)
articles_30 = article_search(query_filter = f, max_recursion_depth = 100, time_range = 30)
articles_60 = article_search(query_filter = f, max_recursion_depth = 100, time_range = 60)
from newsfeed.news.apis.filters import *
from newsfeed.news.apis.query import *
f = Art_Filter(
keyword = ["Exchange Rate", "World"],
start_date = "2021-12-31-00-00-00",
end_date = "2021-12-31-01-00-00",
country = ["China", "US"]
)
timelineraw = timeline_search(query_filter = f, max_recursion_depth = 100, query_mode = "timelinevolraw")
from newsfeed.news.apis.filters import *
from newsfeed.news.apis.query import *
f = Art_Filter(
keyword = ["Exchange Rate", "World"],
country = ["China", "US"]
)
geo_7d = geo_search(query_filter = f, sourcelang="english", timespan=7)
query_mode:
- artlist:
article_search
- timeline:
timelinevol, timelinevolraw, timelinetone, timelinelang, timelinesourcecountry
Most of the parameters are the same as in gdelt-doc-api; however, to specify a precise date range, we remove the timespan
and use start_date
and time_range
for iteratively collecting articles.
For event database (both V1 and V2):
from newsfeed.news.db.events import *
# GDELT Event Database Version 1.0
gdelt_events_v1_events = EventV1(start_date = "2021-01-01", end_date = "2021-01-02")
results_v1_events = gdelt_events_v1_events.query()
results_v1_events_nowtime = gdelt_events_v1_events.query_nowtime()
# GDELT Event Database Version 2.0 - Event
gdelt_events_v2_events = EventV2(start_date = "2021-01-01-00-00-00", end_date = "2021-01-02-00-00-00")
results_v2_events = gdelt_events_v2_events.query()
results_v2_events_nowtime = gdelt_events_v2_events.query_nowtime()
# GDELT Event Database Version 2.0 - Mentions
gdelt_events_v2_mentions = EventV2(start_date = "2021-01-01-00-00-00", end_date = "2021-01-02-00-00-00", table = "mentions")
results_v2_mentions = gdelt_events_v2_mentions.query()
results_v2_mentions_nowtime = gdelt_events_v2_mentions.query_nowtime()
For GKG database (both V1 and V2):
from newsfeed.news.db.gkg import *
# GDELT GKG Database Version 1.0
gdelt_events_v1_gkg = GKGV1(start_date = "2021-01-01", end_date = "2021-01-02")
results_v1_gkg = gdelt_events_v1_gkg.query()
results_v1_gkg_nowtime = gdelt_events_v1_gkg.query_nowtime()
from newsfeed.news.db.gkg import *
# GDELT GKG Database Version 2.0
gdelt_events_v2_gkg = GKGV2(start_date = "2021-01-01-00-00-00", end_date = "2021-01-02-00-00-00")
results_v2_gkg = gdelt_events_v2_gkg.query()
results_v2_gkg_nowtime = gdelt_events_v2_gkg.query_nowtime()
For GEG, VGEG and GDG:
from newsfeed.news.db.others import *
# GDELT Global Entity Graph
gdelt_v3_geg = GEG(start_date = "2020-01-01", end_date = "2020-01-02")
gdelt_v3_geg_result = gdelt_v3_geg.query()
# GDELT Visual Global Entity Graph
gdelt_v3_vgeg = VGEG(query_date = "2020-01-01", domain = "CNN")
gdelt_v3_vgeg_result = gdelt_v3_vgeg.query()
# GDELT Global Difference Graph
gdelt_v3_gdg = GDG(query_date="2018-08-27-14-00-00")
gdelt_v3_gdg_result = gdelt_v3_gdg.query()
# GDELT Global Frontpage Graph
gdelt_v3_gfg = GFG(query_date="2018-03-02-02-00-00")
gdelt_v3_gfg_result = gdelt_v3_gfg.query()
Full-text downloader (based on newspaper3k
and Wayback Machine)
from newsfeed.utils import fulltext as ft
art = ft.download(url="https://english.news.cn/20220205/a4e93df9162e4053af64c392b5f5bfec/c.html")
print("full text: \n {}".format(art.text))