### Webhose.io Data Crawling

#### Configure API 

In [81]:
pip install python-decouple

In [82]:
import webhoseio
from decouple import config

# Authenticate the client with the token that python-decouple reads from
# the WEBHOSE_TOKEN setting, so the secret never appears in the notebook.
webhoseio.config(token=config('WEBHOSE_TOKEN'))

# Search filter sent with the first request.
# NOTE(review): "ts" looks like an epoch-milliseconds lower bound and
# "is_first:true" like a first-post-per-thread filter — confirm against the API docs.
query_params = dict(
    q="language:english site_type:news organization:\"tesla\" is_first:true",
    ts="1588565428918",
    sort="published",
)

#### Get the first batch of 100 news posts and add them to <i>newsfeeds</i>, a Python list of JSON objects

In [83]:
# Send the first request to the "filterWebContent" endpoint using query_params;
# the response is kept in `output` and is indexed by key in the following cells.
output = webhoseio.query("filterWebContent", query_params)

In [87]:
# Start the collection with a copy of the posts from the first response,
# then show how many were received.
newsfeeds = list(output['posts'])
len(newsfeeds)

100

In [94]:
# Inspect the 'entities' field of the first collected post.
newsfeeds[0]['entities']

{'persons': [{'name': 'charlie wang', 'sentiment': 'none'},
  {'name': 'tesla', 'sentiment': 'none'},
  {'name': 'wang', 'sentiment': 'none'}],
 'organizations': [{'name': 'tesla', 'sentiment': 'none'},
  {'name': 'tesla model y deliveries in canada starting', 'sentiment': 'none'},
  {'name': 'facebook', 'sentiment': 'none'},
  {'name': 'confirms vancouver store share &', 'sentiment': 'none'}],
 'locations': [{'name': 'canada', 'sentiment': 'none'},
  {'name': 'west vancouver', 'sentiment': 'none'},
  {'name': 'vancouver', 'sentiment': 'none'}]}

#### Check the remaining API call limit and how many more posts are left (e.g. 1 call = 100 posts)

In [88]:
# 'requestsLeft' value reported by the most recent response.
output['requestsLeft']

441

In [89]:
# Optional: uncomment the next line to print the 'totalResults' field of the response.
# print(output['totalResults']) # to see total available posts
# 'moreResultsAvailable' from the latest response; used as the loop bound when paging.
num_more_posts = output['moreResultsAvailable']
num_more_posts

1037

#### Get the next batches and add them to <i>newsfeeds</i><br> Watch out for the number of get_next() calls against your Webhose.io API call limit

In [70]:
# Page through the remaining results, appending every batch to newsfeeds.
# `count` tracks the posts actually received rather than assuming a fixed
# 100 per call, so short batches do not end the crawl early.
count = 0
while count < num_more_posts:
    output = webhoseio.get_next()
    batch = output['posts']
    if not batch:
        # Nothing came back: stop instead of spending more API calls.
        break
    newsfeeds.extend(batch)
    count += len(batch)
    # Stop once the response reports no requests left.
    # NOTE(review): assumes every response carries 'requestsLeft'; the default keeps the loop going if not.
    if output.get('requestsLeft', 1) <= 0:
        break
print(len(newsfeeds))

10800


#### Write the posts to a data file, one JSON object per line

In [71]:
import json

# Persist the collected posts in JSON Lines form: each post is serialised
# with json.dumps and written on its own line.
with open("datasets/webhose_apple.json", "w") as data_file:
    for feed in newsfeeds:
        data_file.write(json.dumps(feed) + "\n")

#### Read the JSON file back into a Python list of JSON objects and confirm the count

In [72]:
# Read the file back, one JSON document per line. The `with` block closes
# the file handle as soon as the lines are loaded.
with open("datasets/webhose_apple.json") as data_file:
    json_data = data_file.readlines()

# Parse every line into a dict and report how many posts were recovered.
newsfeeds_read = [json.loads(line) for line in json_data]
print(len(newsfeeds_read))

10800


In [73]:
# Spot-check the second post that was read back from the file.
newsfeeds_read[1]

{'thread': {'uuid': '4aa124a2c78843f8404405e65cd69da7bfaed9c4',
  'url': 'http://omgili.com/ri/.wHSUbtEfZQVWCSCAg0UcdNr5_Jr5.9rcm.BgMo9Zwa_XTw1UsQ92zXPFZacmUOznR9FAnS8nu0qR4QbNrpO4xJ6h.awLU72832XEvC2yVr962k3sdtksI9qxhG9DiHVIX6kEY8G11HTa53TwRhREw--',
  'site_full': 'gadgets.ndtv.com',
  'site': 'ndtv.com',
  'site_section': 'https://feeds.feedburner.com/NDTV-LatestNews',
  'site_categories': ['media', 'tech'],
  'section_title': 'NDTV News -  Special',
  'title': 'iOS 14 Will Reportedly Support All iPhone Models Running iOS 13',
  'title_full': 'iOS 14 Will Reportedly Support All iPhone Models Running iOS 13',
  'published': '2020-06-03T09:58:00.000+03:00',
  'replies_count': 0,
  'participants_count': 0,
  'site_type': 'news',
  'country': 'US',
  'spam_score': 0.0,
  'main_image': 'https://i.gadgets360cdn.com/large/iPhone_11_Pro_max_display_1568757743031.jpg',
  'performance_score': 0,
  'domain_rank': 847,
  'social': {'facebook': {'likes': 0, 'comments': 0, 'shares': 0},
   'gplus':

In [74]:
# Print the title, URL and publication timestamp of the first post only.
# The one-element slice also makes this a no-op if the list is empty.
for feed in newsfeeds_read[:1]:
    for field in ('title', 'url', 'published'):
        print(feed[field])

New iPad Air may come with USB-C not Lightning Port
http://omgili.com/ri/.wHSUbtEfZRyyLJKogCgQjT1s16xuxgsWJnp7alwJlmPEpdpnAas4X6UKG8Bmm3WULIVKdg2cyT32lur0k6h.YIUNbhwlGkL0XiDXCCfgrSCkX0_x7zycPM1.pO1OGSB
2020-06-03T10:00:00.000+03:00
