### `Internal training - BuzzSumo's API`

In [1]:
# warnings
import warnings
warnings.filterwarnings('ignore')

# import modules
import pandas as pd
import requests
import json
import os

In [2]:
# load API KEY
path = os.path.abspath('../config/api_key.json')
with open(path) as auth:
    api_key = json.load(auth)['api_key']
    auth.close()

In [3]:
# load parameters
params_path = os.path.abspath('../api/endpoints/articles.json')
with open(params_path) as file:
    params = json.load(file)
    file.close()

In [4]:
# parameters
params['api_key'] = api_key
URL = params['url']

# deleting URL key -> This will be used as variable instead.
del params['url']

In [6]:
# request
req = requests.request(method='GET', url=URL, params=params)
results = req.json()

In [8]:
req.headers

{'Server': 'nginx/1.16.1', 'Date': 'Wed, 20 May 2020 03:45:11 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Vary': 'Accept-Encoding, Origin', 'X-RateLimit-Reset': '1589946321', 'X-RateLimit-Limit': '10', 'X-RateLimit-Month-Remaining': '98', 'X-RateLimit-Remaining': '9', 'X-UA-Compatible': 'IE=Edge,chrome=1', 'ETag': 'W/"f7f348a8ce2e9ed9323b0721c0137cad"', 'Cache-Control': 'max-age=0, private, must-revalidate', 'X-Request-Id': '9319454a8e891902f5e32a5a2ac492ca', 'X-Runtime': '1.206689', 'X-Rack-Cache': 'miss', 'Content-Encoding': 'gzip'}

In [14]:
dict(req.headers)

{'Server': 'nginx/1.16.1',
 'Date': 'Wed, 20 May 2020 03:45:11 GMT',
 'Content-Type': 'application/json; charset=utf-8',
 'Transfer-Encoding': 'chunked',
 'Connection': 'keep-alive',
 'Vary': 'Accept-Encoding, Origin',
 'X-RateLimit-Reset': '1589946321',
 'X-RateLimit-Limit': '10',
 'X-RateLimit-Month-Remaining': '98',
 'X-RateLimit-Remaining': '9',
 'X-UA-Compatible': 'IE=Edge,chrome=1',
 'ETag': 'W/"f7f348a8ce2e9ed9323b0721c0137cad"',
 'Cache-Control': 'max-age=0, private, must-revalidate',
 'X-Request-Id': '9319454a8e891902f5e32a5a2ac492ca',
 'X-Runtime': '1.206689',
 'X-Rack-Cache': 'miss',
 'Content-Encoding': 'gzip'}

In [10]:
results.keys()

dict_keys(['results', 'suggested_language', 'total_pages', 'total_results', 'display_code', 'search_type'])

In [12]:
total_results = len(results['results'])
print (f'Number of results --> {total_results}')

Number of results --> 46


### `Matching columns between dataset from BuzzSumo's webapp VS data from BuzzSumo's API`

<img src='./test/img_columns.png'>

### `Creating dataset`

In [32]:
dataset = pd.DataFrame(results['results'])
dataset.shape

(46, 46)

In [33]:
dataset.head()

Unnamed: 0,author_name,youtube_trending_score,alexa_rank,pinterest_shares,num_words,twitter_shares,love_count,language,og_url,video,...,published_date,total_shares,article_types,general_article,how_to_article,infographic,list,what_post,why_post,display_title
0,Fernanda Echavarri,,7477,3,517,12086,6.0,en,https://www.motherjones.com/coronavirus-update...,0,...,1585009182,34638,"[all_content, general_article]",1,0,0,0,0,0,The Most Irresponsible President on the Contin...
1,Latinus,,167666,1,53,280,115.0,en,https://latinus.us/2020/02/13/video-trump-noct...,0,...,1581638460,12585,"[all_content, general_article]",1,0,0,0,0,0,Video <strong class='highlight'>Trump</strong>...
2,David Agren,,135,0,868,690,5.0,en,http://www.theguardian.com/world/2020/jan/26/m...,0,...,1580032800,4007,"[all_content, general_article]",1,0,0,0,0,0,'Mexico has become <strong class='highlight'>T...
3,Carlos Loret De Mola,,236394,0,326,52,28.0,en,https://www.carlosloret.com/2020/01/amlo-my-br...,0,...,1580083200,2853,"[all_content, general_article]",1,0,0,0,0,0,"“<strong class='highlight'>AMLO</strong>, my b..."
4,Kristinn Taylor,,7246,0,535,1224,26.0,en,https://www.thegatewaypundit.com/2020/02/poll-...,0,...,1580844657,2416,"[all_content, general_article]",1,0,0,0,0,0,Poll Stunner: Super Majority of Mexicans Suppo...


In [34]:
dataset['published_date'] = dataset['published_date'].apply(lambda x: pd.Timestamp.fromtimestamp(x))

In [35]:
dataset.head(1)

Unnamed: 0,author_name,youtube_trending_score,alexa_rank,pinterest_shares,num_words,twitter_shares,love_count,language,og_url,video,...,published_date,total_shares,article_types,general_article,how_to_article,infographic,list,what_post,why_post,display_title
0,Fernanda Echavarri,,7477,3,517,12086,6.0,en,https://www.motherjones.com/coronavirus-update...,0,...,2020-03-23 19:19:42,34638,"[all_content, general_article]",1,0,0,0,0,0,The Most Irresponsible President on the Contin...


In [39]:
dataset['twitter_user_id']

0      37722437.0
1     292010775.0
2     144722898.0
3     144134352.0
4     146184070.0
5             NaN
6             NaN
7             NaN
8             NaN
9     233982739.0
10            NaN
11            NaN
12            NaN
13            NaN
14            NaN
15    144565498.0
16            NaN
17            NaN
18    233982739.0
19            NaN
20            NaN
21            NaN
22            NaN
23            NaN
24            NaN
25    233982739.0
26            NaN
27            NaN
28    146463089.0
29            NaN
30            NaN
31            NaN
32            NaN
33    256998148.0
34            NaN
35     25294901.0
36            NaN
37            NaN
38            NaN
39            NaN
40            NaN
41            NaN
42            NaN
43    146880479.0
44            NaN
45    148012395.0
Name: twitter_user_id, dtype: float64