# Pull Data from Web and API using Python

### 1. Connecting to an URL on web (an API)

In [38]:
# requests-to sent HTTP request and access the HTML content from the target webpage
import requests

# BeautifulSoup is a Python package for parsing HTML and XML documents.
from bs4 import BeautifulSoup as bs

# pandas-to create a dataframe
import pandas as pd

In [17]:
# connetct to the API
response_API = requests.get('https://gmail.googleapis.com/$discovery/rest?version=v1')

In [18]:
# check the status of the connection to the API
response_API.status_code

200

- **200**: a healthy connection with the API on web
- **204**: successfully made a connection to the API but did not return any data
- **401**: Authentication failed
- **403**: Access is forbidden by the API service
- **404**: API service is not found
- **500**: Internal Server Error occurred

In [None]:
# advanced operation: add endpoint and api_key to the request function

endpoint = "https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos"

api_key = 'DEMO_KEY'

query_params = {'api_key':api_key, 'earth_data':'2020-07-01'}

response = requests.get(endpoint, params = query_params)

### 2. Getting the data from API

After making a healthy connection with the API, the next task is to pull the data from the API.

In [19]:
data = response_API.text

### 3. Parse the data into JSON format

JSON(JavaScript Object Notation) is the language of APIs. JSON is the primary format in which data is pased back and forth to APIs.

Python has standard JSON package, which can convert lists and dictionaries to JSON, and conver strings to lists and dictionaries. 
- **json.dumps()**: convert a python object and converts it to json(?)
- **json.loads()**: convert a json to a python object

Herein, we use .json() to convert the response to a Python dictionary. 

In [66]:
import json
#method 1: using two steps
data = response_API.text
json.loads(data)

#method 2: just one step
json_file = response_API.json()

.json() or json.load(data) sometimes will give errors. Since this won't protect you from a URL that doesn't comply with HTTP standards. When using arbirary URLs where this is a possibility, check if the server intended to give you JSON by checking the Content-Type header.

In [27]:
response_API.headers["content-type"].strip().startswith("application/json")

True

In [28]:
# Check if the returned data is JSON or text/html
response_API.headers["content-type"]

'application/json; charset=UTF-8'

In [69]:
# transfer JSON file to DataFrame
df = pd.read_json('https://gmail.googleapis.com/$discovery/rest?version=v1')
df.head()

Unnamed: 0,revision,discoveryVersion,resources,canonicalName,title,ownerName,batchPath,icons,protocol,servicePath,...,basePath,version,documentationLink,id,auth,baseUrl,schemas,parameters,description,ownerDomain
users,20211120,v1,{'resources': {'threads': {'methods': {'modify...,Gmail,Gmail API,Google,batch,,rest,,...,,v1,https://developers.google.com/gmail/api/,gmail:v1,,https://gmail.googleapis.com/,,,The Gmail API lets you view and manage Gmail m...,google.com
x32,20211120,v1,,Gmail,Gmail API,Google,batch,http://www.google.com/images/icons/product/sea...,rest,,...,,v1,https://developers.google.com/gmail/api/,gmail:v1,,https://gmail.googleapis.com/,,,The Gmail API lets you view and manage Gmail m...,google.com
x16,20211120,v1,,Gmail,Gmail API,Google,batch,http://www.google.com/images/icons/product/sea...,rest,,...,,v1,https://developers.google.com/gmail/api/,gmail:v1,,https://gmail.googleapis.com/,,,The Gmail API lets you view and manage Gmail m...,google.com
oauth2,20211120,v1,,Gmail,Gmail API,Google,batch,,rest,,...,,v1,https://developers.google.com/gmail/api/,gmail:v1,{'scopes': {'https://www.googleapis.com/auth/g...,https://gmail.googleapis.com/,,,The Gmail API lets you view and manage Gmail m...,google.com
HistoryMessageDeleted,20211120,v1,,Gmail,Gmail API,Google,batch,,rest,,...,,v1,https://developers.google.com/gmail/api/,gmail:v1,,https://gmail.googleapis.com/,"{'type': 'object', 'id': 'HistoryMessageDelete...",,The Gmail API lets you view and manage Gmail m...,google.com


### 4. Parse JSON to DataFrame

In [76]:
# import json file and transfer JSON to DataFrame: if direct transfer, will be list**
import pandas as pd
import json
with open("~city_searches.json",'r') as inf:
    sessions = json.load(inf)

sessions = pd.DataFrame(sessions)
sessions.head()

Unnamed: 0,session_id,unix_timestamp,cities,user
0,[D258NVMV202LS],[1442640552],"[San Jose CA, Montreal QC]","[[{'user_id': 5749, 'joining_date': '2015-04-0..."
1,[TDG10UKG7I4LR],[1432110137],[New York NY],"[[{'user_id': 10716, 'joining_date': '2015-03-..."
2,[OH4ZDIGN9BLQS],[1437049311],"[Montreal QC, Quebec QC]","[[{'user_id': 2941, 'joining_date': '2015-03-1..."
3,[CWHIAYKQ7RA28],[1432215908],[Chicago IL],"[[{'user_id': 2164, 'joining_date': '2015-03-2..."
4,[GI8GZJAWAC80P],[1443556226],"[Toronto ON, Houston TX]","[[{'user_id': 10493, 'joining_date': '2015-03-..."


In [80]:
#method 1: parse JSON to DataFrame
import pandas as pd
import json
import datetime

def clean_json(d):
    """
    clean the json data, make the data easier to be processed
    """
    assert len(d['cities']) == 1
    d['cities'] = d['cities'][0]

    assert len(d['session_id']) == 1
    d['session_id'] = d['session_id'][0]

    assert len(d['unix_timestamp']) == 1
    d['timestamp'] = datetime.datetime.utcfromtimestamp(d['unix_timestamp'][0])
    del d['unix_timestamp']

    user_dict = d['user']

    assert len(user_dict) == 1
    user_dict = user_dict[0]
    
    assert len(user_dict) == 1
    user_dict = user_dict[0]

    d['user_id'] = user_dict['user_id']
    d['user_country'] = user_dict['country']
    del d['user']
    
    return d


with open("~ city_searches.json", 'r') as inf:
    sessions = json.load(inf)

for item in sessions:
    clean_json(item)
    
sessions = pd.DataFrame(sessions)
sessions.head()

Unnamed: 0,session_id,cities,timestamp,user_id,user_country
0,D258NVMV202LS,"San Jose CA, Montreal QC",2015-09-19 05:29:12,5749,FR
1,TDG10UKG7I4LR,New York NY,2015-05-20 08:22:17,10716,DE
2,OH4ZDIGN9BLQS,"Montreal QC, Quebec QC",2015-07-16 12:21:51,2941,
3,CWHIAYKQ7RA28,Chicago IL,2015-05-21 13:45:08,2164,FR
4,GI8GZJAWAC80P,"Toronto ON, Houston TX",2015-09-29 19:50:26,10493,US


In [81]:
# method 2: parse json to DataFrame
def parse_json(data):
    """ function to parse json data """
    session_id = []
    timestamp = []
    cities = []
    user_id = []
    joining_date = []
    country = []
    
    # parse each item
    for item in data:
        session_id.append(item['session_id'][0])
        timestamp.append(item['unix_timestamp'][0])
        cities.append(item['cities'][0])
        user_id.append(item['user'][0][0]['user_id'])
        joining_date.append(item['user'][0][0]['joining_date'])
        country.append(item['user'][0][0]['country'])
    
    # create DataFrame
    maps = {'session_id': session_id, 'timestamp': timestamp, 'cities': cities, 
            'user_id': user_id, 'joining_date': joining_date, 'country': country}
    columns=['session_id', 'timestamp', 'cities', 'user_id', 'joining_date', 'country']
    
    return pd.DataFrame(maps, columns=columns)

with open('C:/Users/xiaoj/Desktop/CS/collection of data science-take home challenges/Data/13. city_searches.json', 'r') as f:
    json_data = json.load(f)

data = parse_json(json_data)
data['joining_date'] = pd.to_datetime(data['joining_date'])
data.head()

Unnamed: 0,session_id,timestamp,cities,user_id,joining_date,country
0,D258NVMV202LS,1442640552,"San Jose CA, Montreal QC",5749,2015-04-02,FR
1,TDG10UKG7I4LR,1432110137,New York NY,10716,2015-03-30,DE
2,OH4ZDIGN9BLQS,1437049311,"Montreal QC, Quebec QC",2941,2015-03-16,
3,CWHIAYKQ7RA28,1432215908,Chicago IL,2164,2015-03-27,FR
4,GI8GZJAWAC80P,1443556226,"Toronto ON, Houston TX",10493,2015-03-31,US


### 5. Web Scrapping with Python **

In [56]:
# requests-to sent HTTP request and access the HTML content from the target webpage
import requests

# BeautifulSoup is a Python package for parsing HTML and XML documents.
from bs4 import BeautifulSoup as bs

# pandas-to create a dataframe
import pandas as pd

In [57]:
# target website
base_url = 'https://www.consumeraffairs.com/food/dominos.html'

In [61]:
# define a scraper function
def scraper():
    # create an empty list to store all the scraped reviews
    all_pages_reviews = []
    
    # create an empty list to store the reviews of each page
    pagewise_reviews = []
    
    # write a for loop to loop through the number of pages you would like to scrape
    # here, I scrape the reviews of top 5 pages
    for i in range(1,6):
        # i represents the page number,which means which page you are scrapping 
        query_parameter = '?page='+ str(i)
        url = base_url + query_parameter
        
        # sent HTTP request and access the HTML content
        response = requests.get(url)
        
        # create a soup object and parse the HTML page
        soup = bs(response.content, 'html.parser')
        
        # select the particular elements(contents) you are interested in
        # if interest is to extract only the review text, 
        # need in inspect the page and obtain the HTML tags, attribute the names of the target element
        rev_div = soup.findAll('div', attrs = {'class':'rvw-bd'})
        
    for j in range(len(rev_div)):
        # find all the p tags to fetch only the review text
        pagewise_reviews.append(rev_div[j].find('p').text)
    
    for k in range(len(pagewise_reviews)):
        all_pages_reviews.append(pagewise_reviews[k])
    
    return all_pages_reviews   

In [65]:
reviews = scraper()
i = range(1, len(reviews)+1)
reviews_df = pd.DataFrame({"review":reviews}, index = i)
reviews_df.to_csv('reviews.txt', sep = 't')
reviews_df

Unnamed: 0,review
1,"I ordered pizza through the app, I got the not..."
2,My husband and I have ordered from the Westfie...
3,The driver was on side road with brights on in...
4,"So, I didn’t even want to rate one star but un..."
5,S. P road Gaya Domino's denied to provide ever...
6,I ordered a 12inch pizza and a pasta bowl. I f...
7,I place the order at Domino's at 1801 Valley V...
8,After an hour passed and refused cold uncooked...
9,This Domino's has the best pizza delivery and ...
10,"Overpriced ""Nothing Pizza"". Very dry and taste..."


In [None]:
# another example 
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
text = [p.text for p in soup.find(class_ = 'post_content').find_all('p')]