# Scraping Google Play charts using requests, Scrapy & BeautifulSoup

__Purpose__

In this notebook the Google Play charts dataset is scraped directly from the Google Play website. Only the title, description, rating, genre and the title of the chart are kept. For the individual app pages we use BeautifulSoup instead of XPath, to avoid problems caused by the same data appearing at different positions on different apps' pages. To scrape 100 apps from each chart page, Scrapy is needed; otherwise the dynamically loaded data would be missed.

In [1]:
import json
import os.path
import time
import random

import requests
from requests.compat import urljoin
from bs4 import BeautifulSoup
import scrapy
from scrapy.crawler import CrawlerProcess
from loguru import logger

In [2]:
# Relative path of the JSON file that the final, enriched dataset is written to.
google_save_path = '../../datasets/1200_google_chart_dataset.json'

# Scraping charts using requests & BeautifulSoup

In [3]:
logger.info('Scraping charts using requests & BeautifulSoup')

# Accumulator for the scraped app records: the Scrapy spider's parse_data
# callback appends one dict per app card to this list.
google_dataset = []

To scrape all 100 apps listed on each chart page, Scrapy is used to request the full list; otherwise the dynamically loaded entries would be missed.

In [4]:
class Spider(scrapy.Spider):
    """Scrapy spider that collects the app cards shown on Google Play chart pages.

    Every chart URL in ``chart_urls`` is requested with ``chart_formdata``
    and each app card found in the response is appended, as a dict, to the
    module-level ``google_dataset`` list.
    """

    name = "playspider"
    allowed_domains = ['play.google.com']

    # Prefix prepended to the relative app links found on the cards.
    _base_url = 'https://play.google.com'

    # Chart pages to scrape.
    chart_urls = [
        "https://play.google.com/store/apps/collection/topselling_free",
        "https://play.google.com/store/apps/collection/topselling_paid",
        "https://play.google.com/store/apps/collection/topgrossing",
        "https://play.google.com/store/apps/category/GAME/collection/topselling_free",
        "https://play.google.com/store/apps/category/GAME/collection/topselling_paid",
        "https://play.google.com/store/apps/category/GAME/collection/topgrossing",
        "https://play.google.com/store/apps/collection/topselling_new_free",
        "https://play.google.com/store/apps/collection/topselling_new_paid",
        "https://play.google.com/store/apps/category/GAME/collection/topselling_new_free",
        "https://play.google.com/store/apps/category/GAME/collection/topselling_new_paid",
    ]

    # Form fields sent with every chart request; 'num' asks for 100 entries.
    # NOTE(review): 'token' is a hard-coded value, presumably captured from a
    # browser session -- confirm it is still accepted by the site.
    chart_formdata = {
        'start': '0',
        'num': '100',
        'numChildren': '0',
        'cctcss': 'square-cover',
        'cllayout': 'NORMAL',
        'ipf': '1',
        'xhr': '1',
        'token': 'zNTXc17yBEzmbkMlpt4eKj14YOo:1458833715345',
    }

    def start_requests(self):
        """Yield one form request per chart URL, handled by ``parse_data``."""
        for url in self.chart_urls:
            yield scrapy.FormRequest(
                url,
                # Pass a copy so the shared class-level dict is never handed out.
                formdata=dict(self.chart_formdata),
                callback=self.parse_data,
            )

    def parse_data(self, response):
        """Extract every app card from a chart response into ``google_dataset``.

        Missing fields fall back to ``''`` (or ``'no star_rate'`` for the star
        label) instead of raising, so one incomplete card or a missing chart
        heading does not discard the rest of the page.
        """
        table_title = response.xpath(
            '//div[@class="cluster-heading"]/h2/text()'
        ).extract_first(default='').strip()
        if not table_title:
            self.logger.warning('No chart heading found on %s', response.url)

        cards = response.xpath(
            '//div[@class="card no-rationale square-cover apps small"]'
            '/div[@class="card-content id-track-click id-track-impression"]'
        )
        for card in cards:
            title = card.xpath(
                'div[@class="details"]/a[@class="title"]/text()'
            ).extract_first(default='')

            href = card.xpath('div[@class="details"]/a/@href').extract_first()
            # Keep the URL empty when the card has no link at all.
            title_URL = self._base_url + href if href is not None else ''

            description = card.xpath(
                'div[@class="details"]/div[@class="description"]/text()'
            ).extract_first(default='')

            star_rates = card.xpath(
                'div[@class="reason-set"]/span/a/div/div/@aria-label'
            ).extract_first(default='no star_rate')

            google_dataset.append({
                'title': title.strip(),
                'title_URL': title_URL.strip(),
                'description': description.strip(),
                'star_rates': star_rates.strip(),
                'table_title': table_title,
            })

# Settings for the crawler: requests are sent with a fixed desktop-browser
# User-Agent string.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}
process = CrawlerProcess(crawler_settings)

# Schedule the chart spider and run the crawl.
process.crawl(Spider)
process.start()

2019-04-13 23:11:58 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-13 23:11:58 [scrapy.utils.log] INFO: Versions: lxml 4.3.0.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 18.9.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) - [GCC 7.3.0], pyOpenSSL 19.0.0 (OpenSSL 1.1.1  11 Sep 2018), cryptography 2.6.1, Platform Linux-4.15.0-47-generic-x86_64-with-debian-buster-sid


To avoid being blocked, we change the 'User-Agent' every time we send a request.

In [5]:
# Base HTTP headers sent with every app-page request; a 'User-Agent' is
# chosen per request from user_agent_list below.
# NOTE(review): the "Accept" value ends with a dangling ';' -- it looks
# truncated from a browser default; confirm whether that is intended.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;",
    "Accept-Encoding": "gzip",
    "Referer": "http://www.example.com/",
}

# Pool of browser User-Agent strings to rotate through. Entries are unique so
# that a uniform random choice does not favour any one of them.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    # Closing parenthesis restored; the previous literal was cut off.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
]

def get_soup(search_url, timeout=30):
    """Download *search_url* and return its HTML parsed with BeautifulSoup.

    A User-Agent is picked at random from ``user_agent_list`` for each call,
    and the call sleeps for a random delay of up to two seconds before the
    request is sent.

    Args:
        search_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html5lib`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Build per-request headers instead of mutating the shared module-level
    # ``headers`` dict on every call.
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(user_agent_list)

    time.sleep(random.random() * 2)
    response = requests.get(search_url, headers=request_headers, timeout=timeout)

    return BeautifulSoup(response.text, features='html5lib')

Create a Spider class. It is initialized with the list of app records collected from the charts. The output of "work" is a list of items containing the title, description, rating, genre and the title of the chart.

In [6]:
class Spider(object):
    """Add the rating and genre from each app's own page to the chart records.

    NOTE: this class reuses the name ``Spider`` and so shadows the Scrapy
    spider class defined earlier in this file.
    """

    def __init__(self, dataset_json):
        """Store the list of app records (dicts with a 'title_URL' entry)."""
        self.google_dataset = dataset_json

    @staticmethod
    def _extract_rating(soup):
        """Return the stripped rating text, or None when the page has none."""
        try:
            return soup.find('div', {'class': 'BHMmbe'}).string.strip()
        except AttributeError:
            # find() returned None, or the tag has no single string child.
            return None

    @staticmethod
    def _extract_genre(soup):
        """Return the page's genre labels joined with ','."""
        names = []
        for label in soup.find_all('span', {'class': 'T32cc UAO9ie'}):
            link = label.find('a')
            text = link.string if link is not None else None
            # Skip labels without a link or without plain text instead of
            # aborting the whole run on one malformed label.
            if text is not None:
                names.append(text.strip())
        return ','.join(names)

    def work(self):
        """Fetch every app page and attach 'genre' and 'rating' to its record.

        Each input dict is updated in place and the same dict objects are
        returned. Records with no 'title_URL', or whose page cannot be
        fetched, get ``rating=None`` and ``genre=''`` so that one failure
        does not lose the records already processed.

        Returns:
            list: The records of ``self.google_dataset``, in order, each with
            'genre' and 'rating' set.
        """
        result = []
        for item in self.google_dataset:
            rating, genre = None, ''
            url = item.get('title_URL')
            # The chart scraper stores '' when a card had no link; there is
            # nothing to fetch for such records.
            if url:
                try:
                    soup = get_soup(url)
                except requests.RequestException as err:
                    logger.warning('Could not fetch {}: {}', url, err)
                else:
                    rating = self._extract_rating(soup)
                    genre = self._extract_genre(soup)

            item['genre'] = genre
            item['rating'] = rating
            result.append(item)

        return result


# Save Final Dataset

A connection failure can happen at any time. We therefore save the dataset in JSON format so that the scraped data is kept on disk and work can be resumed after a failure. Note that in this notebook the file is written once, after all app pages have been processed.

In [7]:
logger.info('Save Final Dataset')

# Enrich the collected chart records with data from each app's page.
spider = Spider(google_dataset)
result = spider.work()

# Serialize the enriched records straight into the output file.
with open(google_save_path, 'w') as out_file:
    json.dump(result, out_file)