<a href="https://colab.research.google.com/github/AzhakAnwar/CosmeticStoreProject/blob/main/SingleFileImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deploying Scrapy On Google Colab

---
This notebook has the following major features:



1.   Deployed on Google Colab.
2.   Implemented both ways: from the command line and as a single file.
3. The data is stored in Google Drive, and reads/writes go through the mounted drive.
4. The library is installed on Google Drive and imported from there, making the project flexible and portable.




### Necessary Imports

In [None]:
from google.colab import drive
import os, sys 
from IPython.core.interactiveshell import InteractiveShell
import platform, logging 

# try:
#     import scrapy
# except:
#     ! pip3 install scrapy # type: ignore
#     import scrapy

# from scrapy.crawler import CrawlerProcess, CrawlerRunner 
# from twisted.internet import reactor
from multiprocessing import Process, Queue

# import subprocess 

Configure the IPython shell to display the value of every statement in a cell, instead of only the last one (the default).

In [19]:
# Echo the value of every expression statement in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

### Mount the Google Drive (In case it's not mounted already)

In [None]:
# Mount Google Drive at /content/drive/ so the project and library folders used below are reachable.
drive.mount('/content/drive/') 

In [None]:
# Root folder of the scraping project on the mounted Drive.
scrapy_path = "/content/drive/MyDrive/ScrapTest"  ## Project Path
# Make the project root the working directory for the cells that follow.
os.chdir(scrapy_path)


In [None]:
# Directory on the mounted Drive that holds the pre-installed Scrapy library.
lib_path = '/content/drive/MyDrive/Library'       ## Path where Scrapy Library is installed 
# Insert at index 0 so this directory is searched before the other sys.path entries.
sys.path.insert(0,lib_path)

# ! chmod 777 -R '/content/drive/MyDrive/Library/bin/scrapy'


## Importing Scrapy Now As It's Installed In Google Drive

In [None]:
try:
    import scrapy
except:
    ! pip3 install scrapy # type: ignore
    import scrapy

from scrapy.crawler import CrawlerProcess, CrawlerRunner 
from twisted.internet import reactor

In [None]:
# One-time scaffolding command, kept for reference (disabled).
# !scrapy startproject cosmetics_project $scrapy_path
# Work from inside the project folder for the cells that follow.
os.chdir('/content/drive/MyDrive/ScrapTest/cosmetics_project')

# One-time spider-stub generation command, kept for reference (disabled).
# !scrapy genspider cosmetic goschonheit.ch

## Single File Implementation

In [None]:
class CosmeticSpider(scrapy.Spider):
    """Crawl goschonheit.ch and yield one record per shop page.

    Request flow: start page -> city listing (``parse_city``) -> paginated
    shop lists (``open_entity``) -> individual shop pages (``parse_entity``).
    The city link text travels with every request in ``meta['city']`` so the
    final record can report it.
    """

    name = 'cosmetic'
    allowed_domains = ['goschonheit.ch']
    # start_urls = ['http://goschonheit.ch/']

    custom_settings = {     # Class attribute
        'CONCURRENT_REQUESTS': 5,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOAD_DELAY': 3,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'LOG_LEVEL': logging.WARNING,
        # 'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        # NOTE(review): newer Scrapy releases prefer the FEEDS setting over
        # FEED_FORMAT/FEED_URI -- confirm against the installed version.
        'FEED_FORMAT':'json',
        'FEED_URI': 'cosmetic_stores.json',
        'LOG_FILE': 'test_project.log',
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 5,
        'AUTOTHROTTLE_MAX_DELAY': 60
    }

    def start_requests(self):
        """Seed the crawl with the site's home page."""
        yield scrapy.Request('http://goschonheit.ch/', callback=self.parse_city)

    def parse_city(self, response):
        """Follow every city link, requesting the widest distance filter."""
        for city in response.xpath("//span[@class='li_inner']/a"):
            href = city.xpath('./@href').get()
            if not href:
                # An anchor without an href cannot be followed; skipping it
                # avoids a TypeError from concatenating onto None.
                continue
            yield response.follow(
                href + '?distance=99999',
                callback=self.open_entity,
                meta={'city': city.xpath('./text()').get()},
            )

    def open_entity(self, response):
        """Follow each shop link on a listing page, then the next page if any."""
        city = response.meta.get('city')

        for shop in response.xpath("//h3/a/@href").getall():  # follow shop
            yield response.follow(shop, callback=self.parse_entity, meta={'city': city})

        # Pagination link whose text contains 'chste'
        # (presumably German "Nächste", i.e. "next" -- verify against the site).
        nxt = response.xpath("//div[@class='pagination']/a[contains(., 'chste')]/@href").get()
        if nxt:
            # Forward the city explicitly: a new request starts with its own
            # meta, so without this shops on later pages would lose their city.
            yield response.follow(nxt, callback=self.open_entity, meta={'city': city})

    def parse_entity(self, response):                        # Final output
        """Extract the shop record from a shop detail page."""
        yield {
            'name': response.xpath("//h1/text()").get(),
            'rating': response.xpath("(//span[@class='average'])[1]/text()").get(),
            'address': response.xpath("//div[@class='adr']/text()").getall(),
            'phone': response.xpath("//div[@itemprop='telephone']/text()").get(),
            'website': response.xpath("//div[@class='ca_content_info'][1]/div[last()]/text()").get(),
            'city': response.meta.get('city'),
        }


### Defining a Pipeline to Store Data

In [None]:
import json

class JsonWriterPipeline:
    """Write every processed item as one JSON document per line.

    The output goes to ``cosmeticstores.json`` in the current working
    directory; the handle is kept on the instance between the open and
    close hooks.
    """

    def open_spider(self, spider):
        """Open the output file in write mode and keep the handle."""
        self.file = open('cosmeticstores.json', 'w')

    def process_item(self, item, spider):
        """Serialize ``item`` as a dict, write it as one line, and return it unchanged."""
        serialized = json.dumps(dict(item))
        self.file.write(f"{serialized}\n")
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.file.close()

### Spider Execution

In [None]:
# Command-line equivalent, kept for reference (disabled).
# ! scrapy crawl cosmetic -o out.csv

# Create an in-process crawler and schedule the spider on it.
process = CrawlerProcess()
process.crawl(CosmeticSpider)

# NOTE(review): a Twisted reactor that has been stopped generally cannot be
# started again in the same process, so process.start() may fail after this
# branch runs (e.g. when the cell is re-executed) -- confirm. The exec_spider
# helper further down runs the crawl in a child process instead.
if reactor.running:
    reactor.stop()


# Start the crawl; presumably blocks until it has finished -- verify in the Scrapy docs.
process.start()

A New Way to Execute (In Testing Phase)

In [None]:
def exec_spider(spider):
    """Run ``spider`` to completion inside a child process.

    The crawl and the reactor run in a separate ``multiprocessing.Process``;
    the parent blocks until the child reports back through a queue.

    Args:
        spider: Spider class handed to ``CrawlerRunner.crawl``.

    Raises:
        Exception: whatever exception the child process caught while setting
            up or running the crawl, re-raised here in the parent.
    """
    def fork(q):
        # Child-process entry point: report None on success, the exception on failure.
        try:
            runner = CrawlerRunner()
            deferred = runner.crawl(spider)
            # Stop the reactor whether the crawl succeeded or failed.
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=fork, args=(q,))
    p.start()
    result = q.get()    # blocks until the child has put its outcome
    p.join()

    # Surface child failures instead of silently discarding them.
    if result is not None:
        raise result

In [None]:
# Run the cosmetics spider through the child-process helper.
exec_spider(CosmeticSpider)
