<a href="https://colab.research.google.com/github/Alenush/dish_id_sirius/blob/Team-1/chefnet_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!python -m pip install pymongo==3.7.2
!apt install mongodb
!mongod --dbpath /content/data/db --fork --logpath /var/log/mongod.log

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libpcap0.8 libstemmer0d libyaml-cpp0.5v5 mongo-tools mongodb-clients
  mongodb-server mongodb-server-core
The following NEW packages will be installed:
  libpcap0.8 libstemmer0d libyaml-cpp0.5v5 mongo-tools mongodb mongodb-clients
  mongodb-server mongodb-server-core
0 upgraded, 8 newly installed, 0 to remove and 33 not upgraded.
Need to get 53.1 MB of archives.
After this operation, 215 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpcap0.8 amd64 1.8.1-6ubuntu1.18.04.1 [118 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libstemmer0d amd64 0+svn585-1build1 [62.5 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 lib

In [3]:
import pandas as pd
import datetime
import requests
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen
import os

from bson import ObjectId
import pymongo
from pymongo import MongoClient
import multiprocessing
from threading import Thread
import tqdm.notebook as tqdm

In [7]:
class request_info_thread(Thread):
    '''
    Inherits from Thread so I can store results of my threading.
    I want to be able to return something from my threading. This
    class will allow me to do that - perform all of the requesting
    and data gathering that I want to do, but store the results on
    the class so that I can access them later.
    '''

    def __init__(self, recipe_id, link):
        super().__init__()
        self.recipe_id = recipe_id
        self.url = link
        data = requests.get(self.url).text
        self.soup = BeautifulSoup(data, 'lxml')
        self.json_dct = None

    def run(self):
        self.json_dct = self._request_info()
        if self.json_dct:
            self.image_url = self._get_img_url()
            img_data = urlopen(self.image_url).read()
            self.img_soup =  BeautifulSoup(img_data, 'lxml')
            self._scrape_photos()

    def _request_info(self):
        '''
        Grab relevant information from the row and store it in mongo.
        Make sure that if there is missing information that is not crucial to my analysis, we still store the data.
        By checking the Mongo table during the extraction process we can save time by not getting the html of the url if that url already exists in the table.
        '''
        item_name = self.soup.find('h1', {'class':'headline heading-content'})
        if item_name is None:
          return False

        item_name = item_name.text
        ingred_list = []

        for s in self.soup.findAll('li', {'class': 'ingredients-item'}):
          ingred_list.append(s.text.strip())

        #Throw data into MongoDB
        json_dct = ({'id':self.recipe_id, 'item_name': item_name, 'ingred_list':ingred_list})

        return json_dct

    def _get_img_url(self):
        return self.url+'/photos'

    def _scrape_photos(self):

        photos = self.img_soup.find('section', {'class':'recipe-photo__page'})
        for i, img in enumerate(photos.findAll('img')):
          src = img.get('src')
          urllib.request.urlretrieve(src, 'AllRecipes_images_val/'+str(self.recipe_id)+'_'+str(i)+'.jpg')
        pass

In [8]:
def Pull_Recipe_Links(i):
    '''
    Pull links to all recipes on a search page and send them to search scraper
    Input:  (1) page Number to search for recipe links on
    '''
    class MyOpener(urllib.request.FancyURLopener):
      version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
    myopener = MyOpener()

    #Store results in mongo
    db_client = MongoClient('localhost', 27017)
    recipe_db = db_client['allrecipes']['recipe_data']

    url = "http://allrecipes.com/recipes/?page=" + str(i)

    page = myopener.open(url)
    soup = BeautifulSoup(page, 'lxml')

    recipes = soup.find('section', {'class':'recipe-section'}).\
                        find_all('div', {'class':'grid-card-image-container'})
    threads = []
    for recipe in recipes:
      try:
        link = recipe.find('a').get('href').strip()
        if link[26:34]=='/recipe/':
          t = scrape_search(link, recipe_db)
          if t:
            t.start()
            threads.append(t)
      except:
        continue
    #print(str(i) + '-' + str(len(threads)))
    for t in threads:
      t.join()
    
    for t in threads:
      store_data([t.json_dct], recipe_db)
    
    if i % 100 == 0:
      ingred_total = db_client['allrecipes']['recipe_data'].count_documents({})
      pics_total = len(os.listdir('AllRecipes_images_val'))
      print(f'Fetched {ingred_total} dishes with {pics_total} total of images') 

    db_client.close()


def scrape_search(link, recipe_db):
    '''
    Create request_info_threads for a page of the website
    Input:  (1) link to search page
            (2) recipe MongoDB
    Output: (1) list of data to be stored in MongoDB
    '''
    link_items = link.split('/')
    #Parse url string to locate recipe name and number
    recipe_id = link_items[4]
    recipe_label = link_items[5]

    if already_exists(recipe_db, recipe_id):
        return False
    else:
      t = request_info_thread(recipe_id, link)
      return t

def store_data(mongo_update_lst, recipe_db):
    '''
    Store Recipe Information in MongoDB
    '''
    for json_dct in mongo_update_lst:
        if json_dct:
            recipe_db.insert_one(json_dct)
    pass

def already_exists(recipe_db, id):
    '''
    Check if a recipe already exists in the database
    '''
    return recipe_db.count_documents({'id':id}) > 0

def run_parallel(start_page, end_page):
    '''
    Scrape multiple search pages in parallel
    '''

    page_range = range(start_page, end_page + 1)
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    for _ in tqdm.tqdm(pool.imap_unordered(Pull_Recipe_Links, page_range), total=end_page + 1 - start_page):
      pass
    # pool.map(Pull_Recipe_Links, page_range)
    # pass

In [None]:
!rm -r AllRecipes_images

In [6]:
!mkdir AllRecipes_images_val

In [None]:
db_client = MongoClient('localhost', 27017)
recipe_db = db_client['allrecipes']['recipe_data']
recipe_db.remove({})
db_client.close()

In [None]:
run_parallel(start_page=1, end_page=250)

  
  
  
  


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

Fetched 121 dishes with 1915 total of images
Fetched 321 dishes with 3829 total of images
Fetched 521 dishes with 6625 total of images
Fetched 720 dishes with 8617 total of images
Fetched 920 dishes with 11259 total of images
Fetched 1120 dishes with 13365 total of images
Fetched 1320 dishes with 15969 total of images
Fetched 1519 dishes with 18014 total of images
Fetched 1719 dishes with 20521 total of images
Fetched 1919 dishes with 22747 total of images
Fetched 2119 dishes with 24928 total of images
Fetched 2319 dishes with 27231 total of images
Fetched 2519 dishes with 29434 total of images
Fetched 2718 dishes with 31676 total of images
Fetched 2918 dishes with 33722 total of images
Fetched 3118 dishes with 35987 total of images


Exception in thread Thread-835:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-3-5fed5f46d244>", line 24, in run
    self._scrape_photos()
  File "<ipython-input-3-5fed5f46d244>", line 55, in _scrape_photos
    urllib.request.urlretrieve(src, 'AllRecipes_images/'+str(self.recipe_id)+'_'+str(i)+'.jpg')
  File "/usr/lib/python3.6/urllib/request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.6/urllib/requ

Fetched 3318 dishes with 38019 total of images
Fetched 3516 dishes with 40172 total of images
Fetched 3716 dishes with 42063 total of images
Fetched 3916 dishes with 44287 total of images
Fetched 4116 dishes with 46029 total of images
Fetched 4315 dishes with 48141 total of images
Fetched 4515 dishes with 49914 total of images
Fetched 4714 dishes with 51959 total of images
Fetched 4914 dishes with 53688 total of images



In [None]:
run_parallel(start_page=251, end_page=1000)

  
  
  


HBox(children=(FloatProgress(value=0.0, max=750.0), HTML(value='')))

  
Exception in thread Thread-183:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-3-5fed5f46d244>", line 24, in run
    self._scrape_photos()
  File "<ipython-input-3-5fed5f46d244>", line 55, in _scrape_photos
    urllib.request.urlretrieve(src, 'AllRecipes_images/'+str(self.recipe_id)+'_'+str(i)+'.jpg')
  File "/usr/lib/python3.6/urllib/request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.6/urllib/r

Fetched 5976 dishes with 62859 total of images


Exception in thread Thread-434:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-3-5fed5f46d244>", line 24, in run
    self._scrape_photos()
  File "<ipython-input-3-5fed5f46d244>", line 55, in _scrape_photos
    urllib.request.urlretrieve(src, 'AllRecipes_images/'+str(self.recipe_id)+'_'+str(i)+'.jpg')
  File "/usr/lib/python3.6/urllib/request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.6/urllib/requ

Fetched 7906 dishes with 78035 total of images
Fetched 9745 dishes with 91080 total of images
Fetched 11476 dishes with 102572 total of images


Exception in thread Thread-1662:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-3-5fed5f46d244>", line 24, in run
    self._scrape_photos()
  File "<ipython-input-3-5fed5f46d244>", line 55, in _scrape_photos
    urllib.request.urlretrieve(src, 'AllRecipes_images/'+str(self.recipe_id)+'_'+str(i)+'.jpg')
  File "/usr/lib/python3.6/urllib/request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.6/urllib/req

Fetched 13116 dishes with 112720 total of images
Fetched 14707 dishes with 121768 total of images
Fetched 16242 dishes with 130353 total of images
Fetched 17676 dishes with 137327 total of images



In [None]:
run_parallel(start_page=1001, end_page=2000)

  
  
  
  


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Fetched 19174 dishes with 144708 total of images
Fetched 20547 dishes with 150636 total of images
Fetched 21913 dishes with 156681 total of images
Fetched 23260 dishes with 162044 total of images
Fetched 24604 dishes with 166936 total of images
Fetched 25931 dishes with 171720 total of images
Fetched 27249 dishes with 175991 total of images
Fetched 28631 dishes with 180143 total of images
Fetched 29890 dishes with 183803 total of images
Fetched 31233 dishes with 187047 total of images



In [9]:
run_parallel(start_page=2001, end_page=2002)

  
  


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [None]:
pldb_client = MongoClient('localhost', 27017)

print(db_client['allrecipes']['recipe_data'].count_documents({}))
db_client.close()
# for document in cursor:
#   print(document)

31233


In [None]:
len(os.listdir('AllRecipes_images'))

187047

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!zip -rqq AllRecipes_images.zip AllRecipes_images

In [None]:
!rsync --info=progress2 AllRecipes_images.zip /content/drive/My\ Drive/Dish-id

  3,700,169,820 100%   47.50MB/s    0:01:14 (xfr#1, to-chk=0/1)


In [None]:
!ls '/content/drive/My Drive/Dish-id'

AllRecipes_images.zip  db.zip


In [None]:
!zip -r db.zip data/db

In [None]:
!rsync --info=progress2 db.zip /content/drive/My\ Drive/Dish-id

         32,768   0%    0.00kB/s    0:00:00       13,611,035 100%   65.40MB/s    0:00:00 (xfr#1, to-chk=0/1)     13,611,035 100%   65.40MB/s    0:00:00 (xfr#1, to-chk=0/1)     13,611,035 100%   63.79MB/s    0:00:00 (xfr#1, to-chk=0/1)     13,611,035 100%   63.79MB/s    0:00:00 (xfr#1, to-chk=0/1)
