# Load packages and define functions

In [96]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import pickle

# Checkpoint files: the pickled scraper object and the pickled results dataframe
chkpt_fname = 'cooksthesaurus_pickle.checkpoint'
df_fname = 'cooksthesaurus_pickle_df.pkl'

# These subcategories have another layer of subcategories that we need to go through
unpack_subcat = [
    'onions', 'fruit vegetables', 'cheese', 'herbs', 'spices', 'herb & spice mixes', 'condiments',
    'liquers', 'wines', 'liquors', 'pasta', 'asian noodles', 'other noodles', 'beef', 'pork', 'lamb',
    'veal', 'cured meats', 'variety meats', 'lean, flaky-textured fish',
]

# These subcategories have useful information on the landing page, so we keep the landing-page rows too.
# This list is only consulted for names that are also in unpack_subcat, so every entry must appear there
# (the previous 'wine' entry could never match; the subcategory is listed as 'wines').
keep_first_scrape = ['wines', 'pasta', 'asian noodles', 'other noodles']

# Link texts that are dropped when category/subcategory links are collected
# (the original author notes that these regional pages are unpacked separately)
skip_subcat = ['africa', 'america', 'asia', 'europe', 'hispanic countries', 'india', 'middle east', 'world']

# These categories have no useful information and are therefore skipped
skip_cat = ['equipment']

def _load( fname=chkpt_fname ):
    '''
    Read a pickled object from fname and return its attribute dictionary.

    fname (str): Optional. Path of the checkpoint file; defaults to chkpt_fname.
    '''
    with open( fname, 'rb' ) as handle:
        restored = pickle.load( handle )
    return restored.__dict__
def _dump( obj: object, fname=chkpt_fname ):
    '''
    Pickle obj into the file fname, overwriting any previous content.

    obj (object): Required. The object to serialize.
    fname (str): Optional. Path of the checkpoint file; defaults to chkpt_fname.
    '''
    with open( fname, 'wb' ) as handle:
        pickle.dump( obj, handle )

class CooksThesaurusScraper:

    def __init__( self, initialize=True ):
        self.initialize = initialize
        if self.initialize: # If starting a new scrape
            self.base_url = 'http://www.foodsubs.com'
            self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
            self.cols = [ 'cat', 'cat_url', 'subcat', 'subcat_url', 'subcat_img', 'item', 'item_equivs', 'item_subs', 'item_img' ] # Output dataframe columns
            self.df = self._create_df( [] ) # Stores results
            self.tags = { 'cat':[], 'subcat':[] } # Category and subcategory tags
            self.i = { 'cat':0, 'subcat':0 } # Category and subcategory index, for restarting scrape (current)
            self.restart = { 'cat':0, 'subcat':0 } # Category and subcategory index, for restarting scrape (restart here)
            self.cat = { 'cat':'', 'subcat':'' } # Category and subcategory name
            self.url = { 'cat':'', 'subcat':'' } # Category and subcategory urls
            self.select_by = { 'cat':'td ul li a', 'subcat':'font a', 'subcat_old':'font p font a' } # Used for scraping categories and subcategories
            self.scraped_subcats = [] # Records urls of scraped subcats so we don't scrape them multiple times (sometimes a subcat belongs to more than one category)
        else: # If restarting a scrape
            self.__dict__ = _load()
            if not self.df.empty: # Make sure that we recorded some results
                self.initialize = False

    def _create_df( self, data: list ) -> object:
        return pd.DataFrame( data=data, columns=self.cols )

    def _create_soup( self, url: str, timeout: float=30 ) -> object:
        '''
        Download url with the scraper's request headers and return the parsed page.

        url (str): Required. Address of the page to fetch.
        timeout (float): Optional. Seconds to wait for the server before requests raises a timeout
                                   error. Without a timeout a stalled connection would block the
                                   scrape indefinitely.

        Returns a BeautifulSoup object built with the 'html.parser' parser.
        '''
        response = requests.get( url=url, headers=self.headers, timeout=timeout )
        time.sleep( 2 ) # Pause after every request so the site is not hit in rapid succession
        return BeautifulSoup( response.content, features='html.parser' )

    def _get_categories( self, cat_choice:str ):
        '''
        Collect the link tags of all categories or of the current category's subcategories.

        cat_choice (str): Required. 'cat' reads the links from the base url; any other key of
                                    self.select_by (normally 'subcat') reads them from the current
                                    category page (self.url['cat']).

        Returns nothing. The kept tags are stored in self.tags[cat_choice]. A tag is dropped when it
        has no href attribute, when its href contains 'www', or when its normalized link text is
        listed in skip_subcat.
        '''
        if cat_choice == 'cat': #Use base url to get categories
            url = self.base_url
        else: #Use category url to get subcategories
            url = self.url['cat']
        soup = self._create_soup( url )
        kept = []
        for t in soup.select( self.select_by[cat_choice] ):
            # Anchors without an href cannot be followed; indexing t['href'] on them would raise KeyError
            href = t.get( 'href' )
            if href is None or 'www' in href:
                continue
            if t.text.lower().replace('\n',' ').strip() in skip_subcat:
                continue
            kept.append( t )
        self.tags[cat_choice] = kept

    def _get_list( self, txt: str, split_on: str ) -> list:
        txt = re.sub( r'\s+', ' ', txt ).lower()
        txt = re.sub( r'\([^)]*\)', '', txt ).strip()
        txt_list = re.split( split_on,  txt )
        return [ t for t in txt_list if t ]

    def _get_item( self, soup_list: list ) -> list: #This is where most of the parsing logic is
        main = []
        equiv = []
        sub = []
        link = ''
        for i, s in enumerate( soup_list ):
            if main == [] and 'b' in [s.parent.name, s.parent.parent.name] and s[:5].lower().strip() not in ['subst','notes']:
                main = self._get_list( s.text, r'\s*=\s*' )
                if 'href' in s.parent.attrs:
                    link = ''.join([self.base_url,'/',s.parent['href']])
                elif 'href' in s.parent.parent.attrs:
                    link = ''.join([self.base_url,'/',s.parent.parent['href']])
            elif main != [] and '=' in s.text[:10] and not re.search(r'\d', s.text[:10]): #Sometimes the synonyms are broken up, so this is an effort to recombine #re.search('^\s*?=',s):
                if re.search('^\s*?=',s.text): # if the string starts with "=", we know a name was not broken up
                    main.extend( self._get_list( s.text, r'\s*=\s*' ) )
                else: # if the string does not start with "=", a name was likely broken up
                    main = self._get_list( s.text+soup_list[i-1].text, r'\s*=\s*' )
            elif 'equiv' in s[:12].lower():
                if len(soup_list) - (i+1):
                    equiv = self._get_list( soup_list[i+1].text, r'\s*=\s*' )
            elif 'subst' in s[:12].lower():
                if len(soup_list) - (i+1):
                    sub = self._get_list( soup_list[i+1].text, r'\s+or\s+' )
        return [main,equiv,sub, self.item_img], link
    
    def _tag_iter(self):
        '''
        Download the current subcategory page (self.url['subcat']) and parse its table cells.

        Returns nothing. Results are left on the object:
            self.items - one row per parsed item (category/subcategory columns followed by the item columns)
            self.links - for each row, the url the item name links to ('' if none)
        A cell without text but with an image supplies the image for the next parsed item. The
        first such image is also used as the subcategory image when none was passed in via self.img.
        '''
        soup = self._create_soup( self.url['subcat'] )
        tags = soup.select( 'blockquote table tr td' )
        self.base_item = [ self.cat['cat'], self.url['cat'], self.cat['subcat'], self.url['subcat'], self.img ]
        self.item_img = ''
        self.items, self.links = [], []
        for tag in tags:
            text_finder = tag.find_all(text=True, recursive=True)
            text_finder = [ t for t in text_finder if re.sub(r'\s+','',t)] # Drop whitespace-only text nodes
            if text_finder:
                item, link = self._get_item( text_finder )
                if item[0]: # Only cells that yielded an item name produce a row
                    self.item_img = '' # The pending image has been consumed by this item
                    self.items.append( self.base_item+item )
                    self.links.append(link)
            else:
                image_finder = tag.find('img')
                if image_finder:
                    # NOTE(review): no '/' is inserted here, unlike the page links; assumes src starts with '/' - confirm
                    self.item_img = self.base_url+image_finder['src']
                    if not self.base_item[-1]:
                        self.base_item[-1] = self.item_img

    def _unpack_sub(self) -> object:
        '''
        Follow the links found on a subcategory landing page and scrape each linked page.

        The landing-page rows themselves are added to self.df only for subcategories listed in
        keep_first_scrape. Each linked page is scraped under the name "<subcategory>, <item name>".
        '''
        parent_name = self.cat['subcat']
        if parent_name in keep_first_scrape:
            self.df = pd.concat( [self.df, self._create_df( self.items )] )
        for row, target in zip( self.items, self.links ):
            if target == '':
                continue # Item has no page of its own
            self.cat['subcat'] = ', '.join( [parent_name, row[5][0]] )
            self.url['subcat'] = target
            self._scrape_sub( img=row[-1] )

    def _scrape_sub( self, img='' ) -> object:
        '''
        Scrape the subcategory page in self.url['subcat'] and record its items.

        img (str): Optional. Image url to store as the subcategory image.

        The url is remembered in self.scraped_subcats. Subcategories listed in unpack_subcat are
        unpacked page by page; for all others the parsed rows are appended to self.df.
        '''
        self.img = img
        self.scraped_subcats.append( self.url['subcat'] )
        self._tag_iter()
        if self.cat['subcat'] not in unpack_subcat:
            self.df = pd.concat( [self.df, self._create_df( self.items )] )
            return
        self._unpack_sub()

    def _make_chkpt( self ):
        '''
        Save a checkpoint: pickle the scraper object via _dump() and the results dataframe to df_fname.
        '''
        _dump( self )
        results = self.df
        results.to_pickle( df_fname )

    def _set_restart( self, cat_choice ):
        if cat_choice == 'subcat':
            if len(self.tags['subcat'])-(self.i['subcat']+1):
                cat_i, subcat_i = 0, 1
            elif len(self.tags['cat'])-(self.i['cat']+1):
                cat_i, subcat_i = 1, -self.i['subcat']
            else:
                cat_i, subcat_i = 0, 1
        else:
            cat_i, subcat_i = 0, -self.i['subcat']
        self.restart['cat'] = self.i['cat']+cat_i
        self.restart['subcat'] = self.i['subcat']+subcat_i
    
    def _iter( self, cat_choice: str, single_cat='', debug=False ):
        if self.initialize: #If we have initialized the scrape, we always start from beginning 
            start = 0
        else: #If we restart a scrape, we start from where we left off
            start = self.restart[cat_choice]
        for i, tag in enumerate( self.tags[cat_choice][start:] ):
            self.i[cat_choice] = i
            self.cat[cat_choice] = tag.text.lower().replace('\n',' ').strip()
            self.url[cat_choice] = ''.join( [self.base_url,'/',tag['href']] )
            if cat_choice == 'subcat' and not debug and not (self.url['subcat'] in self.scraped_subcats):
                print( f'Scraping: {self.url[cat_choice]}' )
                self._scrape_sub()
                if self.save:
                    self._make_chkpt()
            elif self.url[cat_choice]==single_cat or (cat_choice=='cat' and not single_cat): # Only triggers once if scraping single category
                self._set_restart('cat')
                if self.cat[cat_choice] not in skip_cat: # We skip some categories that do not contain useful information
                    self._get_categories('subcat')
                    self._iter( 'subcat', debug=debug )
                if single_cat:
                    break
            if cat_choice == 'subcat' and self.save:
                self._set_restart('subcat')

    def scrape( self ): #This is used to scrape the entire website
        self.save = True #Results are saved, and the state of the scraper is saved (for restarting a crashed scraper)
        '''
        Running this returns nothing. The results of the scrape are stored in .df as a dataframe, and they are saved as 'cooks_thesaurus_results.csv' in
        the current working directory.
        '''
        if self.initialize:
            self._get_categories('cat')
        else:
            self._iter('subcat')
        self._iter('cat')

    def scrape_subcat( self, page_url: str, subcat='' ): #This is used to scrape a specific subcategory (used mostly for debugging)
        self.save = False #Results are not saved, and the state of the scraper is not saved
        '''
        page_url (str): Required. This is the url of the subcategory page.
        subcat (str): Optional. Name of the subcategory being scraped (all lowercase). If this is given, the scraper will check to see if subcategory
                                contains nested subcategories, and if so, it will unpack them.
        
        Running this returns nothing. The results of the scrape are stored in .df as a dataframe.
        '''
        self.url['subcat'] = page_url
        self.cat['subcat'] = subcat
        self._scrape_sub()
    
    def scrape_cat( self, page_url: str, debug=False ): #This is used to scrape a specific category (used mostly for debugging)
        self.save = False #Results are not saved, and the state of the scraper is not saved
        '''
        page_url (str): Required. This is the url of the category page.
        debug (bool): Optional. If True, the pages will not actually be scraped (they will simply be iterated over). This is for debugging purposes.
        
        Running this returns nothing. The results of the scrape are stored in .df as a dataframe.
        '''
        self._get_categories('cat')
        self._iter( 'cat', page_url, debug )
    
    def get_images( self, img_folder_path: str, df_path='' ):
        '''
        Placeholder for downloading the scraped images. NOT IMPLEMENTED YET: the body is a no-op,
        so calling this currently does nothing and saves no files.

        Intended interface:
        img_folder_path (str): Required. This is the path to folder where the images will be saved.
        df_path (str): Optional. If provided, the images from a results dataframe will be scraped. If not provided, results stored in the current
                                 object will be used.

        Running this returns nothing.
        '''
        pass
        

# Scrape

In [80]:
# Start a new scrape of the whole site (initialize defaults to True); each scraped subcategory url is printed
scraper = CooksThesaurusScraper()
scraper.scrape()

Scraping: http://www.foodsubs.com/Roots.html
Scraping: http://www.foodsubs.com/Tubers.html
Scraping: http://www.foodsubs.com/Potatoes.html
Scraping: http://www.foodsubs.com/Sweetpotatoes.html
Scraping: http://www.foodsubs.com/Stalk.html
Scraping: http://www.foodsubs.com/Onions.html
Scraping: http://www.foodsubs.com/Garlic.html
Scraping: http://www.foodsubs.com/Ginger.html
Scraping: http://www.foodsubs.com/Cabbage.html
Scraping: http://www.foodsubs.com/Greensld.html
Scraping: http://www.foodsubs.com/Greenckg.html
Scraping: http://www.foodsubs.com/Vegiesinflor.html
Scraping: http://www.foodsubs.com/Snapbean.html
Scraping: http://www.foodsubs.com/Pods.html
Scraping: http://www.foodsubs.com/Shellpeas.html
Scraping: http://www.foodsubs.com/Shellbeans.html
Scraping: http://www.foodsubs.com/Mushroom.html
Scraping: http://www.foodsubs.com/Fruitvegies.html
Scraping: http://www.foodsubs.com/Seaveg.html
Scraping: http://www.foodsubs.com/Sprouts.html
Scraping: http://www.foodsubs.com/Vegies.html
S

In [None]:
# Resume an interrupted scrape: initialize=False restores the scraper state from the checkpoint file
scraper = CooksThesaurusScraper(False)
scraper.scrape()

In [98]:
# Scrape one subcategory page only; passing its name lets the scraper unpack nested subcategories (nothing is saved to disk)
scraper = CooksThesaurusScraper()
scraper.scrape_subcat( 'http://www.foodsubs.com/Fruitvegies.html','fruit vegetables' )

(133, 9)

In [99]:
# Dry run over one category: with debug=True the subcategory pages are iterated over but not scraped
scraper = CooksThesaurusScraper()
scraper.scrape_cat( 'http://www.foodsubs.com/FGVegetables.html',debug=True )

In [109]:
# Check whether the normalized link text of the subcategory tag at index 20 is listed in unpack_subcat
scraper.tags['subcat'][20].text.lower().replace('\n',' ').strip() in unpack_subcat

True