In [1]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import requests
import os
import shutil
import time
import re
import json
import glob

URL = "https://www.winemag.com/?s=&drink_type=wine&pub_date_web={1}&page={0}"
HEADERS = {"user-agent": ("Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3")}
# Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/85 Version/11.1.1 Safari/605.1.15
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36
# Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3
session = requests.Session()
UNKNOWN_FORMAT = 0
APPELLATION_FORMAT_0 = 1
APPELLATION_FORMAT_1 = 2
APPELLATION_FORMAT_2 = 3

In [2]:
class Scraper:
    
    def __init__(self, pages_to_scrape=(1, 1), num_jobs=1, clear_old_data=True, year=2020):
        self.pages_to_scrape = pages_to_scrape
        self.num_jobs = num_jobs
        self.clear_old_data = clear_old_data
        self.session = requests.Session()
        self.start_time = time.time()
        self.page_scraped = pages_to_scrape[0]
        self.year = year

        if num_jobs > 1:      # Multi-threaded crawling
            self.multiprocessing = True
            self.worker_pool = Pool(num_jobs)
        else:
            self.multiprocessing = False

    def scrape_site(self):       # scrape data
        if self.clear_old_data:
            self.clear_data_dir()
        if self.multiprocessing:
            link_list = [URL.format(page, self.year)
                         for page in range(self.pages_to_scrape[0], self.pages_to_scrape[1] + 1)]
            records = self.worker_pool.map(self.scrape_page, link_list)
            self.worker_pool.terminate()
            self.worker_pool.join()
        else:
            for page in range(self.pages_to_scrape[0], self.pages_to_scrape[1] + 1):
                self.scrape_page(URL.format(page, self.year))
        self.condense_data()
        print("Scrape Finished...")
        
    def condense_data(self):       # condense data
        condensed_data = []
        all_files = glob.glob("{}/*.json".format("data"))    
        # The data of each page is stored in a json file, under the data package
        for file in all_files:
            with open(file, "rb") as fin:
                condensed_data += json.load(fin)
        print('\n\nThere is', len(condensed_data), 'data totally.')    # The length of total dataset
        filename = "{}.json".format("winemag dataset")    # The final json file
        with open(filename, "w") as fout:
            json.dump(condensed_data, fout)

    def scrape_page(self, page_url, isolated_review_count=0, retry_count=0):       # scrape every page
        scrape_data = []
        try:
            response = self.session.get(page_url, headers=HEADERS)
        except:
            retry_count += 1
            if retry_count <= 3:
                self.session = requests.Session()
                self.scrape_page(page_url, isolated_review_count, retry_count)
            else:
                raise

        soup = BeautifulSoup(response.content, "html.parser")
        reviews = soup.find_all("li", {"class": "review-item"})
        for review in reviews:
            isolated_review_count += 1
            review_url = review.find("a", {"class": "review-listing"})["href"]
            try:
                review_data = self.scrape_review(review_url)      # scrape review url
            except Exception as e:
                print("Encountered error", e)
                continue
            scrape_data.append(review_data)
        self.page_scraped += 1
#         time.sleep(3)
        self.update_scrape_status()       # tell which page has been scraped
        self.save_data(scrape_data)       # save to json file for page

    def scrape_review(self, review_url):
        review_response = self.session.get(review_url, headers=HEADERS)
        review_soup = BeautifulSoup(review_response.content, "html.parser")
        try:
            return self.parse_review(review_soup)
        except ReviewFormatException as e:
            print(f"\n-----\nError parsing: {review_url}\n{e.message}\n-----")

    def parse_review(self, review_soup):
        review_format = self.determine_review_format(review_soup)
        points = review_soup.find("span", {"id": "points"}).contents[0]      # points
        title = review_soup.title.string.split(" Rating")[0]      # title
        description = review_soup.find("p", {"class": "description"}).contents[0]      # description
        info_containers = review_soup.find("ul", {"class": "primary-info"}).find_all("li", {"class": "row"})

        if review_format["price_index"] is not None:
            try:
                price_string = (info_containers[review_format["price_index"]].find("div", {"class": "info"})
                                .span.span.contents[0].split(",")[0])
            except:
                raise ReviewFormatException("Unexpected price format")
            try:
                price = int(re.sub("[$]", "", price_string))  # Sometimes price is N/A     # price
            except ValueError:
                price = None
        else:
            price = None

        if review_format["variety_index"] is not None:
            try:
                variety = (info_containers[review_format["variety_index"]].find("div", {"class": "info"})
                           .span.findChildren()[0].contents[0])      # variety
            except:
                raise ReviewFormatException("Unexpected variety format")
        else:
            variety = None
        
        # Different countries have different appellation format.
        if review_format["appellation_index"] is not None:
            appellation_info = (info_containers[review_format["appellation_index"]].find("div", {"class": "info"})
                                .span.findChildren())
            try:
                if review_format["appellation_format"] == APPELLATION_FORMAT_0:
                    province = appellation_info[0].contents[0]           # province
                    country = appellation_info[1].contents[0]            # country
                elif review_format["appellation_format"] == APPELLATION_FORMAT_1:
                    province = appellation_info[1].contents[0]
                    country = appellation_info[2].contents[0]
                elif review_format["appellation_format"] == APPELLATION_FORMAT_2:
                    province = appellation_info[2].contents[0]
                    country = appellation_info[3].contents[0]
                else:
                    province = None
                    country = None
            except:
                raise ReviewFormatException("Unknown appellation format")
        else:
            province = None
            country = None

        if review_format["winery_index"] is not None:
            try:
                winery = (info_containers[review_format["winery_index"]].find("div", {"class": "info"})
                          .span.span.findChildren()[0].contents[0])        # winery
            except:
                raise ReviewFormatException("Unexpected winery format")
        else:
            winery = None

        taster_url = review_soup.find("span", {"class": "taster-area"}).contents[0]["href"]
        try:
            taster_name = self.scrape_taster(taster_url)        # taster_name
        except Exception as e:
            print("Encountered error", e)

        review_data = {"points": points, "title": title, "description": description, "taster_name": taster_name,
                       "price": price, "variety": variety, "province": province, "country": country, "winery": winery}
        return review_data

    def scrape_taster(self, taster_url):
        taster_response = self.session.get(taster_url, headers=HEADERS)
        taster_soup = BeautifulSoup(taster_response.content, "html.parser")
        try:
            return self.parse_taster(taster_soup)
        except ReviewFormatException as e:
            print(f"\n-----\nError parsing: {taster_url}\n{e.message}\n-----")

    def parse_taster(self, taster_soup):
        name = taster_soup.title.string.split(" |")[0]
        return name

    def determine_review_format(self, review_soup):
        review_format = {}
        info_containers = review_soup.find("ul", {"class": "primary-info"}).find_all("li", {"class": "row"})

        review_info = []
        for container in info_containers:
            review_info.append(str(container.find("span").contents[0]).lower())

        try:
            review_format["price_index"] = review_info.index("price")
        except ValueError:
            review_format["price_index"] = None
        try:
            review_format["variety_index"] = review_info.index("variety")
        except ValueError:
            review_format["variety_index"] = None
        try:
            review_format["appellation_index"] = review_info.index("appellation")
        except ValueError:
            review_format["appellation_index"] = None
        try:
            review_format["winery_index"] = review_info.index("winery")
        except ValueError:
            review_format["winery_index"] = None

        # The appellation format changes based on where in the world the winery is located
        if review_format["appellation_index"] is not None:
            appellation_info = (info_containers[review_format["appellation_index"]].find("div", {"class": "info"})
                                .span.findChildren())
            if len(appellation_info) == 2:
                review_format["appellation_format"] = APPELLATION_FORMAT_0
            elif len(appellation_info) == 3:
                review_format["appellation_format"] = APPELLATION_FORMAT_1
            elif len(appellation_info) == 4:
                review_format["appellation_format"] = APPELLATION_FORMAT_2
            else:
                review_format["appellation_format"] = UNKNOWN_FORMAT
        return review_format

    def save_data(self, data):           # save to json file for page
        filename = "{}/{}_{}.json".format("data", "winemag dataset", time.time())
        try:
            os.makedirs("data")
        except OSError:
            pass
        with open(filename, "w") as fout:
            json.dump(data, fout)

    def clear_all_data(self):
        self.clear_data_dir()
        self.clear_output_data()

    def clear_data_dir(self):
        try:
            shutil.rmtree("data")       # delete an entire directory tree, path is point to "data" directory
        except FileNotFoundError:
            pass

    def clear_output_data(self):
        try:
            os.remove("{}.json".format("winemag dataset"))
        except FileNotFoundError:
            pass

    def update_scrape_status(self):          # print which page has been scraped
        print("{0} page is finished.\r".format(self.page_scraped))

In [4]:
if __name__ == "__main__":
    pages_to_scrape = (0, 1999)    # 2000 pages (page 1-2000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=3, clear_old_data=False)
    winmag_scraper.scrape_site()   # stop at "1908 page is finished."

1 page is finished.
2 page is finished.
3 page is finished.
4 page is finished.
5 page is finished.
6 page is finished.
7 page is finished.
8 page is finished.
9 page is finished.
10 page is finished.
11 page is finished.
12 page is finished.
13 page is finished.
14 page is finished.
15 page is finished.
16 page is finished.
17 page is finished.
18 page is finished.
19 page is finished.
20 page is finished.
21 page is finished.
22 page is finished.
23 page is finished.
24 page is finished.
25 page is finished.
26 page is finished.
27 page is finished.
28 page is finished.
29 page is finished.
30 page is finished.
31 page is finished.
32 page is finished.
33 page is finished.
34 page is finished.
35 page is finished.
36 page is finished.
37 page is finished.
38 page is finished.
39 page is finished.
40 page is finished.
41 page is finished.
42 page is finished.
43 page is finished.
44 page is finished.
45 page is finished.
46 page is finished.
47 page is finished.
48 page is finished.
4

376 page is finished.
377 page is finished.
378 page is finished.
379 page is finished.
380 page is finished.
381 page is finished.
382 page is finished.
383 page is finished.
384 page is finished.
385 page is finished.
386 page is finished.
387 page is finished.
388 page is finished.
389 page is finished.
390 page is finished.
391 page is finished.
392 page is finished.
393 page is finished.
394 page is finished.
395 page is finished.
396 page is finished.
397 page is finished.
398 page is finished.
399 page is finished.
400 page is finished.
401 page is finished.
402 page is finished.
403 page is finished.
404 page is finished.
405 page is finished.
406 page is finished.
407 page is finished.
408 page is finished.
409 page is finished.
410 page is finished.
411 page is finished.
412 page is finished.
413 page is finished.
414 page is finished.
415 page is finished.
416 page is finished.
417 page is finished.
418 page is finished.
419 page is finished.
420 page is finished.
421 page i

749 page is finished.
750 page is finished.
751 page is finished.
752 page is finished.
753 page is finished.
754 page is finished.
755 page is finished.
756 page is finished.
757 page is finished.
758 page is finished.
759 page is finished.
760 page is finished.
761 page is finished.
762 page is finished.
763 page is finished.
764 page is finished.
765 page is finished.
766 page is finished.
767 page is finished.
768 page is finished.
769 page is finished.
770 page is finished.
771 page is finished.
772 page is finished.
773 page is finished.
774 page is finished.
775 page is finished.
776 page is finished.
777 page is finished.
778 page is finished.
779 page is finished.
780 page is finished.
781 page is finished.
782 page is finished.
783 page is finished.
784 page is finished.
785 page is finished.
786 page is finished.
787 page is finished.
788 page is finished.
789 page is finished.
790 page is finished.
791 page is finished.
792 page is finished.
793 page is finished.
794 page i

1117 page is finished.
1118 page is finished.
1119 page is finished.
1120 page is finished.
1121 page is finished.
1122 page is finished.
1123 page is finished.
1124 page is finished.
1125 page is finished.
1126 page is finished.
1127 page is finished.
1128 page is finished.
1129 page is finished.
1130 page is finished.
1131 page is finished.
1132 page is finished.
1133 page is finished.
1134 page is finished.
1135 page is finished.
1136 page is finished.
1137 page is finished.
1138 page is finished.
1139 page is finished.
1140 page is finished.
1141 page is finished.
1142 page is finished.
1143 page is finished.
1144 page is finished.
1145 page is finished.
1146 page is finished.
1147 page is finished.
1148 page is finished.
1149 page is finished.
1150 page is finished.
1151 page is finished.
1152 page is finished.
1153 page is finished.
1154 page is finished.
1155 page is finished.
1156 page is finished.
1157 page is finished.
1158 page is finished.
1159 page is finished.
1160 page i

1480 page is finished.

1481 page is finished.
1482 page is finished.
1483 page is finished.
1484 page is finished.
1485 page is finished.
1486 page is finished.
1487 page is finished.
1488 page is finished.
1489 page is finished.
1490 page is finished.
1491 page is finished.
1492 page is finished.
1493 page is finished.
1494 page is finished.
1495 page is finished.
1496 page is finished.
1497 page is finished.
1498 page is finished.
1499 page is finished.
1500 page is finished.
1501 page is finished.
1502 page is finished.
1503 page is finished.
1504 page is finished.
1505 page is finished.
1506 page is finished.
1507 page is finished.
1508 page is finished.
1509 page is finished.
1510 page is finished.
1511 page is finished.
1512 page is finished.
1513 page is finished.
1514 page is finished.
1515 page is finished.
1516 page is finished.
1517 page is finished.
1518 page is finished.
1519 page is finished.
1520 page is finished.
1521 page is finished.
1522 page is finished.
1523 page 

1837 page is finished.
1838 page is finished.
1839 page is finished.
1840 page is finished.
1841 page is finished.
1842 page is finished.
1843 page is finished.
1844 page is finished.
1845 page is finished.
1846 page is finished.
1847 page is finished.
1848 page is finished.
1849 page is finished.
1850 page is finished.
1851 page is finished.
1852 page is finished.
1853 page is finished.
1854 page is finished.
1855 page is finished.
1856 page is finished.
1857 page is finished.
1858 page is finished.
1859 page is finished.
1860 page is finished.
1861 page is finished.
1862 page is finished.
1863 page is finished.
1864 page is finished.
1865 page is finished.
1866 page is finished.
1867 page is finished.
1868 page is finished.
1869 page is finished.
1870 page is finished.
1871 page is finished.
1872 page is finished.
1873 page is finished.
1874 page is finished.
1875 page is finished.
1876 page is finished.
1877 page is finished.
1878 page is finished.
1879 page is finished.
1880 page i

TypeError: 'NoneType' object is not subscriptable

In [5]:
if __name__ == "__main__":
    pages_to_scrape = (1908, 1999)   # 2000 pages (page 1-2000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=3, clear_old_data=False)
    winmag_scraper.scrape_site()  

1909 page is finished.
1910 page is finished.
1911 page is finished.
1913 page is finished.

1914 page is finished.
1915 page is finished.
1916 page is finished.
1917 page is finished.
1918 page is finished.
1919 page is finished.
1920 page is finished.
1921 page is finished.
1922 page is finished.
1923 page is finished.
1924 page is finished.
1925 page is finished.
1926 page is finished.
1927 page is finished.
1928 page is finished.
1929 page is finished.
1930 page is finished.
1931 page is finished.
1932 page is finished.
1933 page is finished.
1934 page is finished.
1935 page is finished.
1936 page is finished.
1937 page is finished.
1938 page is finished.
1939 page is finished.
1940 page is finished.
1941 page is finished.
1942 page is finished.
1943 page is finished.
1944 page is finished.
1945 page is finished.
1946 page is finished.
1947 page is finished.
1948 page is finished.
1949 page is finished.
1950 page is finished.
1951 page is finished.
1952 page is finished.
1953 page 

In [4]:
if __name__ == "__main__":
    pages_to_scrape = (2000, 3999)   # 2000 pages (page 2001-4000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=5, clear_old_data=False)
    winmag_scraper.scrape_site()     # stop at "2500 page is finished."

2001 page is finished.
2002 page is finished.
2003 page is finished.
2004 page is finished.
2005 page is finished.
2006 page is finished.
2007 page is finished.
2008 page is finished.
2009 page is finished.
2010 page is finished.
2011 page is finished.
2012 page is finished.
2013 page is finished.
2014 page is finished.
2015 page is finished.
2016 page is finished.
2017 page is finished.
2018 page is finished.
2019 page is finished.
2020 page is finished.
2021 page is finished.
2022 page is finished.
2023 page is finished.
2024 page is finished.
2025 page is finished.
2026 page is finished.
2027 page is finished.
2028 page is finished.
2029 page is finished.
2030 page is finished.
2031 page is finished.
2032 page is finished.
2033 page is finished.
2034 page is finished.
2035 page is finished.
2036 page is finished.
2037 page is finished.
2038 page is finished.
2039 page is finished.
2040 page is finished.
2041 page is finished.
2042 page is finished.
2043 page is finished.
2044 page i

2360 page is finished.
2361 page is finished.
2362 page is finished.
2363 page is finished.
2364 page is finished.
2365 page is finished.
2366 page is finished.
2367 page is finished.
2368 page is finished.
2369 page is finished.
2370 page is finished.
2371 page is finished.
2372 page is finished.
2373 page is finished.
2374 page is finished.
2375 page is finished.
2376 page is finished.
2377 page is finished.
2378 page is finished.
2379 page is finished.
2380 page is finished.
2381 page is finished.
2382 page is finished.
2383 page is finished.
2384 page is finished.
2385 page is finished.
2386 page is finished.
2387 page is finished.
2388 page is finished.
2389 page is finished.
2390 page is finished.
2391 page is finished.
2392 page is finished.
2393 page is finished.
2394 page is finished.
2395 page is finished.
2396 page is finished.
2397 page is finished.
2398 page is finished.
2399 page is finished.
2400 page is finished.
2401 page is finished.
2402 page is finished.
2403 page i

ConnectionError: HTTPSConnectionPool(host='www.winemag.com', port=443): Max retries exceeded with url: /?s=&drink_type=wine&pub_date_web=2020&page=2100 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fc829e2ec10>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [8]:
if __name__ == "__main__":
    pages_to_scrape = (2500, 3999)   # 1000 pages (page 2001-4000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=5, clear_old_data=False)
    winmag_scraper.scrape_site()   

2501 page is finished.
2502 page is finished.
2503 page is finished.
2504 page is finished.
2505 page is finished.
2506 page is finished.
2507 page is finished.
2508 page is finished.
2509 page is finished.
2510 page is finished.
2511 page is finished.
2512 page is finished.
2513 page is finished.
2514 page is finished.
2515 page is finished.
2516 page is finished.
2517 page is finished.
2518 page is finished.
2519 page is finished.
2520 page is finished.
2521 page is finished.
2522 page is finished.
2523 page is finished.
2524 page is finished.
2525 page is finished.
2526 page is finished.
2527 page is finished.
2528 page is finished.
2529 page is finished.
2530 page is finished.
2531 page is finished.
2532 page is finished.
2533 page is finished.
2534 page is finished.
2535 page is finished.
2536 page is finished.
2537 page is finished.
2538 page is finished.
2539 page is finished.
2540 page is finished.
2541 page is finished.
2543 page is finished.

2544 page is finished.
2545 page 

2863 page is finished.
2864 page is finished.
2865 page is finished.
2866 page is finished.
2867 page is finished.
2868 page is finished.
2869 page is finished.
2870 page is finished.
2871 page is finished.
2872 page is finished.
2873 page is finished.
2874 page is finished.
2875 page is finished.
2876 page is finished.
2877 page is finished.
2878 page is finished.
2879 page is finished.
2880 page is finished.
2881 page is finished.
2882 page is finished.
2883 page is finished.
2884 page is finished.
2885 page is finished.
2886 page is finished.
2887 page is finished.
2888 page is finished.
2889 page is finished.
2890 page is finished.
2891 page is finished.
2892 page is finished.
2893 page is finished.
2894 page is finished.
2895 page is finished.
2896 page is finished.
2897 page is finished.
2898 page is finished.
2899 page is finished.
2900 page is finished.
2901 page is finished.
2902 page is finished.
2903 page is finished.
2904 page is finished.
2905 page is finished.
2906 page i

3220 page is finished.
3221 page is finished.
3222 page is finished.
3223 page is finished.
3224 page is finished.
3225 page is finished.
3226 page is finished.
3227 page is finished.
3228 page is finished.
3229 page is finished.
3230 page is finished.
3231 page is finished.
3232 page is finished.
3233 page is finished.
3234 page is finished.
3235 page is finished.
3236 page is finished.
3237 page is finished.
3238 page is finished.
3239 page is finished.
3240 page is finished.
3242 page is finished.

3243 page is finished.
3244 page is finished.
3245 page is finished.
3246 page is finished.
3247 page is finished.
3248 page is finished.
3249 page is finished.
3250 page is finished.
3251 page is finished.
3252 page is finished.
3253 page is finished.
3254 page is finished.
3255 page is finished.
3256 page is finished.
3257 page is finished.
3258 page is finished.
3259 page is finished.
3260 page is finished.
3261 page is finished.
3262 page is finished.
3263 page is finished.
3264 page 

3578 page is finished.
3579 page is finished.
3580 page is finished.
3581 page is finished.
3582 page is finished.
3583 page is finished.
3584 page is finished.
3585 page is finished.
3586 page is finished.
3587 page is finished.
3588 page is finished.
3589 page is finished.
3590 page is finished.
3591 page is finished.
3592 page is finished.
3593 page is finished.
3594 page is finished.
3595 page is finished.
3596 page is finished.
3597 page is finished.
3598 page is finished.
3599 page is finished.
3600 page is finished.
3601 page is finished.
3602 page is finished.
3603 page is finished.
3604 page is finished.
3605 page is finished.
3606 page is finished.
3607 page is finished.
3608 page is finished.
3609 page is finished.
3610 page is finished.
3611 page is finished.
3612 page is finished.
3613 page is finished.
3614 page is finished.
3615 page is finished.
3616 page is finished.
3617 page is finished.
3618 page is finished.
3619 page is finished.
3620 page is finished.
3621 page i

3939 page is finished.
3940 page is finished.
3941 page is finished.
3942 page is finished.
3943 page is finished.
3944 page is finished.
3945 page is finished.
3946 page is finished.
3947 page is finished.
3948 page is finished.
3949 page is finished.
3950 page is finished.
3951 page is finished.
3952 page is finished.
3953 page is finished.
3954 page is finished.
3955 page is finished.
3956 page is finished.
3957 page is finished.
3958 page is finished.
3959 page is finished.
3960 page is finished.
3961 page is finished.
3962 page is finished.
3963 page is finished.
3964 page is finished.
3965 page is finished.
3966 page is finished.
3967 page is finished.
3968 page is finished.
3969 page is finished.
3970 page is finished.
3971 page is finished.
3972 page is finished.
3973 page is finished.
3974 page is finished.
3975 page is finished.
3976 page is finished.
3977 page is finished.
3978 page is finished.
3979 page is finished.
3980 page is finished.
3981 page is finished.
3982 page i

In [9]:
if __name__ == "__main__":
    pages_to_scrape = (4000, 6999)   # 3000 pages (page 4001-7000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=5, clear_old_data=False)
    winmag_scraper.scrape_site()   

4001 page is finished.
4002 page is finished.
4003 page is finished.
4004 page is finished.
4005 page is finished.
4006 page is finished.
4007 page is finished.
4008 page is finished.
4009 page is finished.
4010 page is finished.
4011 page is finished.
4012 page is finished.
4013 page is finished.
4014 page is finished.
4015 page is finished.
4016 page is finished.
4017 page is finished.
4018 page is finished.
4019 page is finished.
4020 page is finished.
4021 page is finished.
4022 page is finished.
4023 page is finished.
4024 page is finished.
4025 page is finished.
4026 page is finished.
4027 page is finished.
4028 page is finished.
4029 page is finished.
4030 page is finished.
4031 page is finished.
4032 page is finished.
4033 page is finished.
4034 page is finished.
4035 page is finished.
4036 page is finished.
4037 page is finished.
4038 page is finished.
4039 page is finished.
4040 page is finished.
4041 page is finished.
4042 page is finished.
4043 page is finished.
4044 page i

4358 page is finished.
4359 page is finished.
4360 page is finished.
4361 page is finished.
4362 page is finished.
4363 page is finished.
4364 page is finished.
4365 page is finished.
4366 page is finished.
4367 page is finished.
4368 page is finished.
4369 page is finished.
4370 page is finished.
4371 page is finished.
4372 page is finished.
4373 page is finished.
4374 page is finished.
4375 page is finished.
4376 page is finished.
4377 page is finished.
4378 page is finished.
4379 page is finished.
4380 page is finished.
4381 page is finished.
4382 page is finished.
4383 page is finished.
4384 page is finished.
4385 page is finished.
4386 page is finished.
4387 page is finished.
4388 page is finished.
4389 page is finished.
4390 page is finished.
4391 page is finished.
4392 page is finished.
4393 page is finished.
4394 page is finished.
4395 page is finished.
4396 page is finished.
4397 page is finished.
4398 page is finished.
4399 page is finished.
4400 page is finished.
4401 page i

4717 page is finished.
4718 page is finished.
4719 page is finished.
4720 page is finished.
4721 page is finished.
4722 page is finished.
4723 page is finished.
4724 page is finished.
4725 page is finished.
4726 page is finished.
4727 page is finished.
4728 page is finished.
4729 page is finished.
4730 page is finished.
4731 page is finished.
4732 page is finished.
4733 page is finished.
4734 page is finished.
4735 page is finished.
4736 page is finished.
4737 page is finished.
4738 page is finished.
4739 page is finished.
4740 page is finished.
4741 page is finished.
4742 page is finished.
4743 page is finished.
4744 page is finished.
4745 page is finished.
4746 page is finished.
4747 page is finished.
4748 page is finished.
4749 page is finished.
4750 page is finished.
4751 page is finished.
4752 page is finished.
4753 page is finished.
4754 page is finished.
4755 page is finished.
4756 page is finished.
4757 page is finished.
4758 page is finished.
4759 page is finished.
4760 page i

5075 page is finished.
5076 page is finished.
5077 page is finished.
5078 page is finished.
5079 page is finished.
5080 page is finished.
5081 page is finished.
5082 page is finished.
5083 page is finished.
5084 page is finished.
5085 page is finished.
5086 page is finished.
5087 page is finished.
5088 page is finished.
5089 page is finished.
5090 page is finished.
5091 page is finished.
5092 page is finished.
5093 page is finished.
5094 page is finished.
5095 page is finished.
5096 page is finished.
5097 page is finished.
5098 page is finished.
5099 page is finished.
5100 page is finished.
5101 page is finished.
5102 page is finished.
5103 page is finished.
5104 page is finished.
5105 page is finished.
5106 page is finished.
5107 page is finished.
5108 page is finished.
5109 page is finished.
5110 page is finished.
5111 page is finished.
5112 page is finished.
5113 page is finished.
5114 page is finished.
5115 page is finished.
5116 page is finished.
5117 page is finished.
5118 page i

5434 page is finished.
5435 page is finished.
5436 page is finished.
5437 page is finished.
5438 page is finished.
5439 page is finished.
5440 page is finished.
5441 page is finished.
5442 page is finished.
5443 page is finished.
5444 page is finished.
5445 page is finished.
5446 page is finished.
5447 page is finished.
5448 page is finished.
5449 page is finished.
5450 page is finished.
5451 page is finished.
5452 page is finished.
5453 page is finished.
5454 page is finished.
5455 page is finished.
5456 page is finished.
5457 page is finished.
5458 page is finished.
5459 page is finished.
5460 page is finished.
5461 page is finished.
5463 page is finished.

5464 page is finished.
5465 page is finished.
5466 page is finished.
5468 page is finished.

5469 page is finished.
5470 page is finished.
5471 page is finished.
5472 page is finished.
5473 page is finished.
5474 page is finished.
5475 page is finished.
5476 page is finished.
5477 page is finished.
5478 page is finished.
5479 page

5795 page is finished.
5796 page is finished.
5797 page is finished.
5798 page is finished.
5799 page is finished.
5800 page is finished.
5801 page is finished.
5802 page is finished.
5803 page is finished.
5804 page is finished.
5805 page is finished.
5806 page is finished.
5807 page is finished.
5808 page is finished.
5809 page is finished.
5810 page is finished.
5811 page is finished.
5812 page is finished.
5813 page is finished.
5814 page is finished.
5815 page is finished.
5816 page is finished.
5817 page is finished.
5818 page is finished.
5819 page is finished.
5820 page is finished.
5821 page is finished.
5822 page is finished.
5823 page is finished.
5824 page is finished.
5825 page is finished.
5826 page is finished.
5827 page is finished.
5828 page is finished.
5829 page is finished.
5830 page is finished.
5831 page is finished.
5832 page is finished.
5833 page is finished.
5834 page is finished.
5835 page is finished.
5836 page is finished.
5837 page is finished.
5838 page i

6153 page is finished.
6154 page is finished.
6155 page is finished.
6156 page is finished.
6157 page is finished.
6158 page is finished.
6159 page is finished.
6160 page is finished.
6161 page is finished.
6162 page is finished.
6163 page is finished.
6164 page is finished.
6165 page is finished.
6166 page is finished.
6167 page is finished.
6168 page is finished.
6169 page is finished.
6170 page is finished.
6171 page is finished.
6172 page is finished.
6173 page is finished.
6174 page is finished.
6175 page is finished.
6176 page is finished.
6177 page is finished.
6178 page is finished.
6179 page is finished.
6180 page is finished.
6181 page is finished.
6182 page is finished.
6183 page is finished.
6184 page is finished.
6185 page is finished.
6186 page is finished.
6187 page is finished.
6188 page is finished.
6189 page is finished.
6190 page is finished.
6191 page is finished.
6192 page is finished.
6193 page is finished.
6194 page is finished.
6195 page is finished.
6196 page i

6511 page is finished.
6512 page is finished.
6513 page is finished.
6514 page is finished.
6515 page is finished.
6516 page is finished.
6517 page is finished.
6518 page is finished.
6519 page is finished.
6520 page is finished.
6521 page is finished.
6522 page is finished.
6523 page is finished.
6524 page is finished.
6525 page is finished.
6526 page is finished.
6527 page is finished.
6528 page is finished.
6529 page is finished.
6530 page is finished.
6531 page is finished.
6532 page is finished.
6533 page is finished.
6534 page is finished.
6535 page is finished.
6536 page is finished.
6537 page is finished.
6538 page is finished.
6539 page is finished.
6540 page is finished.
6541 page is finished.
6542 page is finished.
6543 page is finished.
6544 page is finished.
6545 page is finished.
6546 page is finished.
6547 page is finished.
6548 page is finished.
6549 page is finished.
6550 page is finished.
6551 page is finished.
6552 page is finished.
6553 page is finished.
6554 page i

6869 page is finished.
6870 page is finished.
6871 page is finished.
6872 page is finished.
6873 page is finished.
6874 page is finished.
6875 page is finished.
6876 page is finished.
6877 page is finished.
6878 page is finished.
6879 page is finished.
6880 page is finished.
6881 page is finished.
6882 page is finished.
6883 page is finished.
6884 page is finished.
6885 page is finished.
6886 page is finished.
6887 page is finished.
6888 page is finished.
6889 page is finished.
6890 page is finished.
6891 page is finished.
6892 page is finished.
6893 page is finished.
6894 page is finished.
6895 page is finished.
6896 page is finished.
6897 page is finished.
6898 page is finished.
6899 page is finished.
6900 page is finished.
6901 page is finished.
6902 page is finished.
6903 page is finished.
6904 page is finished.
6905 page is finished.
6906 page is finished.
6907 page is finished.
6908 page is finished.
6909 page is finished.
6910 page is finished.
6911 page is finished.
6912 page i

In [10]:
if __name__ == "__main__":
    pages_to_scrape = (7000, 9999)   # 3000 pages (page 7001-10000) are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=5, clear_old_data=False)
    winmag_scraper.scrape_site()   

7001 page is finished.
7002 page is finished.
7003 page is finished.
7004 page is finished.
7005 page is finished.
7006 page is finished.
7007 page is finished.
7008 page is finished.
7009 page is finished.
7010 page is finished.
7011 page is finished.
7012 page is finished.
7013 page is finished.
7014 page is finished.
7015 page is finished.
7016 page is finished.
7017 page is finished.
7018 page is finished.
7019 page is finished.
7020 page is finished.
7021 page is finished.
7022 page is finished.
7023 page is finished.
7024 page is finished.
7025 page is finished.
7026 page is finished.
7027 page is finished.
7028 page is finished.
7029 page is finished.
7030 page is finished.
7031 page is finished.
7032 page is finished.
7033 page is finished.
7034 page is finished.
7035 page is finished.
7036 page is finished.
7037 page is finished.
7038 page is finished.
7039 page is finished.
7040 page is finished.
7041 page is finished.
7042 page is finished.
7043 page is finished.
7044 page i

7359 page is finished.
7360 page is finished.
7361 page is finished.
7362 page is finished.
7363 page is finished.
7364 page is finished.
7365 page is finished.
7366 page is finished.
7367 page is finished.
7368 page is finished.
7369 page is finished.
7370 page is finished.
7371 page is finished.
7372 page is finished.
7373 page is finished.
7374 page is finished.
7375 page is finished.
7376 page is finished.
7377 page is finished.
7378 page is finished.
7379 page is finished.
7380 page is finished.
7381 page is finished.
7382 page is finished.
7383 page is finished.
7384 page is finished.
7385 page is finished.
7386 page is finished.
7387 page is finished.
7388 page is finished.
7389 page is finished.
7390 page is finished.
7391 page is finished.
7392 page is finished.
7393 page is finished.
7394 page is finished.
7395 page is finished.
7396 page is finished.
7397 page is finished.
7398 page is finished.
7399 page is finished.
7400 page is finished.
7401 page is finished.
7402 page i

7716 page is finished.
7717 page is finished.
7718 page is finished.
7719 page is finished.
7720 page is finished.
7721 page is finished.
7722 page is finished.
7723 page is finished.
7724 page is finished.
7725 page is finished.
7726 page is finished.
7727 page is finished.
7728 page is finished.
7729 page is finished.
7730 page is finished.
7731 page is finished.
7732 page is finished.
7733 page is finished.
7734 page is finished.
7735 page is finished.
7736 page is finished.
7737 page is finished.
7738 page is finished.
7739 page is finished.
7740 page is finished.
7741 page is finished.
7742 page is finished.
7743 page is finished.
7744 page is finished.
7745 page is finished.
7746 page is finished.
7747 page is finished.
7748 page is finished.
7749 page is finished.
7750 page is finished.
7751 page is finished.
7752 page is finished.
7753 page is finished.
7754 page is finished.
7755 page is finished.
7756 page is finished.
7757 page is finished.
7758 page is finished.
7759 page i

8077 page is finished.
8078 page is finished.
8079 page is finished.
8080 page is finished.
8081 page is finished.
8082 page is finished.
8083 page is finished.
8084 page is finished.
8085 page is finished.
8086 page is finished.
8087 page is finished.
8088 page is finished.
8089 page is finished.
8090 page is finished.
8091 page is finished.
8092 page is finished.
8093 page is finished.
8094 page is finished.
8095 page is finished.
8096 page is finished.
8097 page is finished.
8098 page is finished.
8099 page is finished.
8100 page is finished.
8101 page is finished.
8102 page is finished.
8103 page is finished.
8104 page is finished.
8105 page is finished.
8106 page is finished.
8107 page is finished.
8108 page is finished.
8109 page is finished.
8110 page is finished.
8111 page is finished.
8112 page is finished.
8113 page is finished.
8114 page is finished.
8115 page is finished.
8116 page is finished.
8117 page is finished.
8118 page is finished.
8119 page is finished.
8120 page i

8436 page is finished.
8437 page is finished.
8438 page is finished.
8439 page is finished.
8440 page is finished.
8441 page is finished.
8442 page is finished.
8443 page is finished.
8444 page is finished.
8445 page is finished.
8446 page is finished.
8447 page is finished.
8448 page is finished.
8449 page is finished.
8450 page is finished.
8451 page is finished.
8452 page is finished.
8453 page is finished.
8454 page is finished.
8455 page is finished.
8456 page is finished.
8457 page is finished.
8458 page is finished.
8459 page is finished.
8460 page is finished.
8461 page is finished.
8462 page is finished.
8463 page is finished.
8464 page is finished.
8465 page is finished.
8466 page is finished.
8467 page is finished.
8468 page is finished.
8469 page is finished.
8470 page is finished.
8471 page is finished.
8472 page is finished.
8473 page is finished.
8474 page is finished.
8475 page is finished.
8476 page is finished.
8477 page is finished.
8478 page is finished.
8479 page i

8797 page is finished.
8798 page is finished.
8799 page is finished.
8800 page is finished.
8801 page is finished.
8802 page is finished.
8803 page is finished.
8804 page is finished.
8805 page is finished.
8806 page is finished.
8807 page is finished.
8808 page is finished.
8809 page is finished.
8810 page is finished.
8811 page is finished.
8812 page is finished.
8813 page is finished.
8814 page is finished.
8815 page is finished.
8816 page is finished.
8817 page is finished.
8818 page is finished.
8819 page is finished.
8821 page is finished.

8822 page is finished.
8823 page is finished.
8824 page is finished.
8825 page is finished.
8826 page is finished.
8827 page is finished.
8828 page is finished.
8829 page is finished.
8830 page is finished.
8831 page is finished.
8832 page is finished.
8833 page is finished.
8834 page is finished.
8835 page is finished.
8836 page is finished.
8837 page is finished.
8838 page is finished.
8839 page is finished.
8840 page is finished.
8841 page 

9158 page is finished.

9159 page is finished.
9160 page is finished.
9161 page is finished.
9162 page is finished.
9163 page is finished.
9164 page is finished.
9165 page is finished.
9166 page is finished.
9167 page is finished.
9168 page is finished.
9169 page is finished.
9170 page is finished.
9171 page is finished.
9172 page is finished.
9173 page is finished.
9174 page is finished.
9175 page is finished.
9176 page is finished.
9177 page is finished.
9178 page is finished.
9179 page is finished.
9180 page is finished.
9181 page is finished.
9182 page is finished.
9183 page is finished.
9184 page is finished.
9185 page is finished.
9186 page is finished.
9187 page is finished.
9188 page is finished.
9189 page is finished.
9190 page is finished.
9191 page is finished.
9192 page is finished.
9193 page is finished.
9194 page is finished.
9195 page is finished.
9196 page is finished.
9197 page is finished.
9198 page is finished.
9199 page is finished.
9200 page is finished.
9201 page 

9516 page is finished.
9517 page is finished.
9518 page is finished.
9519 page is finished.
9520 page is finished.
9521 page is finished.
9522 page is finished.
9523 page is finished.
9524 page is finished.
9525 page is finished.
9526 page is finished.
9527 page is finished.
9528 page is finished.
9529 page is finished.
9530 page is finished.
9531 page is finished.
9532 page is finished.
9533 page is finished.
9534 page is finished.
9535 page is finished.
9536 page is finished.
9537 page is finished.
9538 page is finished.
9539 page is finished.
9540 page is finished.
9541 page is finished.
9542 page is finished.
9543 page is finished.
9544 page is finished.
9545 page is finished.
9546 page is finished.
9547 page is finished.
9548 page is finished.
9549 page is finished.
9550 page is finished.
9551 page is finished.
9552 page is finished.
9553 page is finished.
9554 page is finished.
9555 page is finished.
9556 page is finished.
9557 page is finished.
9558 page is finished.
9559 page i

9875 page is finished.
9876 page is finished.
9877 page is finished.
9878 page is finished.
9879 page is finished.
9880 page is finished.
9881 page is finished.
9882 page is finished.
9883 page is finished.
9884 page is finished.
9885 page is finished.
9886 page is finished.
9887 page is finished.
9888 page is finished.
9889 page is finished.
9890 page is finished.
9891 page is finished.
9892 page is finished.
9893 page is finished.
9894 page is finished.
9895 page is finished.
9896 page is finished.
9897 page is finished.
9898 page is finished.
9899 page is finished.
9900 page is finished.
9901 page is finished.
9902 page is finished.
9903 page is finished.
9904 page is finished.
9905 page is finished.
9906 page is finished.
9907 page is finished.
9908 page is finished.
9909 page is finished.
9910 page is finished.
9911 page is finished.
9912 page is finished.
9913 page is finished.
9914 page is finished.
9915 page is finished.
9916 page is finished.
9917 page is finished.
9918 page i

In [11]:
if __name__ == "__main__":
    pages_to_scrape = (10000, 14974)   # All remaining pages are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=7, clear_old_data=False)
    winmag_scraper.scrape_site()       # stop at "11964 page is finished."

10001 page is finished.
10002 page is finished.
10003 page is finished.
10004 page is finished.
10005 page is finished.
10006 page is finished.
10007 page is finished.
10008 page is finished.
10009 page is finished.
10010 page is finished.
10011 page is finished.
10012 page is finished.
10013 page is finished.
10014 page is finished.
10015 page is finished.
10016 page is finished.
10017 page is finished.
10018 page is finished.
10019 page is finished.
10020 page is finished.
10021 page is finished.
10022 page is finished.
10023 page is finished.
10024 page is finished.
10025 page is finished.
10026 page is finished.
10027 page is finished.
10028 page is finished.
10029 page is finished.
10030 page is finished.
10031 page is finished.
10032 page is finished.
10033 page is finished.
10034 page is finished.
10035 page is finished.
10036 page is finished.
10038 page is finished.

10039 page is finished.
10040 page is finished.
10041 page is finished.
10042 page is finished.
10043 page is f

10349 page is finished.
10350 page is finished.
10351 page is finished.
10352 page is finished.
10353 page is finished.
10354 page is finished.
10355 page is finished.
10356 page is finished.
10357 page is finished.
10358 page is finished.
10359 page is finished.
10360 page is finished.
10361 page is finished.
10362 page is finished.
10363 page is finished.
10364 page is finished.
10365 page is finished.
10366 page is finished.
10367 page is finished.
10368 page is finished.
10369 page is finished.
10370 page is finished.
10371 page is finished.
10372 page is finished.
10373 page is finished.
10374 page is finished.
10375 page is finished.
10376 page is finished.
10377 page is finished.
10378 page is finished.
10379 page is finished.
10380 page is finished.
10381 page is finished.
10382 page is finished.
10383 page is finished.
10385 page is finished.

10386 page is finished.
10387 page is finished.
10388 page is finished.
10389 page is finished.
10390 page is finished.
10391 page is f

10694 page is finished.
10695 page is finished.
10696 page is finished.
10697 page is finished.
10698 page is finished.
10699 page is finished.
10700 page is finished.
10701 page is finished.
10702 page is finished.
10703 page is finished.
10704 page is finished.
10705 page is finished.
10706 page is finished.
10707 page is finished.
10708 page is finished.
10709 page is finished.
10710 page is finished.
10711 page is finished.
10712 page is finished.
10713 page is finished.
10714 page is finished.
10715 page is finished.
10716 page is finished.
10717 page is finished.
10718 page is finished.
10719 page is finished.
10720 page is finished.
10721 page is finished.
10722 page is finished.
10723 page is finished.
10724 page is finished.
10725 page is finished.
10726 page is finished.
10727 page is finished.
10729 page is finished.

10730 page is finished.
10731 page is finished.
10732 page is finished.
10733 page is finished.
10734 page is finished.
10735 page is finished.
10736 page is f

11040 page is finished.
11041 page is finished.
11042 page is finished.
11043 page is finished.
11044 page is finished.
11045 page is finished.
11046 page is finished.
11047 page is finished.
11048 page is finished.
11049 page is finished.
11050 page is finished.
11051 page is finished.
11052 page is finished.
11053 page is finished.
11054 page is finished.
11055 page is finished.
11056 page is finished.
11057 page is finished.
11058 page is finished.
11059 page is finished.
11060 page is finished.
11061 page is finished.
11062 page is finished.
11063 page is finished.
11064 page is finished.
11065 page is finished.
11066 page is finished.
11067 page is finished.
11068 page is finished.
11069 page is finished.
11070 page is finished.
11071 page is finished.
11072 page is finished.
11073 page is finished.
11074 page is finished.
11075 page is finished.
11076 page is finished.
11077 page is finished.
11078 page is finished.
11079 page is finished.
11080 page is finished.
11081 page is fi

11386 page is finished.
11387 page is finished.
11388 page is finished.
11389 page is finished.
11390 page is finished.
11391 page is finished.
11392 page is finished.
11393 page is finished.
11394 page is finished.
11395 page is finished.
11396 page is finished.
11397 page is finished.
11398 page is finished.
11399 page is finished.
11400 page is finished.
11401 page is finished.
11402 page is finished.
11403 page is finished.
11404 page is finished.
11405 page is finished.
11406 page is finished.
11407 page is finished.
11408 page is finished.
11409 page is finished.
11410 page is finished.
11411 page is finished.
11412 page is finished.
11413 page is finished.
11414 page is finished.
11415 page is finished.
11416 page is finished.
11417 page is finished.
11418 page is finished.
11419 page is finished.
11420 page is finished.
11421 page is finished.
11422 page is finished.
11423 page is finished.
11424 page is finished.
11425 page is finished.
11426 page is finished.
11427 page is fi

11732 page is finished.
11733 page is finished.
11734 page is finished.
11735 page is finished.
11736 page is finished.
11737 page is finished.
11738 page is finished.
11739 page is finished.
11740 page is finished.
11741 page is finished.
11742 page is finished.
11743 page is finished.
11744 page is finished.
11745 page is finished.
11746 page is finished.
11747 page is finished.
11748 page is finished.
11749 page is finished.
11750 page is finished.
11751 page is finished.
11752 page is finished.
11753 page is finished.
11754 page is finished.
11755 page is finished.
11756 page is finished.
11757 page is finished.
11758 page is finished.
11759 page is finished.
11760 page is finished.
11761 page is finished.
11762 page is finished.
11763 page is finished.
11764 page is finished.
11765 page is finished.
11766 page is finished.
11768 page is finished.

11769 page is finished.
11770 page is finished.
11771 page is finished.
11772 page is finished.
11773 page is finished.
11774 page is f

KeyboardInterrupt: 

In [4]:
if __name__ == "__main__":
    pages_to_scrape = (11964, 14974)   # All remaining pages are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=7, clear_old_data=False)
    winmag_scraper.scrape_site()       # stop at "13055 page is finished."

11965 page is finished.
11966 page is finished.
11967 page is finished.
11968 page is finished.
11969 page is finished.
11970 page is finished.
11971 page is finished.
11972 page is finished.
11973 page is finished.
11974 page is finished.
11975 page is finished.
11976 page is finished.
11977 page is finished.
11978 page is finished.
11979 page is finished.
11980 page is finished.
11981 page is finished.
11982 page is finished.
11983 page is finished.
11984 page is finished.
11985 page is finished.
11986 page is finished.
11987 page is finished.
11988 page is finished.
11989 page is finished.
11990 page is finished.
11991 page is finished.
11992 page is finished.
11993 page is finished.
11994 page is finished.
11995 page is finished.
11996 page is finished.
11997 page is finished.
11998 page is finished.
11999 page is finished.
12000 page is finished.
12001 page is finished.
12002 page is finished.
12003 page is finished.
12005 page is finished.

12006 page is finished.
12007 page is f

12315 page is finished.
12316 page is finished.
12317 page is finished.
12318 page is finished.
12319 page is finished.
12320 page is finished.
12321 page is finished.
12322 page is finished.
12323 page is finished.
12324 page is finished.
12325 page is finished.
12326 page is finished.
12327 page is finished.
12328 page is finished.
12329 page is finished.
12330 page is finished.
12331 page is finished.
12332 page is finished.
12333 page is finished.
12334 page is finished.
12335 page is finished.
12336 page is finished.
12337 page is finished.
12338 page is finished.
12339 page is finished.
12340 page is finished.
12341 page is finished.
12342 page is finished.
12343 page is finished.
12344 page is finished.
12345 page is finished.
12346 page is finished.
12347 page is finished.
12348 page is finished.
12349 page is finished.
12350 page is finished.
12351 page is finished.
12352 page is finished.
12353 page is finished.
12354 page is finished.
12355 page is finished.
12356 page is fi

12666 page is finished.

12667 page is finished.
12668 page is finished.
12669 page is finished.
12670 page is finished.
12671 page is finished.
12672 page is finished.
12673 page is finished.
12674 page is finished.
12675 page is finished.
12676 page is finished.
12677 page is finished.
12678 page is finished.
12679 page is finished.
12680 page is finished.
12681 page is finished.
12682 page is finished.
12683 page is finished.
12684 page is finished.
12685 page is finished.
12686 page is finished.
12687 page is finished.
12688 page is finished.
12690 page is finished.

12691 page is finished.
12692 page is finished.
12693 page is finished.
12694 page is finished.
12695 page is finished.
12696 page is finished.
12697 page is finished.
12698 page is finished.
12699 page is finished.
12700 page is finished.
12701 page is finished.
12702 page is finished.
12703 page is finished.
12704 page is finished.
12705 page is finished.
12706 page is finished.
12707 page is finished.
12708 page is 

Encountered error ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
13012 page is finished.
13013 page is finished.
13014 page is finished.
13015 page is finished.
13016 page is finished.
13017 page is finished.
13018 page is finished.
13019 page is finished.
13020 page is finished.
13021 page is finished.
13022 page is finished.
13023 page is finished.
13024 page is finished.
13025 page is finished.
13026 page is finished.
13027 page is finished.
13028 page is finished.
13029 page is finished.
13030 page is finished.
13031 page is finished.
13032 page is finished.
13033 page is finished.
13034 page is finished.
13035 page is finished.
13036 page is finished.
13037 page is finished.
13038 page is finished.
13039 page is finished.
13040 page is finished.
13041 page is finished.
13042 page is finished.
13043 page is finished.
13044 page is finished.
13045 page is finished.
13046 page is finished.
13047 page is finished.
13048 page is finished.
13049 page is fi

KeyboardInterrupt: 

In [4]:
if __name__ == "__main__":
    pages_to_scrape = (13055, 14974)   # All remaining pages are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=4, clear_old_data=False)
    winmag_scraper.scrape_site()           # stop at "13539 page is finished."   

13056 page is finished.
13057 page is finished.
13058 page is finished.
13059 page is finished.
13060 page is finished.
13061 page is finished.
13062 page is finished.
13063 page is finished.
13064 page is finished.
13065 page is finished.
13066 page is finished.
13067 page is finished.
13068 page is finished.
13069 page is finished.
13070 page is finished.
13071 page is finished.
13072 page is finished.
13073 page is finished.
13074 page is finished.
13075 page is finished.
13076 page is finished.
13077 page is finished.
13078 page is finished.
13079 page is finished.
13080 page is finished.
13081 page is finished.
13082 page is finished.
13083 page is finished.
13084 page is finished.
13085 page is finished.
13086 page is finished.
13087 page is finished.
13088 page is finished.
13089 page is finished.
13090 page is finished.
13091 page is finished.
13092 page is finished.
13093 page is finished.
13094 page is finished.
13095 page is finished.
13096 page is finished.
13097 page is fi

13400 page is finished.
13401 page is finished.
13402 page is finished.
13403 page is finished.
13404 page is finished.
13405 page is finished.
13406 page is finished.
13407 page is finished.
13408 page is finished.
13409 page is finished.
13410 page is finished.
13411 page is finished.
13412 page is finished.
13413 page is finished.
13414 page is finished.
13415 page is finished.
13416 page is finished.
13417 page is finished.
13418 page is finished.
13419 page is finished.
13420 page is finished.
13421 page is finished.
13422 page is finished.
13423 page is finished.
13424 page is finished.
13425 page is finished.
13426 page is finished.
13427 page is finished.
13428 page is finished.
13429 page is finished.
13430 page is finished.
13431 page is finished.
13432 page is finished.
13433 page is finished.
13434 page is finished.
13435 page is finished.
13436 page is finished.
13437 page is finished.
13438 page is finished.
13439 page is finished.
13440 page is finished.
13441 page is fi

ConnectionError: HTTPSConnectionPool(host='www.winemag.com', port=443): Max retries exceeded with url: /?s=&drink_type=wine&pub_date_web=2020&page=13175 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f903b52b2e0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [4]:
if __name__ == "__main__":
    pages_to_scrape = (13539, 14974)   # All remaining pages are planned to be crawled totally,.
    winmag_scraper = Scraper(pages_to_scrape=pages_to_scrape, num_jobs=4, clear_old_data=False)
    winmag_scraper.scrape_site()       

13540 page is finished.
13541 page is finished.
13542 page is finished.
13543 page is finished.
13544 page is finished.
13545 page is finished.
13546 page is finished.
13547 page is finished.
13548 page is finished.
13549 page is finished.
13550 page is finished.
13551 page is finished.
13552 page is finished.
13553 page is finished.
13554 page is finished.
13555 page is finished.
13556 page is finished.
13557 page is finished.
13558 page is finished.
13559 page is finished.
13560 page is finished.
13561 page is finished.
13562 page is finished.
13563 page is finished.
13564 page is finished.
13565 page is finished.
13566 page is finished.
13567 page is finished.
13568 page is finished.
13569 page is finished.
13570 page is finished.
13571 page is finished.
13572 page is finished.
13573 page is finished.
13574 page is finished.
13575 page is finished.
13576 page is finished.
13577 page is finished.
13578 page is finished.
13579 page is finished.
13580 page is finished.
13581 page is fi

13883 page is finished.
13884 page is finished.
13885 page is finished.
13886 page is finished.
13888 page is finished.

13889 page is finished.
13890 page is finished.
13891 page is finished.
13892 page is finished.
13893 page is finished.
13894 page is finished.
13895 page is finished.
13896 page is finished.
13897 page is finished.
13898 page is finished.
13899 page is finished.
13900 page is finished.
13901 page is finished.
13902 page is finished.
13903 page is finished.
13904 page is finished.
13905 page is finished.
13906 page is finished.
13907 page is finished.
13908 page is finished.
13909 page is finished.
13910 page is finished.
13911 page is finished.
13912 page is finished.
13913 page is finished.
13914 page is finished.
13915 page is finished.
13916 page is finished.
13917 page is finished.
13918 page is finished.
13919 page is finished.
13920 page is finished.
13921 page is finished.
13922 page is finished.
13923 page is finished.
13924 page is finished.
13925 page is f

14228 page is finished.
14229 page is finished.
14230 page is finished.
14231 page is finished.
14232 page is finished.
14233 page is finished.
14234 page is finished.
14235 page is finished.
14236 page is finished.
14237 page is finished.
14238 page is finished.
14239 page is finished.
14240 page is finished.
14241 page is finished.
14242 page is finished.
14243 page is finished.
14244 page is finished.
14245 page is finished.
14246 page is finished.
14247 page is finished.
14248 page is finished.
14249 page is finished.
14250 page is finished.
14251 page is finished.
14252 page is finished.
14253 page is finished.
14254 page is finished.
14255 page is finished.
14256 page is finished.
14257 page is finished.
14258 page is finished.
14259 page is finished.
14260 page is finished.
14261 page is finished.
14262 page is finished.
14263 page is finished.
14264 page is finished.
14265 page is finished.
14266 page is finished.
14267 page is finished.
14268 page is finished.
14269 page is fi

14572 page is finished.
14573 page is finished.
14574 page is finished.
14575 page is finished.
14576 page is finished.
14577 page is finished.
14578 page is finished.
14579 page is finished.
14580 page is finished.
14581 page is finished.
14582 page is finished.
14583 page is finished.
14584 page is finished.
14585 page is finished.
14586 page is finished.
14587 page is finished.
14588 page is finished.
14589 page is finished.
14590 page is finished.
14591 page is finished.
14592 page is finished.
14593 page is finished.
14594 page is finished.
14595 page is finished.
14596 page is finished.
14597 page is finished.
14598 page is finished.
14599 page is finished.
14600 page is finished.
14601 page is finished.
14602 page is finished.
14603 page is finished.
14604 page is finished.
14605 page is finished.
14606 page is finished.
14607 page is finished.
14608 page is finished.
14609 page is finished.
14610 page is finished.
14611 page is finished.
14612 page is finished.
14613 page is fi

14915 page is finished.
14916 page is finished.
14917 page is finished.
14918 page is finished.
14919 page is finished.
14920 page is finished.
14921 page is finished.
14922 page is finished.
14923 page is finished.
14924 page is finished.
14925 page is finished.
14926 page is finished.
14927 page is finished.
14928 page is finished.
14929 page is finished.
14930 page is finished.
14931 page is finished.
14932 page is finished.
14933 page is finished.
14934 page is finished.
14935 page is finished.
14936 page is finished.
14937 page is finished.
14938 page is finished.
14939 page is finished.
14940 page is finished.
14941 page is finished.
14942 page is finished.
14943 page is finished.
14944 page is finished.
14945 page is finished.
14946 page is finished.
14947 page is finished.
14948 page is finished.
14949 page is finished.
14950 page is finished.
14951 page is finished.
14952 page is finished.
14953 page is finished.
14954 page is finished.
14955 page is finished.
14956 page is fi

In [1]:
import pandas as pd
import json

jsonfile_path ="winemag dataset.json"    # read json file
with open(jsonfile_path, "r", encoding="utf-8") as j_obj:
    json_data = json.load(j_obj)
json_data

[{'points': 89,
  'title': 'Argiolas 2017 Perdera  (Monica di Sardegna)',
  'description': 'Fragrant blue flower, Mediterranean herb and wild berry aromas lead the nose. On the chewy palate, pliant tannins accompany succulent black cherry, blackberry and a ground pepper note.',
  'taster_name': 'Kerin O’Keefe',
  'price': 19.0,
  'variety': 'Red Blends',
  'province': 'Sicily & Sardinia',
  'country': 'Italy',
  'winery': 'Argiolas'},
 {'points': 89,
  'title': 'Argiolas 2018 Costamolino  (Vermentino di Sardegna)',
  'description': 'Spring blossom and yellow stone fruit aromas mingle with whiffs of Mediterranean brush. On the tangy palate, bright acidity accompanies apricot, green melon and a hint of chopped mint before a white-almond close.',
  'taster_name': 'Kerin O’Keefe',
  'price': 19.0,
  'variety': 'Vermentino',
  'province': 'Sicily & Sardinia',
  'country': 'Italy',
  'winery': 'Argiolas'},
 {'points': 89,
  'title': 'Bushong 2017 Power, Corruption & Lies Red (Paso Robles)',


In [2]:
import csv
import codecs

f = open('winemag dataset.json')
data = json.load(f)
f.close()
 
f = codecs.open('winemag dataset.csv', 'w')
writer = csv.writer(f);
writer.writerow(json_data[0].keys())

for item in data:
    writer.writerow(item.values())     # save to csv file
f.close()

df = pd.read_csv('winemag dataset.csv')
df   # check the dataset

Unnamed: 0,points,title,description,taster_name,price,variety,province,country,winery
0,89,Argiolas 2017 Perdera (Monica di Sardegna),"Fragrant blue flower, Mediterranean herb and w...",Kerin O’Keefe,19.0,Red Blends,Sicily & Sardinia,Italy,Argiolas
1,89,Argiolas 2018 Costamolino (Vermentino di Sard...,Spring blossom and yellow stone fruit aromas m...,Kerin O’Keefe,19.0,Vermentino,Sicily & Sardinia,Italy,Argiolas
2,89,"Bushong 2017 Power, Corruption & Lies Red (Pas...","Deep, dark and intense aromas of blackberry me...",Matt Kettmann,43.0,Rhône-style Red Blend,California,US,Bushong
3,89,Buttonwood Grove 2017 Riesling (Finger Lakes),Honey-drizzled pear and peach meld with a bit ...,Alexander Peartree,16.0,Riesling,New York,US,Buttonwood Grove
4,89,Casa Silva 2018 Los Lingues Vineyard Estate Gr...,Leathery plum and berry aromas show some oak i...,Michael Schachner,20.0,Cabernet Sauvignon,Colchagua Valley,Chile,Casa Silva
...,...,...,...,...,...,...,...,...,...
147304,90,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Notes of honeysuckle and cantaloupe sweeten th...,Anna Lee C. Iijima,28.0,Riesling,Mosel,Germany,Dr. H. Thanisch (Erben Müller-Burggraef)
147305,90,Citation 2004 Pinot Noir (Oregon),Citation is given as much as a decade of bottl...,Paul Gregutt,75.0,Pinot Noir,Oregon,US,Citation
147306,90,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Well-drained gravel soil gives this wine its c...,Roger Voss,30.0,Gewürztraminer,Alsace,France,Domaine Gresser
147307,90,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),"A dry style of Pinot Gris, this is crisp with ...",Roger Voss,32.0,Pinot Gris,Alsace,France,Domaine Marcel Deiss
