## Libraries

In [1]:
# imports and setup 
from bs4 import BeautifulSoup
# you can use either of these libraries to get html from a website
import requests
import urllib.request

import pickle
import re
import time

import pandas as pd
import scipy as sc
import numpy as np

import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

## Parameters

In [2]:
# Shared HTTP session; the (currently commented-out) scraping loops call
# session.get(..., headers=HEADERS) on it.
session = requests.Session()
# Browser-like user-agent header passed with each scraping request.
HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36')
}
# Inclusive range of result pages: the scrape loop iterates
# range(first_page, last_page + 1).
first_page = 1
last_page = 10
# Base search URL; the page number is appended as a string to build each request URL.
results_url = "https://www.winemag.com/?s=&drink_type=wine&page="

## Scrape web pages to find URLs for reviews

In [26]:
# raw_pages = []
# for i in range(first_page ,last_page + 1 ):
#     url = results_url + str(i)
#     response =session.get(url, headers=HEADERS)
#     raw_pages.append(response)
#     time.sleep(5) 

In [27]:
#pickle.dump( raw_pages, open( "raw_pages.p", "wb" ) )

## Read in results from a previous run

In [3]:
# Load the result-page responses saved to "raw_pages.p" by an earlier run of
# the scraping cell, instead of hitting the web site again.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a "raw_pages.p" that this notebook produced itself.
# The context manager guarantees the file handle is closed after loading.
with open("raw_pages.p", "rb") as raw_pages_file:
    raw_pages = pickle.load(raw_pages_file)

## Convert raw pages

In [4]:
# Parse the first (last_page - first_page + 1) cached responses into
# BeautifulSoup trees, one per result page.
page_count = last_page - first_page + 1
soup_pages = [
    BeautifulSoup(raw_pages[page_idx].content, 'html.parser')
    for page_idx in range(page_count)
]

## Get URLs for reviews

In [5]:
# Collect the href attribute of every anchor found under a ".review-item"
# element, across all parsed result pages (page order, then document order).
raw_review_urls = [
    anchor.get("href")
    for results_soup in soup_pages
    for anchor in results_soup.select(".review-item a")
]

## Clean URLs

In [6]:
# Keep only links that point at a buying-guide review page.
# Anchors without an href yield None from .get("href"); the truthiness check
# skips those instead of letting the match raise a TypeError.
# A literal prefix test replaces the regex, whose unescaped dots matched any
# character rather than only ".".
clean_review_urls = [
    my_url
    for my_url in raw_review_urls
    if my_url and my_url.startswith('https://www.winemag.com/buying-guide/')
]

In [7]:
clean_review_urls

['https://www.winemag.com/buying-guide/chateau-margaux-2015-margaux/',
 'https://www.winemag.com/buying-guide/chateau-ausone-2015-saint-emilion/',
 'https://www.winemag.com/buying-guide/chateau-cheval-blanc-2015-saint-emilion/',
 'https://www.winemag.com/buying-guide/chateau-lafite-rothschild-2015-pauillac/',
 'https://www.winemag.com/buying-guide/chateau-petrus-2015-pomerol/',
 'https://www.winemag.com/buying-guide/chateau-leoville-las-cases-2015-saint-julien/',
 'https://www.winemag.com/buying-guide/chateau-palmer-2015-margaux/',
 'https://www.winemag.com/buying-guide/chateau-mouton-rothschild-2015-pauillac/',
 'https://www.winemag.com/buying-guide/shafer-2013-hillside-select-cabernet-sauvignon-stags-leap-district/',
 'https://www.winemag.com/buying-guide/arpepe-2007-sassella-riserva-rocce-rosse-valtellina-superiore/',
 'https://www.winemag.com/buying-guide/chateau-valandraud-2015-saint-emilion/',
 'https://www.winemag.com/buying-guide/quinta-do-vale-meao-2015-red-douro/',
 'https://

## Get Reviews

In [52]:
# raw_review_pages = []
# for url_i in clean_review_urls:
#     response =session.get(url_i, headers=HEADERS)
#     raw_review_pages.append(response)
#     time.sleep(5) 

In [53]:
#pickle.dump( raw_review_pages, open( "raw_review_pages.p", "wb" ) )

## Read in results from a previous run

In [8]:
# Load the review-page responses saved to "raw_review_pages.p" by an earlier
# run of the scraping cell, instead of hitting the web site again.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a "raw_review_pages.p" that this notebook produced itself.
# The context manager guarantees the file handle is closed after loading.
with open("raw_review_pages.p", "rb") as raw_review_pages_file:
    raw_review_pages = pickle.load(raw_review_pages_file)

## Turn responses into soup

In [9]:
# Parse each cached review response into a BeautifulSoup tree, keeping the
# same order as raw_review_pages.
soup_review_pages = [
    BeautifulSoup(review_response.content, 'html.parser')
    for review_response in raw_review_pages
]

## Put some structure on the data

In [11]:
# Build one record per review, in this order:
# [url, title, points, description, primary labels, primary values,
#  secondary labels, secondary values, taster].
#
# zip() silently stops at the shorter input, so if the cached review pages and
# the recomputed URL list ever differ in length, reviews would be dropped
# without warning. Check the lengths up front.
assert len(clean_review_urls) == len(soup_review_pages), (
    "clean_review_urls and soup_review_pages differ in length: "
    "{} != {}".format(len(clean_review_urls), len(soup_review_pages))
)

structure_reviews = []
for url, soup_review_page in zip(clean_review_urls, soup_review_pages):
    # Single-valued fields: [0] takes the first match, so a page that lacks
    # the element raises IndexError here.
    title = soup_review_page.select(".heading-area .article-title")[0].text
    points = soup_review_page.select(".rating #points")[0].text
    description = soup_review_page.select(".description")[0].text
    # Label/value rows are kept as the lists returned by select(); no text is
    # extracted from them at this stage.
    primary_info_label = soup_review_page.select(".primary-info .row .info-label span")
    primary_info = soup_review_page.select(".primary-info .row .info")
    secondary_info_label = soup_review_page.select(".secondary-info .row .info-label span")
    secondary_info = soup_review_page.select(".secondary-info .row .info")
    taster = soup_review_page.select(".taster .name")[0].text
    structure_reviews.append([url, title, points, description, primary_info_label,
                              primary_info, secondary_info_label,
                              secondary_info, taster])

In [12]:
structure_reviews

[['https://www.winemag.com/buying-guide/chateau-margaux-2015-margaux/',
  'Château Margaux 2015  Margaux',
  '100',
  "Predominantly Cabernet Sauvignon, this wine shows a wonderful black-currant purity on the palate, along with intense, vibrant acidity. The background is all tannin, which speaks to its aging potential. This wine is the last vintage produced by Paul Pontallier, who was general manager from 1990 until his death in 2016. It's a memorable wine and one for aging. Drink from 2027.",
  [<span>Price</span>,
   <span>Variety</span>,
   <span>Appellation</span>,
   <span>Winery</span>],
  [<div class="info medium-9 columns">
   <span><span>N/A,  <a href="http://www.wine-searcher.com/find/ch%C3%A2teau+margaux+margaux+bordeaux-style+red+blend/2015/USA/USD/?referring_site=WEM" target="blank">Buy Now</a></span></span>
   </div>, <div class="info medium-9 columns">
   <span><a href="https://www.winemag.com/?s=Bordeaux-style%20Red%20Blend">Bordeaux-style Red Blend</a></span>
   </div>