# Web Scraping

In [41]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import csv

In [52]:
# Web Scraping Wine.com
# Listing-page URL template; the `{}` placeholder is filled with the page number via str.format.
base_url = 'https://www.wine.com/list/wine/7155/{}?showOutOfStock=true&sortBy=mostInteresting'

# Function to scrape a single wine page from wine.com and return its data as a list of dicts (one per review)
def scrape_winecom_review(review_href):
    """Scrape one wine.com product page into a list of record dictionaries.

    Parameters
    ----------
    review_href : str
        Site-relative link of the product page; it is appended to
        ``https://www.wine.com``.

    Returns
    -------
    list of dict
        At least one record. When the page carries several review blocks,
        one record is returned per review; the records share every wine
        field and differ only in ``Review``. Fields that cannot be located
        on the page are left as ``np.nan``.
    """
    url = 'https://www.wine.com' + review_href
    # The context manager releases the pooled connections once the page has
    # been fetched instead of abandoning a fresh Session on every call.
    with requests.Session() as http:
        response = http.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(tag_name, attrs):
        # Text of the first matching tag, or NaN when the tag is absent.
        tag = soup.find(tag_name, attrs)
        return np.nan if tag is None else tag.get_text()

    # Defining the data structure (every record carries the same keys)
    data = {'Name': text_of('h1', {'class': 'pipName', 'itemprop': 'name'}),
            'Variety': np.nan,
            'Origin': np.nan,
            'Attr_1': np.nan,
            'Attr_2': np.nan,
            'Attr_3': np.nan,
            'Attr_4': np.nan,
            'Alcohol_vol': text_of('span', {'class': 'prodAlcoholVolume_text'}),
            'Alcohol_percentage': text_of('span', {'class': 'prodAlcoholPercent_percent'}),
            'Winemaker_notes': np.nan,
            'Review': np.nan,
            'Avg_rating': text_of('span', {'class': 'averageRating_average', 'itemprop': 'ratingValue'}),
            'N_ratings': text_of('span', {'class': 'averageRating_number', 'itemprop': 'ratingCount'}),
            'Price_Out-of-stock': np.nan,
            'Price': np.nan}

    # Variety and origin: the first two origin links, in that order. zip stops
    # early, so a page with a single link still keeps its variety.
    origin_links = soup.find_all('a', {'class': 'pipOrigin_link'})
    for field, link in zip(('Variety', 'Origin'), origin_links):
        data[field] = link.get_text()

    # Product attributes: gather the title of every <li> in every attribute
    # list. The previous code indexed the <li> items by the index of the <ul>,
    # which dropped attributes. Titles are de-duplicated in case the page
    # repeats the same list (NOTE(review): assumed page layout - confirm).
    titles = []
    for attr_list in soup.find_all('ul', {'class': 'prodAttr', 'aria-label': 'product attributes'}):
        for item in attr_list.find_all('li'):
            title = item.get('title')
            if title is not None and title not in titles:
                titles.append(title)
    for slot, title in enumerate(titles[:4], start=1):  # only Attr_1..Attr_4 exist
        data['Attr_{}'.format(slot)] = title

    # Winemaker notes: all paragraphs joined with single spaces
    notes = soup.find('div', {'class': 'viewMoreModule_text'})
    if notes is not None:
        data['Winemaker_notes'] = ' '.join(p.get_text() for p in notes.find_all('p'))

    # Price out of stock: second space-separated token, stripped of '$' and ')'
    try:
        sold_out = soup.find('span', {'class': 'prodItemStock_soldOut-smallText'})
        data['Price_Out-of-stock'] = sold_out.get_text().split(' ')[1].strip('$').strip(')')
    except (AttributeError, IndexError):
        # Tag missing, or its text does not contain a second token.
        pass

    # Current price: whole and fractional parts live in separate spans
    whole = soup.find('span', {'class': 'prodItemStock_soldOut-vintagePriceWhole'})
    fractional = soup.find('span', {'class': 'prodItemStock_soldOut-vintagePriceFractional'})
    if whole is not None and fractional is not None:
        data['Price'] = whole.get_text() + '.' + fractional.get_text()

    # Reviews are handled last so that every extra record is a copy of the
    # fully populated base record; each additional review gets its own record.
    reviews_text = [r.get_text() for r in soup.find_all('div', {'class': 'pipSecContent_copy'})]
    if reviews_text:
        data['Review'] = reviews_text[0]
    data_list = [data]
    data_list.extend(dict(data, Review=extra) for extra in reviews_text[1:])

    return data_list


def save_data_to_csv(list_dicts, filepath):
    """Append scraped records to a CSV file, writing the header first if the file is empty.

    Parameters
    ----------
    list_dicts : iterable of dict
        Records keyed by the scraped field names. Missing keys are written as
        empty cells; a key outside the field list makes ``csv.DictWriter``
        raise ``ValueError``.
    filepath : str
        Destination CSV file; created when it does not exist.
    """
    # Variables scraped from the website
    field_names = ['Name', 'Variety', 'Origin', 'Attr_1', 'Attr_2', 'Attr_3', 'Attr_4', 'Alcohol_vol', 'Alcohol_percentage', 'Winemaker_notes', 'Review',
                   'Avg_rating', 'N_ratings', 'Price_Out-of-stock', 'Price']

    # Explicit utf-8: the scraped text contains accented characters, and the
    # platform default encoding is not guaranteed to represent them or to match
    # what the file is later read back with.
    with open(filepath, 'a+', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)

        # Rewind and probe one character to find out whether the file is
        # still empty; only then is the header needed.
        csvfile.seek(0)
        if not csvfile.read(1):
            csvwriter.writeheader()

        # Write the data (the `with` block closes the file on exit)
        csvwriter.writerows(list_dicts)


def get_review_links_winecom(base_url, pages_to_scrape):
    """Collect the product-page links found on listing pages 1..pages_to_scrape.

    Parameters
    ----------
    base_url : str
        Listing URL template with one ``{}`` placeholder for the page number.
    pages_to_scrape : int
        Number of listing pages to visit, starting at page 1.

    Returns
    -------
    list of str
        The ``href`` of every product anchor, in page order.
    """
    links = []

    # One session for the whole crawl: it is reused for every page and closed
    # when the block exits, instead of creating an unclosed Session per page.
    with requests.Session() as http:
        for page_num in range(1, pages_to_scrape + 1):
            response = http.get(base_url.format(page_num))
            soup = BeautifulSoup(response.text, 'html.parser')
            anchors = soup.find_all('a', {'class':'listGridItemName event_productClick productNoShowPrice'})
            # Skip anchors without an href: a None link would break the URL
            # concatenation performed when the review page is fetched.
            links.extend(href for href in (a.get('href') for a in anchors) if href)
    return links


def scrape_winecom(base_url, pages_to_scrape, filepath):
    """Scrape every linked review sequentially, append each to ``filepath`` and load the CSV.

    The links come from ``get_review_links_winecom``; each page is scraped with
    ``scrape_winecom_review`` and stored right away with ``save_data_to_csv``.
    Returns the CSV content as a DataFrame read back with ``pd.read_csv``.
    """
    review_links = get_review_links_winecom(base_url, pages_to_scrape)
    for position, href in enumerate(review_links):
        records = scrape_winecom_review(href)
        save_data_to_csv(records, filepath)
        # Progress marker every 1000 links
        if not position % 1000:
            print(position)
    return pd.read_csv(filepath)

## GPT Optimized code

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import numpy as np
import concurrent.futures  # For multithreading

# Listing-page URL template; `{}` is replaced with the page number via str.format.
BASE_URL = 'https://www.wine.com/list/wine/7155/{}?showOutOfStock=true&sortBy=mostInteresting'
# Output file that save_data_to_csv appends to.
CSV_FILE = 'wine_data.csv'
# CSV column order; also the key set every scraped record is pre-filled with.
FIELD_NAMES = ['Name', 'Variety', 'Origin', 'Attr_1', 'Attr_2', 'Attr_3', 'Attr_4',
               'Alcohol_vol', 'Alcohol_percentage', 'Winemaker_notes', 'Review',
               'Avg_rating', 'N_ratings', 'Price_Out-of-stock', 'Price']

# Create a session and reuse it for requests
session = requests.Session()

def scrape_single_review(review_href):
    """Fetch one product page and return its record, or ``None`` when the fetch fails.

    The record holds every name in ``FIELD_NAMES``, initialised to ``np.nan``;
    only ``Name`` is filled in by this function.
    """
    url = 'https://www.wine.com' + review_href
    try:
        page = session.get(url)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    except (requests.RequestException, ValueError, AttributeError):
        # Handle errors gracefully: the caller skips pages that return None
        return None

    record = dict.fromkeys(FIELD_NAMES, np.nan)

    # Leave the name as NaN when the heading is not on the page
    heading = soup.find('h1', {'class':'pipName', 'itemprop':'name'})
    if heading is not None:
        record['Name'] = heading.get_text()

    # Other data extraction code...

    return record

def save_data_to_csv(data_list):
    """Append ``data_list`` to ``CSV_FILE``, emitting the header when the file is still empty."""
    with open(CSV_FILE, 'a+', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=FIELD_NAMES)

        # Rewind and probe a single character to detect an empty file
        out.seek(0)
        file_is_empty = out.read(1) == ''
        if file_is_empty:
            writer.writeheader()

        writer.writerows(data_list)

def scrape_winecom_page(page_num):
    """Scrape every review linked from listing page ``page_num``.

    Returns the list of records, in link order, leaving out reviews for which
    ``scrape_single_review`` returned nothing.
    """
    hrefs = get_review_links_winecom(BASE_URL, page_num)
    scraped = (scrape_single_review(href) for href in hrefs)
    return [record for record in scraped if record]

def get_review_links_winecom(base_url, page_num):
    """Return the ``href`` of every product anchor on one listing page.

    Raises whatever ``raise_for_status`` raises when the page request fails.
    """
    page = session.get(base_url.format(page_num))
    page.raise_for_status()
    anchors = BeautifulSoup(page.text, 'html.parser').find_all(
        'a', {'class':'listGridItemName event_productClick productNoShowPrice'})
    return [anchor.get('href') for anchor in anchors]

def main():
    """Scrape the listing pages concurrently and append the results to the CSV file.

    Pages are scraped by a thread pool and written page by page, in page
    order. Rows that were already scraped are therefore on disk even if a
    later page fails, and a listing page whose request fails is reported and
    skipped instead of aborting the run with nothing saved.
    """
    pages_to_scrape = 1000  # Adjust the number of pages as needed
    concurrent_requests = 10  # Adjust the number of concurrent requests

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        # Submit every page up front; results are collected in submission
        # order so the CSV keeps the page order.
        futures = [executor.submit(scrape_winecom_page, page_num)
                   for page_num in range(1, pages_to_scrape + 1)]

        for page_num, future in enumerate(futures, start=1):
            try:
                page_data = future.result()
            except requests.RequestException as exc:
                print(f'Skipping page {page_num}: {exc}')
                continue

            # Save data to CSV (called even for an empty page so that the
            # header is still written to a new file)
            save_data_to_csv(page_data)

if __name__ == '__main__':
    main()

## GPT Parallel Computation

In [21]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import time

# Function to scrape a single review from wine.com and put data into a dictionary
def scrape_winecom_review(review_href):
    """Scrape one wine.com product page into a single record dictionary.

    Parameters
    ----------
    review_href : str
        Site-relative link of the product page; it is appended to
        ``https://www.wine.com``.

    Returns
    -------
    dict
        A record with a fixed key set. Fields that cannot be located on the
        page stay ``np.nan``. Only the first review block of the page is kept.

    Notes
    -----
    The request goes through the module-level ``session`` object.
    """
    url = 'https://www.wine.com' + review_href
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Predefine a data structure to not have anomalies in each record
    data = {
        'Name': np.nan,
        'Variety': np.nan,
        'Origin': np.nan,
        'Attr_1': np.nan,
        'Attr_2': np.nan,
        'Alcohol_vol': np.nan,
        'Alcohol_percentage': np.nan,
        'Winemaker_notes': np.nan,
        'Review': np.nan,
        'Avg_rating': np.nan,
        'N_ratings': np.nan,
        'Price_Out-of-stock': np.nan,
        'Price': np.nan
    }

    # Fields that are simply the text of the first matching tag
    simple_fields = (
        ('Name', 'h1', {'class': 'pipName', 'itemprop': 'name'}),
        ('Alcohol_vol', 'span', {'class': 'prodAlcoholVolume_text'}),
        ('Alcohol_percentage', 'span', {'class': 'prodAlcoholPercent_percent'}),
        ('Avg_rating', 'span', {'class': 'averageRating_average', 'itemprop': 'ratingValue'}),
        ('N_ratings', 'span', {'class': 'averageRating_number', 'itemprop': 'ratingCount'}),
        ('Review', 'div', {'class': 'pipSecContent_copy'}),  # most important (first) review
    )
    for field, tag_name, attrs in simple_fields:
        tag = soup.find(tag_name, attrs)
        if tag is not None:
            data[field] = tag.get_text()

    # Get wine variety and origin: the first two origin links, in that order
    origin_links = soup.find_all('a', {'class': 'pipOrigin_link'})
    for field, link in zip(('Variety', 'Origin'), origin_links):
        data[field] = link.get_text()

    # Get attributes. Titles are gathered from every <li> of every attribute
    # list; the previous code indexed the <li> items by the index of the <ul>,
    # which dropped attributes. Titles are de-duplicated in case the page
    # repeats the same list (NOTE(review): assumed page layout - confirm).
    titles = []
    for attr_list in soup.find_all('ul', {'class': 'prodAttr', 'aria-label': 'product attributes'}):
        for item in attr_list.find_all('li'):
            title = item.get('title')
            if title is not None and title not in titles:
                titles.append(title)
    # Only fill the attribute columns the record defines: an extra key such as
    # 'Attr_3' is not among the CSV field names and would make the writer fail.
    attr_slots = [key for key in data if key.startswith('Attr_')]
    for slot, title in zip(attr_slots, titles):
        data[slot] = title

    # Get winemaker notes: all paragraphs joined with single spaces
    notes = soup.find('div', {'class': 'viewMoreModule_text'})
    if notes is not None:
        data['Winemaker_notes'] = ' '.join(wn.get_text() for wn in notes.find_all('p'))

    # Get price out of stock for the wine: second space-separated token,
    # stripped of '$' and ')'
    try:
        sold_out = soup.find('span', {'class': 'prodItemStock_soldOut-smallText'})
        data['Price_Out-of-stock'] = sold_out.get_text().split(' ')[1].strip('$').strip(')')
    except (AttributeError, IndexError):
        # Tag missing, or its text does not contain a second token.
        pass

    # Get the actual price of the wine when available (whole and fractional
    # parts live in separate spans)
    whole = soup.find('span', {'class': 'prodItemStock_soldOut-vintagePriceWhole'})
    fractional = soup.find('span', {'class': 'prodItemStock_soldOut-vintagePriceFractional'})
    if whole is not None and fractional is not None:
        data['Price'] = whole.get_text() + '.' + fractional.get_text()

    # Return the data dictionary
    return data

# Function to save a batch of data to CSV
def save_data_to_csv(batch, csv_filepath):
    """Append a batch of record dicts to ``csv_filepath`` as UTF-8 CSV rows.

    The header row is written first when the file is still empty. Keys missing
    from a record are written as empty cells.
    """
    # Pre-defined data structure (column order of the output file)
    columns = [
        'Name', 'Variety', 'Origin', 'Attr_1', 'Attr_2',
        'Alcohol_vol', 'Alcohol_percentage', 'Winemaker_notes', 'Review',
        'Avg_rating', 'N_ratings', 'Price_Out-of-stock', 'Price',
    ]

    # Append+read mode so the file can be probed before anything is written
    with open(csv_filepath, 'a+', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)

        # Rewind and read one character: nothing there means a brand-new file
        out.seek(0)
        needs_header = out.read(1) == ''
        if needs_header:
            writer.writeheader()

        # Write the batch of data; leaving the `with` block closes the file
        writer.writerows(batch)

# Function to scrape wine.com reviews in parallel using a thread pool
def scrape_winecom_parallel(base_url, csv_filepath, txt_filepath, get_links=True, pages_to_scrape=(1,1), num_threads=4, batch_size=1000):
    """Scrape every linked review with a thread pool and append the records to a CSV file.

    Parameters
    ----------
    base_url : str
        Listing URL template, forwarded to ``get_review_links_winecom``.
    csv_filepath : str
        CSV file the records are appended to and that is finally read back.
    txt_filepath : str
        Text file holding one review link per line.
    get_links : bool
        When true, the link file is (re)generated first; otherwise the
        existing file is read as-is.
    pages_to_scrape : tuple of (int, int)
        First and last listing page, forwarded to the link gathering step.
    num_threads : int
        Number of worker threads.
    batch_size : int
        Number of links scraped, and then saved, per batch.

    Returns
    -------
    pandas.DataFrame
        The content of ``csv_filepath`` after the run.

    Notes
    -----
    Each batch is written to disk as soon as it has been scraped, so the
    records are not all held in memory and an interrupted run keeps the
    batches finished so far. The rows end up in the same order as the links.
    """
    start_time = time.time() # Get start time

    # Execute function to get all the links of the desired pages
    if get_links:
        get_review_links_winecom(base_url, txt_filepath, pages_to_scrape=pages_to_scrape)

    # Get all links
    links = read_review_links_from_file(txt_filepath)

    elapsed_time = time.time() - start_time # Get total time
    print('Time to gather all the links for {} scraped pages:'.format(pages_to_scrape), elapsed_time)
    print('Number of links recollected:', len(links))

    if not links:
        # Nothing to scrape: still make sure a (header-only) CSV exists so the
        # final read has a file to open.
        save_data_to_csv([], csv_filepath)

    last_index = len(links) - 1

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        current_batch = []
        batch_count = 0
        window_start = time.time()  # Start of the current 1000-batch timing window

        for i, link in enumerate(links):
            current_batch.append(link)

            # When the batch size is reached or we have processed all links, scrape and save the batch
            if len(current_batch) == batch_size or i == last_index:
                batch_count += 1

                # list() waits for the whole batch; results keep the link order
                scraped = list(executor.map(scrape_winecom_review, current_batch))
                save_data_to_csv(scraped, csv_filepath)
                current_batch = []

                # Report the time spent on the last 1000 batches, then restart the window
                if batch_count % 1000 == 0:
                    now = time.time()
                    elapsed_time = now - window_start
                    print(f"Processed 1000 batches in {elapsed_time:.2f} seconds")  # Print the elapsed time
                    window_start = now

            # Progress marker every 1000 links
            if i % 1000 == 0:
                print(i)

    return pd.read_csv(csv_filepath, encoding='utf-8')

# Function to get review links from wine.com
def get_review_links_winecom(base_url, txt_filepath, pages_to_scrape=(1,1)):
    """Write the review links of a range of listing pages to a text file, one per line.

    ``pages_to_scrape`` is an inclusive ``(first, last)`` pair of page numbers.
    The file at ``txt_filepath`` is overwritten. Requests go through the
    module-level ``session`` object.
    """
    # Open the text file in write mode
    with open(txt_filepath, 'w') as link_file:
        page_numbers = range(pages_to_scrape[0], pages_to_scrape[1] + 1)
        for page_num in page_numbers:
            page = session.get(base_url.format(page_num))
            anchors = BeautifulSoup(page.text, 'html.parser').find_all(
                'a', {'class': 'listGridItemName event_productClick productNoShowPrice'})

            # Write the href of each wine review, newline-terminated
            link_file.writelines(anchor.get('href') + '\n' for anchor in anchors)
                
# Function to read review links from a text file
def read_review_links_from_file(txt_filepath):
    """Return the lines of ``txt_filepath`` as a list, each stripped of surrounding whitespace."""
    with open(txt_filepath, 'r') as link_file:
        # strip() also removes the trailing newline of every line
        return [raw_line.strip() for raw_line in link_file]

In [23]:
# Attempt
# Listing-page URL template; `{}` is the page-number placeholder.
base_url = 'https://www.wine.com/list/wine/7155/{}?showOutOfStock=true&sortBy=mostInteresting'
pages_to_scrape = (1,10_000)  # Adjust as needed
csv_filepath = 'wine_reviews.csv'
txt_filepath = 'reviews.txt'

# Create a session for making requests
# (module-level: scrape_winecom_review and get_review_links_winecom read it as a global)
session = requests.Session()

# Create new DataFrame with data
# get_links=False: the link gathering step is skipped and the links are read from txt_filepath as-is.
df = scrape_winecom_parallel(base_url, csv_filepath, txt_filepath, get_links=False, pages_to_scrape=pages_to_scrape,
                            num_threads=6, batch_size=1000)

df

Time to gather all the links for (1, 10000) scraped pages: 0.10726213455200195
Number of links recollected: 208199
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
1420

Unnamed: 0,Name,Variety,Origin,Attr_1,Attr_2,Alcohol_vol,Alcohol_percentage,Winemaker_notes,Review,Avg_rating,N_ratings,Price_Out-of-stock,Price
0,Chateau Du Caillau Cahors 2021,Malbec,"Cahors, Southwest, France",Red Wine,Green Wine,750.0,13.0,Deep intense ruby color. Ripe fruit on the nos...,COMMENTARY: The 2021 Chateau du Caillau is gen...,5.0,19.0,13.99,21.99
1,Coates and Seely Brut Reserve,Non-Vintage Sparkling Wine,England,Sparkling & Champagne,,750.0,12.0,"A blend of Chardonnay and Pinot Noir, with res...",There's no rule that English sparkling wines m...,4.4,33.0,41.99,
2,Emilio Moro Polvorete 2022,Godello,"Bierzo, Spain",White Wine,,750.0,13.5,,"A round and balanced white with green apples, ...",4.8,14.0,17.99,19.99
3,Bodegas La Purisma Old Vines Red Blend 2019,Other Red Blends,"Yecla, Spain",Red Wine,,750.0,14.5,"La Purisima has a gorgeous black cherry color,...","An excellent blend with 85% monastrell, 10% sy...",4.4,56.0,10.99,
4,La Chapelle du Bastion Picpoul de Pinet 2022,Picpoul,"Picpoul de Pinet, Languedoc, South of France, ...",White Wine,Screw Cap,750.0,12.5,Blend: 100% Picpoul Blanc,COMMENTARY: The 2022 Le Chapelle du Bastion Pi...,4.2,91.0,11.99,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208194,DuMOL Maranet Pinot Noir 2010,Pinot Noir,"Russian River, Sonoma County, California",Red Wine,,750.0,0.0,,,,,34.99,
208195,La Spinetta Langhe Nebbiolo 2012,Nebbiolo,"Piedmont, Italy",Red Wine,,750.0,14.0,,,,,26.99,29.99
208196,Domaine Michel Juillot Corton-Charlemagne Gran...,Chardonnay,"Cote de Beaune, Cote d'Or, Burgundy, France",White Wine,,750.0,0.0,,,,,116.99,
208197,Chatom Semillon 2011,Semillon,"Sierra Foothills, California",White Wine,,750.0,0.0,,,,,17.99,
