# This notebook shows how to extract data from the page:
https://www.internationalwinechallenge.com/canopy/search_results?wpcat=WineTab.S&Challenge_Year=2025_993276&Medal=&Score=&Vintage=&Style=&Country=&PrimGrpq=&Region=&Prodq=&WinNamq=

The page mentioned above is the first page of results. Results are paginated, with a total of 20 wines displayed per page, so our process is going to be: 
*Extract the first page* -- after 20 wines --> *Move to the next page* 
To get the next page we can use this link: https://www.internationalwinechallenge.com/canopy/search_results?page=2&wpcat=WineTab.S&Challenge_Year=2025_993276
and just increase the page number in the URL.


In [None]:
import bs4
import lxml
import pandas as pd
import urllib
import re

from urllib import request

ERROR! Session/line number was not unique in database. History logging moved to new session 128


### Downloading the page via URLLIB

In [None]:
# URL of the first search-results page.
jo = "https://www.internationalwinechallenge.com/canopy/search_results?wpcat=WineTab.S&Challenge_Year=2025_993276&Medal=&Score=&Vintage=&Style=&Country=&PrimGrpq=&Region=&Prodq=&WinNamq="
# The request is sent with a browser-like User-Agent header.
req = request.Request(jo, headers={"User-Agent": "Mozilla/5.0"})

# Read the body inside a context manager so the HTTP response is closed as soon
# as the bytes are in memory; the timeout keeps a stalled connection from
# hanging the kernel indefinitely.
with request.urlopen(req, timeout=30) as resp:
    request_text = resp.read()

# Sanity checks: peek at the first raw bytes, then at the parsed <title>.
print(request_text[:1000])
page = bs4.BeautifulSoup(request_text, "lxml")
print(page.find("title"))

b'<!DOCTYPE  html>\n<html lang="en"><head>\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:site" content="@WineChallenge" />\n<meta property="og:site_name" content="IWC" />\n<title>International Wine Challenge - The most influential wine competition in the World</title>\n<meta property="og:title" content="International Wine Challenge - The most influential wine competition in the World" />\n<meta name="twitter:title" content="International Wine Challenge - The most influential wine competition in the World" />\n<meta itemprop="inLanguage" http-equiv="content-language" content="en" />\n<meta http-equiv="language" content="en" />\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n<script src="//admin.webpuzzleapp.com/js/jquery/1.12.2/jquery.min.js" type="text/javascript"></script>\n<meta name="viewport" content="initial-scale=1.0,width=device-width" /><link rel="stylesheet" href="https://use

### Processing the page to obtain the items displayed
Normally a page should have 20 items, and this notebook has been tested based on that assumption.

In [None]:
# Read the "Displaying N results" header to learn how many wines the page lists.
# total_results is initialised up front so the name is always defined, even
# when the header is missing or does not contain a number.
total_results = None
results_header = page.find("h2", class_="displaying")
if results_header:
    results_text = results_header.get_text(strip=True)
    # Extract the number, e.g. "Displaying 20 results"
    match = re.search(r"Displaying\s+(\d+)", results_text)
    if match:
        total_results = int(match.group(1))
print("Total results:", total_results)

Total results: 20


### Wine page to display how we are getting information
from IPython.display import HTML, display

wine_page_image = HTML('<img src="Wine_scrape.png" width="600">')  # local screenshot, rendered 600px wide
display(wine_page_image)

Notes:
- The H1 title always has the name of the wine, but can also include the year or the NV flag. NV (non-vintage) marks a blend of wines, so it's normally impossible to define a single year of production. Check the regex tests at: https://regexr.com/8hfrf

- For any image displayed on the page, we keep just its link, so we can use it afterwards without storage problems.

- The `info` field used when building the dataset holds the information about the wine: *produced by*, *country*, *grape*, *alcohol level*, *wine style*, *wine colour*, *IWC points* and *trophies*. The labels are read from the page itself, so if a wine has extra information it is added in the same way.


In [None]:

# Root that the hrefs found on the results page are appended to.
BASE_URL = 'https://www.internationalwinechallenge.com/canopy/'

# Assumes the standard results page has already been downloaded into `page`.
# Go through its <a class="result"> anchors and keep every non-empty href,
# in page order.
result_hrefs = (anchor.get("href") for anchor in page.find_all("a", class_="result"))
links = [href for href in result_hrefs if href]

print("Found links:", links)

# Seconds to wait on a detail page before giving up, so that one stalled
# connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30

# Title ending in a comma followed by a 4-digit year or NV: https://regexr.com/8hfrf
_TITLE_RE = re.compile(r"^(.*),\s*([0-9]{4}|NV)$", re.IGNORECASE)


def _parse_title(full_title):
    """Split a wine title into ``(name, year, is_nv)``.

    A title ending in ", <4 digits>" yields the name and the year as an int.
    A title ending in ", NV" (any case) yields the name, ``None`` and ``True``.
    Any other non-empty title is returned whole as the name, with no year.
    An empty or missing title yields ``(None, None, False)``.
    """
    if not full_title:
        return None, None, False

    title = full_title.strip()
    match = _TITLE_RE.match(title)
    if not match:
        return title, None, False

    name = match.group(1).strip()
    suffix = match.group(2).strip().upper()
    if suffix == "NV":
        return name, None, True
    return name, int(suffix), False


def _parse_trophies(ul):
    """Return the trophy names found in the ``<ul>`` of the information block.

    The list is laid out as a "Trophies" header ``<li>`` followed by
    comma-terminated names in plain text nodes. ``ul`` may be ``None``, in
    which case an empty list is returned.
    """
    trophies = []
    if ul is None:
        return trophies

    for child in ul.children:
        is_tag = getattr(child, "name", None) is not None
        # Skip the <li> that just says "Trophies"
        if is_tag and child.name == "li" and "Trophies" in child.get_text(strip=True):
            continue
        # Use the visible text of nested tags; str() on a tag would store raw
        # HTML markup as a trophy name.
        text = child.get_text(" ", strip=True) if is_tag else str(child)
        # Names are comma-terminated, so one text node can carry several of
        # them. NOTE(review): assumes trophy names never contain a comma.
        for piece in text.split(","):
            piece = " ".join(piece.split())  # collapse newlines/tabs from the template
            if piece:
                trophies.append(piece)
    return trophies


def _parse_info(info_div):
    """Collect the label/value facts and the trophies from the information div.

    Returns a dict mapping each ``<span>`` label to the remaining text of its
    ``<p>``, plus a ``"Trophies"`` key holding a (possibly empty) list.
    """
    info = {}
    # Each fact is laid out as <p><span>Label</span>Value</p>
    for p in info_div.find_all("p"):
        span = p.find("span")
        if not span:
            continue
        label = span.get_text(strip=True)
        text = p.get_text(strip=True)
        # Remove the label once, from the front. Replacing every occurrence
        # would also eat the label word when it shows up inside the value.
        if text.startswith(label):
            value = text[len(label):]
        else:
            value = text.replace(label, "", 1)
        info[label] = value.strip()

    info["Trophies"] = _parse_trophies(info_div.find("ul"))
    return info


def _scrape_wine(detail_url):
    """Download one wine detail page and return its fields as a flat dict."""
    detail_req = request.Request(detail_url, headers={"User-Agent": "Mozilla/5.0"})
    with request.urlopen(detail_req, timeout=REQUEST_TIMEOUT) as resp:
        detail_text = resp.read()
    detail_soup = bs4.BeautifulSoup(detail_text, "html.parser")

    # --- Name + Year + NV ---
    title_tag = detail_soup.find("h1")
    full_title = title_tag.get_text(strip=True) if title_tag else None
    name, year, is_nv = _parse_title(full_title)

    # --- Wine Image --- (only the link is kept; .get avoids a KeyError on an <img> without src)
    img_tag = detail_soup.select_one("div.bottle img")
    wine_img = img_tag.get("src") if img_tag else None

    # --- Medals --- (image links; <img> tags without a src are ignored)
    medals = [img.get("src") for img in detail_soup.select("div.medals img") if img.get("src")]

    # --- Tasting Notes ---
    tasting_div = detail_soup.find("div", class_="tasting-note")
    tasting_p = tasting_div.find("p") if tasting_div else None
    tasting_notes = tasting_p.get_text(strip=True) if tasting_p else None

    # --- Additional information ---
    info_div = detail_soup.find("div", class_="information")
    info = _parse_info(info_div) if info_div else {}

    return {
        "Name": name,
        "Year": year,
        "NV": is_nv,
        "Tasting Notes": tasting_notes,
        "Detail Link": detail_url,
        "Medals": medals,
        "Wine Image": wine_img,
        **info
    }


# --- Build the dataset: one dict per wine detail page ---
data = []

for url in links:
    data.append(_scrape_wine(BASE_URL + url))

# --- DataFrame ---
df = pd.DataFrame(data)

# Columns pushed to the right-hand side of the table. Only the ones that exist
# are listed: when nothing was scraped the frame has no columns at all, and
# selecting a missing label would raise a KeyError.
cols_to_move = [
    col
    for col in ["Tasting Notes", "Wine Image", "Medals", "Detail Link"]
    if col in df.columns
]

# Get all columns in the current DataFrame
all_cols = df.columns.tolist()

# Keep the remaining columns in their current order, then append the moved ones
new_order = [col for col in all_cols if col not in cols_to_move] + cols_to_move

# Reorder DataFrame and persist it
df = df[new_order]
df.to_csv("wine_results.csv", index=False)

KeyboardInterrupt: 

### [Test] The following code helps to show how to find information on a page using BS4

In [None]:
# Fetch the first detail page to explore its structure with BeautifulSoup.
url = links[0]
detail_req = request.Request(BASE_URL + url, headers={"User-Agent": "Mozilla/5.0"})
# Close the HTTP response once the body is read (as the scraping loop does) and
# bound the wait so a stalled connection cannot hang the kernel.
with request.urlopen(detail_req, timeout=30) as resp:
    detail_text = resp.read()
detail_soup = bs4.BeautifulSoup(detail_text, "html.parser")

print(BASE_URL + url)
# Dump the parsed HTML to disk so it can be inspected outside the notebook.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(str(detail_soup))

# Locate the <div class="information"> element of the page.
found = detail_soup.find('div',{'class':'information'})

https://www.internationalwinechallenge.com/canopy/beverage_details?wid=193376


In [None]:
# Display the <div class="information"> element stored in `found`.
found

<div class="information">
<p><span>Produced by</span> Mora Wines</p>
<p><span>Country</span> Central Otago, New Zealand</p>
<p><span>Grape</span>Pinot Noir</p>
<p><span>Alcohol level</span>13.5%</p>
<p><span>Wine style</span>  Still</p>
<p><span>Wine colour</span> Red</p>
<p><span>IWC Points</span>96</p>
<ul>
<li>Trophies</li>
						Central Otago Pinot Noir Trophy, 
					</ul>
</div>