# The code below saves all individual coffee review links in 'coffee_links.json'

In [13]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin

def get_links(url, timeout=30):
    """Scrape all coffee review links from a given page URL.

    Args:
        url: Address of a listing page to fetch.
        timeout: Seconds to wait for the server before giving up on the
            request. Without it a stalled connection would block forever.

    Returns:
        A list of absolute review URLs, one for each element with class
        ``review-title`` that contains an ``<a>`` tag with an ``href``.
        An empty list is returned when the response status is not 200 or
        when any exception is raised while fetching or parsing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"‚ùå Failed to retrieve {url}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        coffee_links = []
        for review in soup.find_all(class_='review-title'):
            a_tag = review.find('a')
            if a_tag and 'href' in a_tag.attrs:
                # urljoin resolves relative hrefs against the page URL.
                coffee_links.append(urljoin(url, a_tag['href']))

        print(f"‚úÖ Found {len(coffee_links)} links on {url}")
        return coffee_links
    except Exception as e:
        # Deliberately broad: a single bad page (timeout, connection error,
        # parse failure) is reported and skipped so the caller can keep going.
        print(f"‚ö†Ô∏è Error fetching {url}: {str(e)}")
        return []

def save_links_to_json(links, filename="coffee_links.json"):
    """Write the collected links to ``filename`` as an indented JSON document.

    Nothing is written when ``links`` is empty (or otherwise falsy); a
    notice is printed instead. On success the file is overwritten and a
    summary line with the number of links is printed.
    """
    # Guard clause: never create or overwrite the file with an empty result.
    if not links:
        print("‚ö†Ô∏è No links found, skipping file save.")
        return

    with open(filename, "w") as out:
        json.dump(links, out, indent=4)

    total = len(links)
    print(f"‚úÖ Saved {total} total links to {filename}")

# Main execution: walk every listing page, gather the review links it
# exposes, and write the combined list to disk once at the end.
all_links = []  # Master list for all collected links
base_url = "https://www.coffeereview.com/review/page/{}/"

for page in range(1, 415):  # Loop from page 1 to 414
    page_url = base_url.format(page)
    links = get_links(page_url)

    # get_links returns [] on failure, so extending is a harmless no-op then.
    all_links.extend(links)

    # Pause between requests, as the detail scraper does, to avoid
    # hammering the site and getting rate-limited.
    time.sleep(1)

# Save all collected links once all pages are scraped
save_links_to_json(all_links)


‚úÖ Found 20 links on https://www.coffeereview.com/review/page/1/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/2/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/3/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/4/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/5/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/6/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/7/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/8/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/9/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/10/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/11/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/12/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/13/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/14/
‚úÖ Found 20 links on https://www.coffeereview.com/review/page/15/
‚úÖ 

# get ratings from coffee 

# The code below reads the links from 'coffee_links.json', scrapes each review page, and appends the result to 'coffee_data.json' (one JSON object per line)

In [26]:
import requests
from bs4 import BeautifulSoup
import json
import time

def _text_by_class(soup, class_name):
    """Return the stripped text of the first element with ``class_name``, or "N/A"."""
    tag = soup.find(class_=class_name)
    return tag.text.strip() if tag else "N/A"


def get_coffee_data(url, timeout=30):
    """Scrape one coffee review page and append its data to 'coffee_data.json'.

    The record holds the page URL, the name, roaster and rating (each
    "N/A" when the corresponding element is missing), plus one entry for
    every two-cell table row on the page, keyed by the first cell's text
    with colons removed. The record is appended as a single JSON line.

    Args:
        url: Address of the review page to fetch.
        timeout: Seconds to wait for the server before giving up on the
            request. Without it a stalled connection would block forever.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        request is reported and skipped rather than raised, so a caller
        looping over many URLs is not aborted by one bad page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"‚ö†Ô∏è Error fetching {url}: {str(e)}")
        return

    if response.status_code != 200:
        print(f"‚ùå Failed to fetch data from {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Create dictionary to store coffee data
    coffee_data = {
        "URL": url,
        "Name": _text_by_class(soup, "review-title"),
        "Roaster": _text_by_class(soup, "review-roaster"),
        "Rating": _text_by_class(soup, "review-template-rating"),
    }

    # Every two-cell table row is treated as a "label: value" pair.
    for row in soup.find_all("tr"):
        columns = row.find_all("td")
        if len(columns) == 2:
            label = columns[0].text.strip().replace(":", "")
            coffee_data[label] = columns[1].text.strip()

    # Append to JSON file line by line, so earlier records survive if a
    # long run is interrupted.
    with open('coffee_data.json', 'a') as file:
        file.write(json.dumps(coffee_data) + '\n')

    print(f"‚úÖ Data saved for {url}")

def scrape_coffee_links(file_name):
    """Read links from a JSON file and scrape each coffee review.

    Args:
        file_name: Path to a JSON file containing a list of review URLs.

    Returns:
        None. If the file is missing or is not valid JSON, an error is
        printed and nothing is scraped.
    """
    # Only the read/parse step is guarded, so an error raised later while
    # scraping is never misreported as a problem with the links file.
    try:
        with open(file_name, 'r') as file:
            coffee_links = json.load(file)
    except FileNotFoundError:
        print(f"‚ùå Error: {file_name} not found.")
        return
    except json.JSONDecodeError:
        print(f"‚ùå Error: {file_name} is not a valid JSON file.")
        return

    for url in coffee_links:
        print(f"üîÑ Scraping: {url}")
        get_coffee_data(url)
        time.sleep(1)  # Add delay to prevent rate-limiting

# Start the detail scrape from the saved list of review links
scrape_coffee_links(file_name='coffee_links.json')


üîÑ Scraping: https://www.coffeereview.com/review/kenya-kirinyaga-4/
‚úÖ Data saved for https://www.coffeereview.com/review/kenya-kirinyaga-4/
üîÑ Scraping: https://www.coffeereview.com/review/costa-rica-san-diego/
‚úÖ Data saved for https://www.coffeereview.com/review/costa-rica-san-diego/
üîÑ Scraping: https://www.coffeereview.com/review/costa-rica-la-candelilla-geisha/
‚úÖ Data saved for https://www.coffeereview.com/review/costa-rica-la-candelilla-geisha/
üîÑ Scraping: https://www.coffeereview.com/review/colombia-arbey-narvaez/
‚úÖ Data saved for https://www.coffeereview.com/review/colombia-arbey-narvaez/
üîÑ Scraping: https://www.coffeereview.com/review/guatemala-santa-ana-dilla-alghe-washed/
‚úÖ Data saved for https://www.coffeereview.com/review/guatemala-santa-ana-dilla-alghe-washed/
üîÑ Scraping: https://www.coffeereview.com/review/ethiopia-agaro-duromina-2/
‚úÖ Data saved for https://www.coffeereview.com/review/ethiopia-agaro-duromina-2/
üîÑ Scraping: https://www.coffeer

KeyboardInterrupt: 