In [4]:
from bs4 import BeautifulSoup
import requests
from minio import Minio
from io import BytesIO
from datetime import datetime
import csv
from io import StringIO
from textblob import TextBlob
import re

def setup_minio_client():
    """Create a MinIO client and make sure the pipeline buckets exist.

    Connects to the MinIO server at ``localhost:9000`` with ``secure=False``
    and, for each of the ``bronze``, ``silver`` and ``gold`` buckets (in that
    order), creates the bucket when it is missing and prints a confirmation.

    Returns:
        The configured ``Minio`` client.
    """
    # NOTE(review): endpoint and credentials are hard-coded; presumably these
    # are local development values -- confirm before using elsewhere.
    client = Minio(
        'localhost:9000',
        access_key='ROOTUSER',
        secret_key='DATAINCUBATOR',
        secure=False,
    )

    # One bucket per pipeline tier; existing buckets are left untouched.
    for tier in ('bronze', 'silver', 'gold'):
        if client.bucket_exists(tier):
            continue
        client.make_bucket(tier)
        print(f"Bucket '{tier}' created successfully")

    return client

# Module-level MinIO client passed to the save_data_to_minio calls further
# down; creating it also creates any missing bronze/silver/gold buckets.
minio_client = setup_minio_client()

def scrape_books_data():
    """Scrape the first catalogue page of books.toscrape.com.

    Returns:
        A list with one dict per ``article.product_pod`` element, holding the
        keys ``title``, ``price``, ``availability`` and ``rating`` (all
        strings as found in the page), or ``None`` when the page could not be
        fetched (network error, timeout, or a non-200 status code).
    """
    url = "https://books.toscrape.com/catalogue/page-1.html"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code so that
        # callers only have to handle the ``None`` return value.
        print(f"Failed to fetch page: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return None

    # Parse the raw bytes so BeautifulSoup can use the encoding declared in
    # the document itself. ``response.text`` relies on the HTTP header charset
    # and falls back to ISO-8859-1 when it is absent, which can garble
    # non-ASCII characters such as the currency symbol.
    soup = BeautifulSoup(response.content, 'html.parser')

    books_data = []
    for book in soup.find_all('article', class_='product_pod'):
        title = book.find('h3').find('a')['title']
        price = book.find('p', class_='price_color').text.strip()
        availability = book.find('p', class_='instock availability').text.strip()
        # The rating is taken from the second CSS class of the star-rating
        # element.
        rating = book.find('p', class_='star-rating')['class'][1]

        books_data.append({
            'title': title,
            'price': price,
            'availability': availability,
            'rating': rating,
        })
    return books_data

def scrape_quotes_data():
    """Scrape the first page of quotes.toscrape.com.

    Returns:
        A list with one dict per ``div.quote`` element, holding the keys
        ``text`` and ``author``, or ``None`` when the page could not be
        fetched (network error, timeout, or a non-200 status code).
    """
    url = "http://quotes.toscrape.com/page/1/"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code so that
        # callers only have to handle the ``None`` return value.
        print(f"Failed to fetch page: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return None

    # Parse the raw bytes so BeautifulSoup can use the encoding declared in
    # the document itself rather than depending on the HTTP header charset.
    soup = BeautifulSoup(response.content, 'html.parser')

    quotes_data = []
    for quote in soup.find_all('div', class_='quote'):
        text = quote.find('span', class_='text').text.strip()
        author = quote.find('small', class_='author').text.strip()

        quotes_data.append({
            'text': text,
            'author': author,
        })
    return quotes_data

def clean_books_data(books_data):
    """Normalize scraped book records and stamp them with the cleaning time.

    Args:
        books_data: Iterable of dicts with the keys ``title``, ``price``,
            ``availability`` and ``rating``.

    Returns:
        A new list of dicts with the same keys plus ``scrape_timestamp``.
        ``price`` is converted to ``float`` (``None`` when it cannot be
        parsed, in which case a message is printed) and ``availability`` has
        newlines and surrounding whitespace removed. All records of one call
        share the same ISO-format timestamp.
    """
    stamped_at = datetime.now().isoformat()
    result = []
    for raw in books_data:
        # Keep only digits and the decimal point before parsing the price.
        numeric_text = re.sub(r'[^\d.]', '', raw['price'])
        try:
            parsed_price = float(numeric_text)
        except ValueError:
            print(f"Could not convert price to float for book: {raw['title']}")
            parsed_price = None  # Unparseable prices are kept as None.

        stock_text = raw['availability'].replace('\n', '').strip()

        result.append({
            'title': raw['title'],
            'price': parsed_price,
            'availability': stock_text,
            'rating': raw['rating'],
            'scrape_timestamp': stamped_at,
        })
    return result

def clean_quotes_data(quotes_data):
    """Keep complete quote records and stamp them with the cleaning time.

    Args:
        quotes_data: Iterable of dicts describing scraped quotes.

    Returns:
        A new list containing, for every input record that has both a
        ``text`` and an ``author`` key, a dict with ``text``, ``author`` and
        ``scrape_timestamp``. Any other keys are dropped, and all records of
        one call share the same ISO-format timestamp.
    """
    stamped_at = datetime.now().isoformat()
    return [
        {
            'text': entry['text'],
            'author': entry['author'],
            'scrape_timestamp': stamped_at,
        }
        for entry in quotes_data
        # Records missing either field are silently skipped.
        if 'text' in entry and 'author' in entry
    ]


def add_price_category(books_data):
    """Tag every book with a ``price_category`` derived from its price.

    Categories: ``'cheap'`` for prices below 10, ``'moderate'`` for prices
    below 20, ``'expensive'`` otherwise, and ``'unknown'`` when the price is
    ``None``.

    Args:
        books_data: List of dicts that each have a ``price`` key holding a
            number or ``None``.

    Returns:
        The same list object; the dicts are updated in place.
    """
    for book in books_data:
        price = book['price']
        if price is None:
            # clean_books_data stores None for prices it could not parse;
            # comparing None with a number would raise TypeError.
            book['price_category'] = 'unknown'
        elif price < 10:
            book['price_category'] = 'cheap'
        elif price < 20:
            book['price_category'] = 'moderate'
        else:
            book['price_category'] = 'expensive'
    return books_data

def add_sentiment_analysis(quotes_data):
    """Return copies of the quote records with a ``sentiment`` score added.

    Args:
        quotes_data: Iterable of dicts that each have a ``text`` key.

    Returns:
        A new list of new dicts; each one holds the original record's items
        plus ``sentiment``, the TextBlob polarity of its ``text``. The input
        records are not modified.
    """
    return [
        # Polarity is a value between -1 and 1.
        {**entry, 'sentiment': TextBlob(entry['text']).sentiment.polarity}
        for entry in quotes_data
    ]

def save_data_to_minio(data, minio_client, bucket_name, object_name):
    """Serialize records to CSV and upload them as a single MinIO object.

    Args:
        data: Sequence of dicts. The keys of the first record define the CSV
            columns; ``csv.DictWriter`` raises ``ValueError`` if a later
            record contains a key that is not among them.
        minio_client: Client exposing ``bucket_exists``, ``make_bucket`` and
            ``put_object``.
        bucket_name: Target bucket; created when it does not exist.
        object_name: Name of the object to write inside the bucket.

    Returns:
        None. When ``data`` is empty nothing is uploaded and a message is
        printed instead.
    """
    if not data:
        # There is no first record to take the header from, so skip the
        # upload instead of failing with IndexError on data[0].
        print(f"No data to save for {object_name}; skipping upload to bucket '{bucket_name}'.")
        return

    csv_buffer = StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=list(data[0].keys()))
    writer.writeheader()
    writer.writerows(data)

    # Encode once and reuse the bytes for both the stream and its length.
    payload = csv_buffer.getvalue().encode('utf-8')

    if not minio_client.bucket_exists(bucket_name):
        minio_client.make_bucket(bucket_name)

    minio_client.put_object(
        bucket_name, object_name, BytesIO(payload), len(payload)
    )
    print(f"Data saved successfully as {object_name} in bucket '{bucket_name}'.")

# Bronze level data scraping
# NOTE(review): the raw scrape results are not written to the 'bronze' bucket
# anywhere in this script -- confirm whether that is intended.
book_data = scrape_books_data()
quote_data = scrape_quotes_data()

# One date stamp for every object written by this run, so the silver and gold
# files of a run always share the same suffix.
run_date = datetime.now().strftime("%Y%m%d")

# Silver level data cleaning and enrichment
# The cleaned lists default to empty so the gold stage below can test them
# even when a scrape returned nothing.
cleaned_books_data = clean_books_data(book_data) if book_data else []
if cleaned_books_data:
    save_data_to_minio(cleaned_books_data, minio_client, 'silver', f'books_data_silver_{run_date}.csv')

cleaned_quotes_data = clean_quotes_data(quote_data) if quote_data else []
if cleaned_quotes_data:
    save_data_to_minio(cleaned_quotes_data, minio_client, 'silver', f'quotes_data_silver_{run_date}.csv')

# Gold level data with added analysis
if cleaned_books_data:
    # add_price_category updates the cleaned records in place; the silver
    # file has already been written at this point.
    gold_books_data = add_price_category(cleaned_books_data)
    save_data_to_minio(gold_books_data, minio_client, 'gold', f'books_data_gold_{run_date}.csv')

if cleaned_quotes_data:
    gold_quotes_data = add_sentiment_analysis(cleaned_quotes_data)
    save_data_to_minio(gold_quotes_data, minio_client, 'gold', f'quotes_data_gold_{run_date}.csv')


Data saved successfully as books_data_silver_20241106.csv in bucket 'silver'.
Data saved successfully as quotes_data_silver_20241106.csv in bucket 'silver'.
Data saved successfully as books_data_gold_20241106.csv in bucket 'gold'.
Data saved successfully as quotes_data_gold_20241106.csv in bucket 'gold'.
