**Importing Necessary Libraries**

In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine

**Scraping Books Data**

In [59]:
def scrape_books_to_scrape(url="http://books.toscrape.com/", max_books=50, timeout=10):
    """Scrape title, price and rating for the books listed on a catalogue page.

    Args:
        url: Page to download. Defaults to the books.toscrape.com landing page.
        max_books: Upper bound on the number of listings parsed from the page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with the string columns ``title``, ``price`` and ``rating``
        (one row per successfully parsed listing). An empty DataFrame is
        returned when the page cannot be fetched or answers with a non-200
        status code.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Network failures are reported the same way as a bad status code:
        # a message plus an empty frame that callers already check for.
        print(f"Error: Unable to access site. {e}")
        return pd.DataFrame()

    # Check response code
    if response.status_code != 200:
        print(f"Error: Unable to access site. Status code: {response.status_code}")
        return pd.DataFrame()

    # Hand BeautifulSoup the raw bytes so it can detect the document's own
    # encoding. `response.text` depends on the charset guessed from the HTTP
    # headers; a wrong guess is presumably what produces the stray 'Â' in
    # front of the pound sign in the scraped prices.
    soup = BeautifulSoup(response.content, 'html.parser')

    books = []
    rows = soup.select('article.product_pod')

    for row in rows[:max_books]:
        try:
            title = row.select_one('h3 a')['title']
            price = row.select_one('.price_color').text.strip()
            # The second CSS class of the star-rating element holds the rating word
            rating = row.select_one('p.star-rating')['class'][1]
            books.append({"title": title, "price": price, "rating": rating})
        except (AttributeError, TypeError, KeyError, IndexError) as e:
            # A missing element makes select_one return None, which raises
            # TypeError on subscripting and AttributeError on `.text`; a
            # missing attribute or class raises KeyError / IndexError.
            # Skip the malformed listing instead of aborting the whole scrape.
            print(f"Error parsing row: {row}")
            print(e)

    # Create DataFrame
    book_df = pd.DataFrame(books)
    return book_df

**Cleaning the Scraped Data**

In [60]:
def clean_data(df):
    """Return a cleaned copy of ``df``.

    The steps run in this order:

    1. Strip leading/trailing whitespace from string values in object columns.
    2. Drop exact duplicate rows.
    3. Drop rows that contain any missing value.

    Trimming happens first so that rows differing only by surrounding
    whitespace are recognised as duplicates. Non-string values stored in
    object columns are left untouched rather than being turned into NaN.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame; the original index labels of the surviving rows are kept.
    """
    # Work on a copy so the caller's frame is never altered
    cleaned = df.copy()

    # Trimming whitespace (only real strings; other objects pass through as-is)
    for col in cleaned.select_dtypes(include=['object']).columns:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Removing duplicates
    cleaned = cleaned.drop_duplicates()

    # Missing values
    cleaned = cleaned.dropna()

    return cleaned

**Feature Extraction and Data Transformation**

In [65]:
def transform_data(df):
    """Return a copy of ``df`` with a numeric price and a min-max scaled price.

    The ``price`` column is converted to numbers after removing the pound
    sign, thousands separators and a stray 'Â' (presumably left over from a
    mis-decoded pound sign). Values that still cannot be parsed become NaN.
    A ``price_normalized`` column is added using ``MinMaxScaler``.

    Args:
        df: DataFrame with a ``price`` column. It is not modified, so the
            cell can be re-run on the same input without errors.

    Returns:
        A new DataFrame with ``price`` as numeric and an extra
        ``price_normalized`` column. When there is no valid price at all
        (empty frame or every price unparseable) ``price_normalized`` is NaN.
    """
    # Copy first: writing into the caller's frame would make a second run
    # fail, because `.str` does not work on an already-numeric column.
    transformed = df.copy()

    # Convert price to numeric (casting to str keeps already-numeric input working)
    raw_price = transformed['price'].astype(str)
    for unwanted in ('£', ',', 'Â'):
        raw_price = raw_price.str.replace(unwanted, '', regex=False)
    transformed['price'] = pd.to_numeric(raw_price.str.strip(), errors='coerce')

    # Normalize price
    if transformed['price'].notna().any():
        scaler = MinMaxScaler()
        transformed['price_normalized'] = scaler.fit_transform(transformed[['price']])
    else:
        # The scaler cannot be fitted without at least one valid sample
        transformed['price_normalized'] = float('nan')

    return transformed

**Saving to CSV**

In [66]:
def save_to_csv(df, file_name="books_data.csv"):
    """Write ``df`` to ``file_name`` as CSV without the index column.

    Nothing is written when the DataFrame is empty; a message is printed
    in either case.
    """
    if df.empty:
        print("Error: DataFrame is empty. No data saved to CSV.")
        return

    df.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")

**Saving to SQLite Database**

In [67]:
def load_to_database(df, db_name="books_data.db", table_name="top_books"):
    """Write ``df`` into a table of a SQLite database, replacing the table if it exists.

    Args:
        df: DataFrame to store. The index is not written.
        db_name: Path of the SQLite database file.
        table_name: Name of the destination table.

    An empty DataFrame is skipped with a message (mirroring the CSV export),
    so an existing table is not replaced by an empty one.
    """
    if df.empty:
        print("Error: DataFrame is empty. No data loaded into database.")
        return

    engine = create_engine(f"sqlite:///{db_name}")
    try:
        df.to_sql(table_name, engine, if_exists="replace", index=False)
    finally:
        # Release the engine's pooled connections even when the write fails
        engine.dispose()
    print(f"Data loaded into table '{table_name}' in database '{db_name}'.")

**Unit Testing**

In [75]:
def test_clean_data():
    """Unit test for ``clean_data`` using a small fixed fixture.

    The fixture contains one exact duplicate row, one row with a missing
    price and titles padded with whitespace, so the test exercises every
    cleaning step without needing network access.
    """
    raw_data = pd.DataFrame(
        {
            "title": ["  Book A ", "  Book A ", "Book B", "Book C "],
            "price": ["£10.00", "£10.00", None, "£12.50"],
            "rating": ["One", "One", "Two", "Three"],
        },
        dtype=object,
    )
    # 4 rows - 1 exact duplicate - 1 row with a missing price
    expected_rows = 2

    cleaned_data = clean_data(raw_data)

    assert cleaned_data.shape[0] == expected_rows, f"Test failed: Expected {expected_rows} rows after cleaning, got {cleaned_data.shape[0]}"
    assert cleaned_data['title'].isnull().sum() == 0, "Test failed: Missing title value after cleaning"
    assert cleaned_data['title'].tolist() == ["Book A", "Book C"], "Test failed: Titles were not trimmed as expected"

    print("Data cleaning test passed!")

test_clean_data()


Data cleaning test passed!


**Integration Test**

In [76]:
def test_pipeline():
    """Integration test: scrape, clean, transform and persist, checking the result.

    Requires network access. If nothing could be scraped the test exits
    early with a message instead of failing. Otherwise it asserts basic
    invariants of the transformed data and that the CSV export contains
    every row before reporting success.
    """
    print("Starting pipeline test...")

    raw_data = scrape_books_to_scrape()

    if raw_data.empty:
        print("No data scraped. Exiting test.")
        return

    cleaned_data = clean_data(raw_data)

    transformed_data = transform_data(cleaned_data)

    # Invariants the downstream outputs rely on
    expected_columns = {"title", "price", "rating", "price_normalized"}
    assert not transformed_data.empty, "Test failed: No rows left after cleaning and transforming"
    assert expected_columns <= set(transformed_data.columns), "Test failed: Missing expected columns"
    assert transformed_data['price'].notna().all(), "Test failed: Some prices could not be converted to numbers"
    # Small tolerance because min-max scaling is done in floating point
    assert transformed_data['price_normalized'].between(-1e-9, 1 + 1e-9).all(), "Test failed: Normalized price outside [0, 1]"

    csv_name = "books_data.csv"
    save_to_csv(transformed_data, csv_name)
    assert len(pd.read_csv(csv_name)) == len(transformed_data), "Test failed: CSV row count does not match the data"

    load_to_database(transformed_data)

    print("Pipeline test passed!")

test_pipeline()


Starting pipeline test...
Data saved to books_data.csv
Data loaded into table 'top_books' in database 'books_data.db'.
Pipeline test passed!


**Running the Full Pipeline**

In [69]:
def full_pipeline():
    """Run the whole workflow: scrape, clean, transform, then export.

    The transformed data is written to CSV first and to the database
    second. If the scrape yields no rows, a message is printed and
    nothing is exported.
    """
    scraped = scrape_books_to_scrape()
    if scraped.empty:
        print("No data scraped. Exiting pipeline.")
        return

    prepared = transform_data(clean_data(scraped))

    # Export order matters for the printed output: CSV, then database
    for export in (save_to_csv, load_to_database):
        export(prepared)

full_pipeline()

Data saved to books_data.csv
Data loaded into table 'top_books' in database 'books_data.db'.
