## Import Necessary Libraries
Import the required Python libraries for HTTP requests, data manipulation, HTML parsing, and time control.


In [1]:
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display

## MyTheresaScraper Class Definition
This class is designed to encapsulate the methods needed for scraping data from the MyTheresa website specifically for men's fashion. It provides functionalities to fetch and parse webpage content, extract categories, subcategories, and product details.




### Constructor
Initializes a new instance of the `MyTheresaScraper` class with a base URL. The base URL points to the men's section of the MyTheresa website.


### Get Request Page Method
Fetches and parses the page at the URL built from the base URL and a subpath. Connection-level failures are retried up to `max_retries` times, with a pause of `retry_delay` seconds between attempts; any other request error (for example an HTTP error status) is reported and the method returns `None` immediately. If all attempts fail, it also returns `None`.


### Get Categories Method
Fetches the homepage to extract main categories and subcategories. It constructs a DataFrame to keep the data organized. Categories are assumed to be in specific divisions of the webpage, hence the slice `[2:7]`.


### Get Subcategories Method
For a given category page, this method extracts all subcategories and their corresponding URLs, updating the main categories DataFrame with these details.


### Get Detail Method
Extracts detailed information of all products listed on a category-specific page. It collects data such as designer name, item name, and price. This method returns a list of dictionaries, each containing a product's complete details.


In [2]:
class MyTheresaScraper:
    """Scraper for the men's section of the MyTheresa web shop.

    Pages are downloaded with ``requests`` and parsed with BeautifulSoup.
    The CSS class names used for the lookups are hard-coded in the methods
    below.
    """

    def __init__(self, base_url='https://www.mytheresa.com/ch/en/men'):
        """Remember the root URL that every requested sub-path is appended to."""
        self.base_url = base_url

    @staticmethod
    def _text_or_none(node):
        """Return the stripped text of a parsed element, or ``None`` if it is missing."""
        if node is None:
            return None
        return node.text.strip()

    def get_request_page(self, sub_url, max_retries=10, retry_delay=5, timeout=30):
        """Download ``<base_url>/<sub_url>`` and return it as a parsed document.

        Args:
            sub_url: Path (and optional query string) appended to the base URL.
            max_retries: Maximum number of attempts before giving up.
            retry_delay: Seconds to wait between two attempts.
            timeout: Seconds to wait for the server before an attempt counts
                as failed; without it a stalled connection would block forever.

        Returns:
            A ``BeautifulSoup`` document, or ``None`` when the page could not
            be fetched.
        """
        url = f'{self.base_url}/{sub_url}'
        for attempt in range(1, max_retries + 1):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
            except (requests.ConnectionError, requests.Timeout):
                # Transient network problems are worth another attempt.
                # Decide by exception type instead of matching message text.
                print('Retry...')
                if attempt < max_retries:
                    time.sleep(retry_delay)
            except requests.RequestException as error:
                # Anything else (e.g. an HTTP error status) will not improve
                # by retrying, so give up on this URL right away.
                print(f'Error fetching page: {error}')
                return None
            else:
                return BeautifulSoup(response.content, 'html.parser')
        print(f'Max retries exceeded. Skipping {url}')
        return None

    def get_categories(self):
        """Build a table of categories, subcategories and subcategory links.

        Returns:
            A DataFrame with the columns ``Category``, ``Subcategory`` and
            ``Link``; it is empty when the homepage could not be fetched.
        """
        categories_df = pd.DataFrame(columns=['Category', 'Subcategory', 'Link'])
        homepage = self.get_request_page('')
        if homepage is None:
            return categories_df

        nav_items = homepage.find_all('div', class_='nav__item')
        # Only navigation entries 2..6 are treated as product categories.
        for nav_item in nav_items[2:7]:
            link = nav_item.find('a', class_='nav__item__text__link', href=True)
            if link is None:
                # Entry without a category link: nothing to record.
                continue
            categories_df = self.get_subcategories(categories_df, nav_item, link.text.strip())
        return categories_df

    def get_subcategories(self, categories_df, sub_page, category):
        """Append the subcategories found under one navigation entry.

        Args:
            categories_df: Table collected so far.
            sub_page: Parsed navigation element of the category.
            category: Display name of the category.

        Returns:
            ``categories_df`` itself when nothing was found, otherwise a new
            DataFrame that also holds one row per subcategory link.
        """
        subcategory_list = sub_page.find('ul', class_='flyout__col__content__list')
        if subcategory_list is None:
            return categories_df

        links = subcategory_list.find_all(
            'a', class_='flyout__col__content__list__item__link', href=True)
        new_rows = [{
            'Category': category,
            'Subcategory': link.text.strip(),
            # Keep only the part after "/men/" so it can be re-appended to base_url.
            'Link': link['href'].split("/men/", 1)[-1]
        } for link in links]
        if not new_rows:
            return categories_df
        if categories_df.empty:
            # Concatenating with an empty frame is deprecated in recent pandas.
            return pd.DataFrame(new_rows, columns=categories_df.columns)
        return pd.concat([categories_df, pd.DataFrame(new_rows)], ignore_index=True)

    def get_detail(self, category_row, detail_page):
        """Extract designer, name and price of every product on a listing page.

        A field whose element is missing is stored as ``None``; an ``item``
        element that has none of the three fields is skipped.

        Args:
            category_row: Mapping with the keys ``Category`` and ``Subcategory``.
            detail_page: Parsed listing page.

        Returns:
            A list with one dictionary per product.
        """
        detail_data = []
        for item in detail_page.find_all('div', class_="item"):
            designer_name = self._text_or_none(
                item.find('div', class_="item__info__header__designer"))
            name_container = item.find('div', class_="item__info__name")
            item_name = self._text_or_none(
                name_container.find('a') if name_container is not None else None)
            price = self._text_or_none(item.find('span', class_="pricing__prices__price"))

            if designer_name is None and item_name is None and price is None:
                # Not a product tile: skip it.
                continue

            detail_data.append({
                'Category': category_row['Category'],
                'Subcategory': category_row['Subcategory'],
                'Designer': designer_name,
                'Name': item_name,
                'Price': price
            })
        return detail_data

## Main Execution Block
This block checks if the script is run as the main program and performs the scraping process through multiple pages, collecting all necessary product details and eventually saving them to a CSV file.


In [None]:
if __name__ == '__main__':
    # Collect the category table first, then walk every listing page of each
    # subcategory and gather the product rows in one flat list.
    scraper = MyTheresaScraper()
    categories = scraper.get_categories()
    all_data = []

    for index, row in categories.iterrows():
        # The first page is always requested; it also carries the pager that
        # tells how many further pages exist.
        page = scraper.get_request_page(f'{row["Link"]}?sortBy=new_item&page=1')
        if not page:
            continue
        all_data += scraper.get_detail(row, page)

        # Without a "last page" link there is nothing beyond page 1.
        max_page_element = page.find('a', {'data-label': 'lastPage'})
        if not max_page_element:
            continue

        max_page = int(max_page_element['data-index'])
        for x in range(2, max_page + 1):
            page = scraper.get_request_page(f'{row["Link"]}?sortBy=new_item&page={x}')
            if page:
                all_data += scraper.get_detail(row, page)

    # Persist everything that was collected; report rather than raise on failure.
    df = pd.DataFrame(all_data)
    try:
        df.to_csv('mytheresa_men_data.csv', index=False)
    except Exception as e:
        print(f"Error occurred while saving CSV file: {e}")
    else:
        print("CSV file saved successfully.")


## Load Data and Preliminary Analysis

Loading the data using Pandas and examining the first few rows along with a summary to identify potential impurities.


In [None]:
# Load the dataset written by the scraping step
data_path = 'mytheresa_men_data.csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataframe
print("First few rows of the dataframe:")
print(data.head())

# General information about the dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nGeneral information and data types:")
data.info()

# Description of data in dataframe
print("\nSummary statistics and unique value counts:")
print(data.describe(include='all'))


### Data Overview

From the loaded data, here are a few observations:

- **Total Entries**: There are 11,371 rows across 5 columns.
- **Columns**: The columns included are `Category`, `Subcategory`, `Designer`, `Name`, and `Price`.
- **Missing Values**: There appear to be no missing values as each column has entries equal to the row count.
- **Data Types**: All columns are currently of type object (string).
- **Unique Values**:
  - `Category`: 4 unique values.
  - `Subcategory`: 46 unique values.
  - `Designer`: 140 unique designers.
  - `Name`: 7,005 unique product names.
  - `Price`: Prices are formatted as strings with the CHF symbol, and there are 1,658 unique price points.


### Identified Impurities

1. **Price Formatting**:
   - Prices are prefixed with "CHF" and may contain commas in thousands. This needs to be cleaned for numerical operations.
2. **Text Formatting**:
   - Textual data in `Category`, `Subcategory`, `Designer`, and `Name` should be standardized (e.g., consistent capitalization).
3. **Duplicate Entries**:
   - With products like "Cotton polo shirt" appearing 88 times, it's possible some entries are duplicates that should be investigated and removed.
4. **Incorrect Data Types**:
   - All columns are read as objects; `Price` should be converted to a numerical format for any financial analysis.
5. **Potential Outliers or Incorrect Entries**:
   - There could be misentries especially in product names or unusually high or low prices which should be investigated.


In [None]:
class DataCleaner:
    """Clean a scraped product table: text case, price format and duplicates.

    The frame handed to the constructor is copied, so the caller's DataFrame
    is never modified; the cleaned result is returned by ``clean_data()`` and
    is also available as ``self.df``.
    """

    # The literal currency code "CHF", "$", thousands commas and any whitespace.
    _PRICE_NOISE = r'CHF|[$,\s]'

    def __init__(self, dataframe):
        """Keep a private copy of ``dataframe`` to clean."""
        self.df = dataframe.copy()

    def remove_currency_symbols(self):
        """Remove currency symbols from the 'Price' column and convert to float.

        A column that is already numeric is only cast to float. String values
        that still cannot be parsed after stripping raise ``ValueError``.
        """
        price = self.df['Price']
        if not pd.api.types.is_numeric_dtype(price):
            price = price.replace(self._PRICE_NOISE, '', regex=True)
        self.df['Price'] = price.astype(float)

    def standardize_text(self):
        """Standardize textual data to capitalize (Title case)."""
        text_cols = ['Category', 'Subcategory', 'Designer', 'Name']
        for col in text_cols:
            self.df[col] = self.df[col].str.title()

    def check_and_remove_duplicates(self):
        """Check for and remove duplicate rows from the DataFrame if any."""
        if self.df.duplicated().any():
            self.df.drop_duplicates(inplace=True)
            print("Duplicates were found and removed.")
        else:
            print("No duplicates found.")

    def convert_data_types(self):
        """Convert data types after cleaning."""
        self.df['Price'] = pd.to_numeric(self.df['Price'], errors='coerce')

    def clean_data(self):
        """Run all cleaning methods and return the cleaned DataFrame.

        Text is standardized before duplicates are dropped so that rows that
        differ only in letter case are recognised as duplicates.
        """
        self.standardize_text()
        self.remove_currency_symbols()
        self.check_and_remove_duplicates()
        self.convert_data_types()
        return self.df

In [None]:
# Instantiate and clean the data
cleaner = DataCleaner(data)
clean_data = cleaner.clean_data()

# Show cleaned data info. DataFrame.info() prints its report and returns None,
# so it is called for its output only instead of being stored.
clean_data.info()

# Keep the preview as the last expression so the notebook renders it.
clean_data_head = clean_data.head()
clean_data_head

## Cleaned Data Overview

The dataset has been successfully cleaned with the following adjustments:

### Standardized Text
- **Text Fields Affected**: `Category`, `Subcategory`, `Designer`, `Name`
- **Action Taken**: All text fields have been converted to title case to ensure consistency across entries.

### Price Formatting
- **Original Format**: Prices were formatted as strings with "CHF" symbols and commas (e.g., "CHF 2,380").
- **Actions Taken**:
  - Removed the "CHF" currency symbol and any commas.
  - Converted the `Price` field from string to float for numerical operations.

### Duplicate Rows
- **Initial Check**: A check for duplicate entries was conducted to ensure data quality.
- **Result of Check**: No duplicates were found in the dataset. Hence, no entries were removed on this basis.
- **Entries Before**: 11,371 entries.
- **Entries After**: 11,371 entries (unchanged due to no duplicates).

### Data Types Corrections
- **Price Field**:
  - Converted from string to float, correctly aligning the data type with the content for financial analysis.
- **Other Fields**:
  - The fields `Category`, `Subcategory`, `Designer`, and `Name` remain as objects (appropriate for string data).

### DataFrame Overview After Cleaning

```plaintext
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11,371 entries, 0 to 11,370
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Category     11,371 non-null  object 
 1   Subcategory  11,371 non-null  object 
 2   Designer     11,371 non-null  object 
 3   Name         11,371 non-null  object 
 4   Price        11,371 non-null  float64
dtypes: float64(1), object(4)
memory usage: 444.3+ KB
```

## Scenario Overview

I will:

1. **Dirty the Data**: Intentionally introduce common data issues into a clean dataset.
2. **Clean the Data**: Apply various cleaning techniques to restore the dataset to its original quality.

### Step 1: Dirtying the Data
**Objective**: Simulate common data issues including random noise in numerical data, text case inconsistencies, and duplicate entries.

### Step 2: Cleaning the Data
**Objective**: Implement cleaning operations to correct the issues introduced in Step 1, ensuring the data is standardized and duplicates are removed.

### Data Dirtier Class

This class introduces three types of impurities: random noise in prices, random text case errors in string fields, and duplicate rows.

In [None]:
class DataDirtier:
    """Deliberately degrade a clean product table for cleaning exercises.

    The frame handed to the constructor is copied, so the caller's clean
    DataFrame is left untouched; the degraded result is returned by
    ``dirty_data()`` and is also available as ``self.df``.
    """

    def __init__(self, dataframe, seed=None):
        """Keep a private copy of ``dataframe`` and set up the random source.

        Args:
            dataframe: Clean table with at least the columns ``Price``,
                ``Designer`` and ``Name``.
            seed: Optional seed for reproducible results; ``None`` draws
                fresh randomness on every run.
        """
        self.df = dataframe.copy()
        self._rng = np.random.default_rng(seed)

    def _scramble_case(self, value):
        """Return ``value`` with every character randomly upper- or lower-cased."""
        if not isinstance(value, str):
            # Missing values (NaN/None) are passed through unchanged.
            return value
        make_upper = self._rng.random(len(value)) < 0.5
        return ''.join(
            char.upper() if upper else char.lower()
            for char, upper in zip(value, make_upper))

    def add_noise_to_prices(self):
        """Add random noise to price: +/- 10% of the current price, ensure 'Price' is float."""
        price = self.df['Price'].astype(float)  # Ensure Price is float before adding noise
        noise = self._rng.uniform(-0.1, 0.1, size=len(self.df))
        self.df['Price'] = price + price * noise

    def introduce_text_errors(self):
        """Introduce random uppercase letters to 'Designer' and 'Name' fields."""
        for column in ('Designer', 'Name'):
            self.df[column] = self.df[column].apply(self._scramble_case)

    def add_duplicate_entries(self):
        """Duplicate random 5% of the entries."""
        duplicates = self.df.sample(frac=0.05, replace=False, random_state=self._rng)
        self.df = pd.concat([self.df, duplicates], ignore_index=True)

    def dirty_data(self):
        """Run all methods to dirty the data and return the degraded DataFrame.

        Duplicates are added last so that the copied rows carry exactly the
        same noisy price and scrambled text as the rows they were taken from.
        """
        self.add_noise_to_prices()
        self.introduce_text_errors()
        self.add_duplicate_entries()
        return self.df


In [None]:
# Assume 'clean_data' is the DataFrame loaded with original clean data
#dirtier = DataDirtier(clean_data.copy())  # Use a copy to preserve the original clean data
#dirty_data = dirtier.dirty_data()

#cleaner = DataCleaner(dirty_data)
#re_cleaned_data = cleaner.clean_data()

# Display and save using DataShowcase
#data_showcase = DataShowcase(dirty_data, re_cleaned_data)
#data_showcase.display_data()
#data_showcase.save_data('dirty_data.csv', 'clean_data.csv')


In [None]:
class DataShowcase:
    """Presents a dataset before and after cleaning, and writes both to disk."""

    def __init__(self, dirty_df, clean_df):
        """
        Keeps references to the two dataframes that are compared.

        :param dirty_df: DataFrame before cleaning
        :param clean_df: DataFrame after cleaning
        """
        self.dirty_df, self.clean_df = dirty_df, clean_df

    def display_data(self):
        """
        Prints a heading and renders the first rows of each dataframe, dirty one first.
        """
        previews = (
            ("Dirty Data (first 5 rows):", self.dirty_df),
            ("\nClean Data (first 5 rows):", self.clean_df),
        )
        for heading, frame in previews:
            print(heading)
            display(frame.head())

    def save_data(self, dirty_file_path, clean_file_path):
        """
        Writes both dataframes to CSV (without the index) and prints where they went.

        :param dirty_file_path: Path to save the dirty data CSV
        :param clean_file_path: Path to save the clean data CSV
        """
        # Each frame is written and confirmed before the next one is touched.
        outputs = (
            (self.dirty_df, dirty_file_path, f"Dirty data saved to {dirty_file_path}"),
            (self.clean_df, clean_file_path, f"Clean data saved to {clean_file_path}"),
        )
        for frame, path, confirmation in outputs:
            frame.to_csv(path, index=False)
            print(confirmation)

        # Markdown-style links, printed as plain text.
        link_lines = (
            "\nDownload Links:",
            f"Download Dirty Data: [here](./{dirty_file_path})",
            f"Download Clean Data: [here](./{clean_file_path})",
        )
        for line in link_lines:
            print(line)


In [None]:
# Usage of DataShowcase class
# NOTE: `dirty_data` and `re_cleaned_data` are assigned by the dirty/clean
# round-trip cell further down in this notebook. That cell has to be run
# before this one, otherwise both names are undefined here.
data_showcase = DataShowcase(dirty_data, re_cleaned_data)
# Render the first rows of both frames for a visual comparison.
data_showcase.display_data()
# Write both frames to CSV files in the current working directory.
data_showcase.save_data('dirty_data.csv', 'clean_data.csv')

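### Data Cleaner Class

This class rectifies the issues: it normalizes text fields, corrects numerical fields, and removes duplicate records. It redefines the `DataCleaner` class introduced earlier in the notebook, so that `Price` columns that are already numeric are handled as well.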

In [None]:
class DataCleaner:
    """Clean a product table: text case, price format and duplicates.

    The frame handed to the constructor is copied, so the caller's DataFrame
    is never modified (a "dirty" frame stays dirty for later comparison); the
    cleaned result is returned by ``clean_data()`` and is also available as
    ``self.df``.
    """

    # The literal currency code "CHF", "$", thousands commas and any whitespace.
    _PRICE_NOISE = r'CHF|[$,\s]'

    def __init__(self, dataframe):
        """Keep a private copy of ``dataframe`` to clean."""
        self.df = dataframe.copy()

    def remove_currency_symbols_and_commas(self):
        """Remove currency symbols and commas from 'Price' and convert to float.

        A column that is already numeric is only cast to float. String values
        that still cannot be parsed after stripping raise ``ValueError``.
        """
        price = self.df['Price']
        if not pd.api.types.is_numeric_dtype(price):
            # Covers object columns as well as pandas' dedicated string dtypes.
            price = price.replace(self._PRICE_NOISE, '', regex=True)
        self.df['Price'] = price.astype(float)

    def standardize_text(self):
        """Standardize textual data to capitalize (Title case)."""
        text_cols = ['Category', 'Subcategory', 'Designer', 'Name']
        for col in text_cols:
            self.df[col] = self.df[col].str.title()

    def check_and_remove_duplicates(self):
        """Check for and remove duplicate rows from the DataFrame if any."""
        if self.df.duplicated().any():
            self.df.drop_duplicates(inplace=True)
            print("Duplicates were found and removed.")
        else:
            print("No duplicates found.")

    def clean_data(self):
        """Run all cleaning methods and return the cleaned DataFrame.

        Text is standardized before duplicates are dropped so that rows that
        differ only in letter case are recognised as duplicates.
        """
        self.standardize_text()
        self.remove_currency_symbols_and_commas()
        self.check_and_remove_duplicates()
        return self.df

## Output and Validation

This process checks that, despite the intentional introduction of errors, the data can largely be restored using the DataCleaner class. The output will verify that text fields are standardized, prices are correctly formatted as numeric values without any symbols or commas, and the introduced duplicates are removed. Note that the random noise added to the prices cannot be undone by cleaning: the re-cleaned prices stay within ±10% of the original values rather than matching them exactly.

In [None]:
# Assume 'clean_data' is the DataFrame obtained from previous clean operations
# Instantiate the dirtier and apply dirty methods.
# A copy is passed so that the noise and case scrambling do not alter
# 'clean_data' itself.
dirtier = DataDirtier(clean_data.copy())
dirty_data = dirtier.dirty_data()

# Now clean the dirty data. Again a copy is passed: otherwise 'dirty_data' and
# 're_cleaned_data' could end up being the same object and a before/after
# comparison would show identical frames.
cleaner = DataCleaner(dirty_data.copy())
re_cleaned_data = cleaner.clean_data()

# Display the info of the re-cleaned data.
# DataFrame.info() prints its report and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
re_cleaned_data.info()
print(re_cleaned_data.head())

