In [10]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import re

def load_data(filepath):
    """Read the CSV file at ``filepath`` and return it as a DataFrame."""
    frame = pd.read_csv(filepath)
    return frame

def save_data(df, filepath):
    """Write ``df`` to ``filepath`` as CSV, omitting the index column."""
    df.to_csv(path_or_buf=filepath, index=False)

class DataCleaner:
    """Clean a DataFrame of scraped product listings.

    The DataFrame is loaded from a CSV file on construction and kept in
    ``self.df``. ``clean_data`` normalizes its columns, ``save`` writes the
    result back to CSV and ``generate_profile_report`` renders an HTML
    profiling report.
    """

    # Colour keywords checked, in this order, by ``extract_color``; the first
    # keyword found in the text wins.
    _COLORS = (
        'black', 'blue', 'green', 'red', 'yellow', 'white', 'gray', 'purple',
        'pink', 'orange', 'brown', 'silver', 'gold', 'titanium', 'platinum',
        'schwarz', 'weiss',
    )

    # Phrases in the delivery text for which no delivery time is extracted.
    _NO_DELIVERY_MARKERS = (
        "nicht mehr verfügbar",
        "nicht lieferbar",
        "ausverkauft",
        "kein Liefertermin",
    )

    def __init__(self, filepath):
        """Load the CSV at ``filepath`` and set up the list of known brands."""
        self.df = load_data(filepath)
        self.valid_brands = [
            'apple', 'xiaomi', 'samsung', 'nothing', 'motorola', 'fairphone', 'google', 'doro', 'inoi',
            'emporia', 'one', 'nokia', 'ruggear', 'oppo', 'crosscall', 'wiko', 'peaq', 'huawei', 'lg',
            'sony', 'htc', 'oneplus', 'zte', 'alcatel', 'asus', 'blackberry', 'realme', 'vivo', 'tecno',
            'lenovo', 'meizu', 'honor', 'ulefone', 'cat'
        ]

    def clean_data(self):
        """Normalize every column of ``self.df`` and keep only smartphones.

        Brand and model are cleaned on the full frame first; the frame is then
        restricted to the 'Smartphone' category before the remaining columns
        are converted.
        """
        self.df['brand'] = self.df['brand'].apply(self.validate_brand)
        self.df['model'] = self.df['model'].apply(self._normalize_text)
        self.df = self.filter_category('Smartphone')
        self.df['storage'] = self.df['storage'].apply(self.convert_storage)
        self.df['color'] = self.df['color'].apply(self.extract_color)
        self.df['price'] = self.df['price'].apply(self.clean_price).astype(float)
        self.df['rating'] = self.df['rating'].fillna(0).astype(float)
        self.clean_reviews()
        self.df['delivery_time'] = self.df['delivery_time'].apply(self.extract_days)

    @staticmethod
    def _normalize_text(value):
        """Strip and lower-case ``value``; non-string cells pass through unchanged."""
        return value.strip().lower() if isinstance(value, str) else value

    def validate_brand(self, brand):
        """Return the brand stripped and lower-cased, warning on unknown brands.

        A message is printed when the normalized brand is not in
        ``self.valid_brands``. Non-string cells (e.g. missing values) are
        reported the same way and returned unchanged instead of raising.
        """
        if not isinstance(brand, str):
            print(f"Unrecognized brand, please verify: {brand}")
            return brand
        brand_lower = brand.strip().lower()
        if brand_lower not in self.valid_brands:
            print(f"Unrecognized brand, please verify: {brand}")
        return brand_lower

    def filter_category(self, category):
        """Return the rows whose 'category' equals ``category`` (case-insensitive).

        The result is an independent copy so that later column assignments do
        not write into a slice of the original frame.
        """
        mask = self.df['category'].str.lower() == category.lower()
        return self.df[mask].copy()

    def convert_storage(self, value):
        """Return the storage size in GB as a unit-less string.

        'TB' values are multiplied by 1000; 'GB' values just lose their unit.
        Both branches return the bare number so the column has one format.
        Non-string inputs are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        if 'TB' in value:
            return str(int(float(value.replace('TB', '')) * 1000))
        return value.replace('GB', '').strip()

    def extract_color(self, value):
        """Return the first known colour keyword found in ``value``.

        Matching is a case-insensitive substring search. Returns 'unknown'
        when no keyword matches or when ``value`` is not a string.
        """
        if not isinstance(value, str):
            return 'unknown'
        value_lower = value.lower()
        for color in self._COLORS:
            if color in value_lower:
                return color
        return 'unknown'

    def clean_price(self, value):
        """Return the longest run of digits in ``value`` as a string.

        If ``value`` contains no digits, or is not a string, it is returned
        unchanged.
        """
        if not isinstance(value, str):
            return value
        # NOTE(review): only the longest digit run is kept, so a price written
        # with a thousands separator (e.g. '1.299') would be truncated --
        # confirm against the scraped price format.
        matches = re.findall(r'\d+', value)
        return max(matches, key=len) if matches else value

    def clean_reviews(self):
        """Replace 'n_of_reviews' with the first integer found in each cell.

        Cells without any digits (including missing values) become 0.
        """
        counts = self.df['n_of_reviews'].astype(str).str.extract(r'(\d+)', expand=False)
        # Fill with the string '0' so the column stays homogeneous until the
        # final integer conversion.
        self.df['n_of_reviews'] = counts.fillna('0').astype(int)

    def extract_days(self, text):
        """Return the largest number found in the delivery text.

        Returns None for missing values, for texts containing one of the
        unavailability phrases, and for texts without any digits.
        """
        if pd.isnull(text) or any(marker in text for marker in self._NO_DELIVERY_MARKERS):
            return None
        numbers = [int(num) for num in re.findall(r'\d+', text)]
        return max(numbers) if numbers else None

    def generate_profile_report(self):
        """Write a profiling report of ``self.df`` to 'data_profiling_report.html'."""
        profile = ProfileReport(self.df, title='Pandas Profiling Report', explorative=True)
        profile.to_file("data_profiling_report.html")

    def save(self, filepath):
        """Save the current DataFrame to a CSV file at ``filepath``."""
        save_data(self.df, filepath)

if __name__ == "__main__":
    # Stage-01 input path and stage-02 output path for this cleaning run.
    scraped_csv = 'data/stage01_scraped_mediamarkt.csv'
    cleaned_csv = 'data/stage02_cleaned_mediamarkt.csv'

    # Load, clean, profile and persist, in that order.
    cleaner = DataCleaner(scraped_csv)
    cleaner.clean_data()
    cleaner.generate_profile_report()
    cleaner.save(cleaned_csv)
    print("Data cleaning process completed and saved successfully.")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'apple'')
  annotation = ("{:" + self.fmt + "}").format(val)
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Data cleaning process completed and saved successfully.


In [12]:
cleaner

<__main__.DataCleaner at 0x286268f10>