In [1]:
import pandas as pd 
from sklearn.impute import SimpleImputer


def load_data(file, **kwargs):
    """Read a tabular file into a DataFrame and normalise its column labels.

    The pandas reader is selected from the file extension (case-insensitive):
    ``.csv`` -> ``pd.read_csv``, ``.xlsx`` -> ``pd.read_excel``,
    ``.json`` -> ``pd.read_json``.

    Args:
        file: Path of the file to read (anything ``pathlib.Path`` accepts).
        **kwargs: Forwarded unchanged to the selected pandas reader.

    Returns:
        pandas.DataFrame whose string column labels are lower-cased and have
        spaces replaced by underscores. Non-string labels (for example the
        integer labels produced by ``header=None``) are left untouched.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    from pathlib import Path

    readers = {
        ".csv": pd.read_csv,
        ".xlsx": pd.read_excel,
        ".json": pd.read_json,
    }

    suffix = Path(file).suffix.lower()
    reader = readers.get(suffix)
    if reader is None:
        supported = ", ".join(sorted(readers))
        raise ValueError(
            f"Unsupported file type {suffix!r}; expected one of: {supported}"
        )

    df = reader(file, **kwargs)

    def _normalise(label):
        # The .str accessor fails on non-string labels, so only strings are
        # rewritten and everything else passes through unchanged.
        if isinstance(label, str):
            return label.lower().replace(" ", "_")
        return label

    df.columns = df.columns.map(_normalise)

    return df

def check_data_quality(df):
    """Summarise basic quality indicators of a DataFrame.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        dict with the keys ``missing_values`` (null count per column),
        ``duplicates`` (number of rows flagged by ``df.duplicated()``),
        ``total_rows`` and ``memory_usage`` (sum of ``df.memory_usage()``
        expressed in MB).
    """
    bytes_per_mb = 1024**2

    nulls_per_column = df.isnull().sum()
    duplicate_rows = df.duplicated().sum()
    footprint_bytes = df.memory_usage().sum()

    return {
        'missing_values': nulls_per_column.to_dict(),
        'duplicates': duplicate_rows,
        'total_rows': len(df),
        'memory_usage': footprint_bytes / bytes_per_mb,  # in MB
    }

def standardize_datatypes(df):
    """Convert object-dtype columns to numeric or datetime where possible.

    Each object column is first offered to ``pd.to_numeric`` and, if that
    fails, to ``pd.to_datetime``. Columns that fit neither are left as they
    are. A message is printed for every converted column.

    Args:
        df: The pandas DataFrame to convert. Columns are replaced on this
            object; no copy is made.

    Returns:
        The same DataFrame, with convertible object columns converted.
    """
    import warnings

    for column in df.columns:
        if df[column].dtype != 'object':
            continue

        # Numeric is attempted before datetime: pd.to_datetime treats plain
        # numbers as offsets from the epoch, so trying it first would turn
        # numeric data stored in an object column into timestamps.
        try:
            df[column] = pd.to_numeric(df[column])
            print(f"Converted {column} to numeric")
            continue
        except (ValueError, TypeError):
            pass

        try:
            with warnings.catch_warnings():
                # Free-text columns make pandas warn that it cannot infer a
                # date format; the failure is handled below, so the warning
                # is only noise here.
                warnings.simplefilter("ignore", UserWarning)
                df[column] = pd.to_datetime(df[column])
            print(f"Converted {column} to datetime")
        except (ValueError, TypeError, OverflowError):
            # Neither numeric nor datetime: keep the column unchanged.
            pass
    return df

from sklearn.impute import SimpleImputer

def handle_missing_values(df):
    """Fill missing values column by column.

    Numeric columns are filled with the column median; object columns are
    filled with the most frequent value (the first value returned by
    ``Series.mode`` when several are tied). Columns without missing values
    are not touched, so integer columns keep their dtype. Columns that
    contain only missing values have nothing to impute from and are left
    unchanged.

    Args:
        df: The pandas DataFrame to fill. Columns are replaced on this
            object; no copy is made.

    Returns:
        The same DataFrame with missing values filled where possible.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    for column in numeric_columns:
        values = df[column]
        if not values.isna().any():
            continue
        median = values.median()
        if pd.isna(median):
            # Every entry is missing: there is no median to fill with.
            continue
        try:
            df[column] = values.fillna(median)
        except TypeError:
            # Nullable integer columns reject a fractional fill value;
            # fall back to float so the median can be stored.
            df[column] = values.astype("float64").fillna(median)

    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        values = df[column]
        if not values.isna().any():
            continue
        modes = values.mode(dropna=True)
        if modes.empty:
            # Every entry is missing: there is no most-frequent value.
            continue
        df[column] = values.fillna(modes.iloc[0])

    return df

def remove_outliers(df, iqr_multiplier=1.5):
    """Clip numeric outliers to the IQR fences and count how many were clipped.

    Despite the name, no rows are dropped: for every numeric column, values
    below ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``
    are replaced by the nearest fence.

    Args:
        df: The pandas DataFrame to process. Columns are replaced on this
            object; no copy is made.
        iqr_multiplier: Width of the fences in units of the inter-quartile
            range. Defaults to 1.5.

    Returns:
        tuple ``(df, outliers_removed)`` where ``outliers_removed`` maps each
        column that had at least one out-of-range value to the number of
        values that were clipped.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    outliers_removed = {}

    for column in numeric_columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - margin
        upper_bound = q3 + margin

        # Count straight from the boolean mask instead of materialising a
        # filtered copy of the whole frame.
        out_of_range = (values < lower_bound) | (values > upper_bound)
        outliers = int(out_of_range.sum())
        if outliers == 0:
            # Nothing to clip, so the column is left exactly as it is.
            continue

        df[column] = values.clip(lower=lower_bound, upper=upper_bound)
        outliers_removed[column] = outliers

    return df, outliers_removed

def validate_cleaning(df, original_shape, cleaning_report):
    """Attach post-cleaning validation figures to the cleaning report.

    Args:
        df: The cleaned pandas DataFrame.
        original_shape: Shape tuple of the frame before cleaning; only the
            row count (index 0) is used.
        cleaning_report: dict that receives the results under the
            ``'validation'`` key. It is modified in place.

    Returns:
        The same ``cleaning_report`` dict, with ``'validation'`` holding
        ``rows_remaining``, ``missing_values_remaining``,
        ``duplicates_remaining`` and ``data_loss_percentage``.
    """
    rows_remaining = len(df)
    original_rows = original_shape[0]

    if original_rows:
        data_loss_percentage = (1 - rows_remaining / original_rows) * 100
    else:
        # An empty input frame cannot lose rows; avoid dividing by zero.
        data_loss_percentage = 0.0

    validation_results = {
        'rows_remaining': rows_remaining,
        'missing_values_remaining': df.isnull().sum().sum(),
        'duplicates_remaining': df.duplicated().sum(),
        'data_loss_percentage': data_loss_percentage
    }

    cleaning_report['validation'] = validation_results
    return cleaning_report


def automated_cleaning_pipeline(df):
    """Run the cleaning steps in order and collect a report.

    The steps are: initial quality check, datatype standardisation,
    missing-value handling, outlier handling, and final validation.

    Args:
        df: The pandas DataFrame to clean.

    Returns:
        tuple ``(df, cleaning_report)`` where ``cleaning_report`` is a dict
        with the keys ``'initial_quality'``, ``'outliers_removed'`` and
        ``'validation'``.
    """
    # Captured before any step runs so validation can compare against it.
    shape_before = df.shape

    report = {'initial_quality': check_data_quality(df)}

    typed = standardize_datatypes(df)
    filled = handle_missing_values(typed)
    cleaned, outlier_counts = remove_outliers(filled)
    report['outliers_removed'] = outlier_counts

    # Validate and finalize report
    report = validate_cleaning(cleaned, shape_before, report)

    return cleaned, report

In [3]:
# Read the CSV through load_data. The path is relative, so the file must be
# present in the kernel's current working directory.
df = load_data("demand_forecasting.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'demand_forecasting.csv'