<a href="https://colab.research.google.com/github/Festuskipkoech/Festus_data-science/blob/main/DataPreprocessingMachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(df):
    """
    Comprehensive data preprocessing pipeline

    Parameters:
    df (pandas.DataFrame): Input dataset with mixed feature types

    Returns:
    numpy.ndarray: Preprocessed features
    """
    # Identify feature types
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Create preprocessing steps
    # Numerical features preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
        ('scaler', StandardScaler())  # Scale numerical features
    ])

    # Categorical features preprocessing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit and transform the data
    X_preprocessed = preprocessor.fit_transform(df)

    return X_preprocessed

def main():
    # Example usage with synthetic dataset
    # Create a sample dataset with mixed feature types and missing values
    np.random.seed(42)

    # Numeric features with missing values
    numeric_data = np.random.randn(100, 3)
    numeric_data[np.random.rand(100, 3) < 0.1] = np.nan

    # Categorical features
    categorical_data = np.random.choice(['A', 'B', 'C'], size=(100, 2))

    # Combine into a DataFrame
    df = pd.DataFrame(
        np.hstack([numeric_data, categorical_data]),
        columns=['num1', 'num2', 'num3', 'cat1', 'cat2']
    )

    # Preprocess the data
    X_processed = preprocess_data(df)

    # Print preprocessing results
    print("Original DataFrame Shape:", df.shape)
    print("Preprocessed Data Shape:", X_processed.shape)
    print("\nPreprocessing Complete: Handled missing values, scaled numerics, and encoded categoricals.")

if __name__ == "__main__":
    main()

Original DataFrame Shape: (100, 5)
Preprocessed Data Shape: (100, 278)

Preprocessing Complete: Handled missing values, scaled numerics, and encoded categoricals.
