# Data Preprocessing Pipeline using Python

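1. Imputing missing values (numeric columns with the column mean, categorical columns with the most frequent value)

2. Scaling numeric features (standardization with `StandardScaler`)

3. Finding and replacing outliers (values outside 1.5 × IQR are replaced with the column mean; in the code this step runs before scaling)

4. Handling categorical variables (only missing-value imputation is performed; no encoding is applied)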

In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler


def data_preprocessing_pipeline(data):
    """Clean a DataFrame: impute, tame outliers, standardize, fill categoricals.

    Steps, in order:
      1. Numeric columns ('int'/'float' dtypes): fill missing values with the
         column mean.
      2. Numeric columns: replace values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
         with the column mean (computed after step 1, outliers included).
      3. Numeric columns: standardize with ``StandardScaler``.
      4. Categorical columns ('object' dtype): fill missing values with the
         column's most frequent value.

    Args:
        data: pandas DataFrame to preprocess. It is not modified.

    Returns:
        A new DataFrame with the same columns, preprocessed as described.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    # Identify the numeric and categorical features.
    numerical_feature = data.select_dtypes(include=['int', 'float']).columns
    categorical_feature = data.select_dtypes(include=['object']).columns

    # StandardScaler rejects input with zero columns or zero rows, so the
    # numeric stage is skipped entirely when there is nothing to process.
    if not numerical_feature.empty and len(data) > 0:
        # Handle missing values in numeric features.
        data[numerical_feature] = data[numerical_feature].fillna(
            data[numerical_feature].mean()
        )

        # Handle outliers in numeric features using the 1.5 * IQR rule.
        for feature in numerical_feature:
            column = data[feature]
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - (1.5 * iqr)
            upper_bound = q3 + (1.5 * iqr)
            is_outlier = (column < lower_bound) | (column > upper_bound)
            data[feature] = np.where(is_outlier, column.mean(), column)

        # Normalize numeric features. A single fit_transform both fits the
        # scaler and produces the scaled values.
        data[numerical_feature] = StandardScaler().fit_transform(
            data[numerical_feature]
        )

    # Handle missing values in categorical features.
    if not categorical_feature.empty:
        modes = data[categorical_feature].mode()
        # mode() has no rows when every categorical value is missing; in
        # that case there is nothing to impute with.
        if not modes.empty:
            data[categorical_feature] = data[categorical_feature].fillna(
                modes.iloc[0]
            )

    return data

In [2]:
# Load the raw dataset from a CSV file. The path is an absolute,
# machine-specific Windows path, so it must be adjusted to run elsewhere.
data = pd.read_csv(r'C:\Users\ASUS\Downloads\data.csv')

In [3]:
# Preview the first rows of the raw data before preprocessing.
data.head()

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,1.0,7,A
1,2.0,8,B
2,,9,
3,4.0,10,A
4,5.0,11,B


In [4]:
# Run the preprocessing pipeline on the loaded DataFrame and keep the result.
cleaned_data = data_preprocessing_pipeline(data)

In [5]:
# Display the preprocessed DataFrame returned by the pipeline.
print(cleaned_data)

   NumericFeature1  NumericFeature2 CategoricalFeature
0        -1.535624        -1.099370                  A
1        -0.944999        -0.749128                  B
2         0.000000        -0.398886                  A
3         0.236250        -0.048645                  A
4         0.826874         0.301597                  B
5         1.417499         1.994431                  C
