# Data Preprocessing Techniques

This notebook demonstrates various data preprocessing techniques commonly used in machine learning and data analysis using the California Housing dataset.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
import category_encoders as ce


# Set display options
# pd.set_option('display.max_columns', None)

# Set random seed for reproducibility
np.random.seed(42)

# Load the housing dataset
d = pd.read_csv('housing_sample.csv')
print('Housing dataset shape:', d.shape)
print('\nFirst few rows:')
d.head(20)

Housing dataset shape: (1000, 10)

First few rows:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,
1,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,
2,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,
3,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0,NEAR OCEAN
5,-117.61,34.08,12.0,4427.0,,2400.0,843.0,4.7147,158700.0,
6,-118.02,33.89,36.0,1375.0,,670.0,221.0,5.0839,198200.0,<1H OCEAN
7,-118.08,33.92,38.0,1335.0,,1011.0,269.0,3.6908,157500.0,<1H OCEAN
8,-122.08,37.39,4.0,2292.0,,1050.0,584.0,4.8036,340000.0,NEAR BAY
9,-118.23,34.18,45.0,2332.0,,943.0,339.0,8.1132,446600.0,<1H OCEAN


In [8]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           1000 non-null   float64
 1   latitude            1000 non-null   float64
 2   housing_median_age  1000 non-null   float64
 3   total_rooms         1000 non-null   float64
 4   total_bedrooms      626 non-null    float64
 5   population          1000 non-null   float64
 6   households          1000 non-null   float64
 7   median_income       900 non-null    float64
 8   median_house_value  1000 non-null   float64
 9   ocean_proximity     991 non-null    object 
dtypes: float64(9), object(1)
memory usage: 78.3+ KB


In [9]:
# null values
d.isnull().count()

longitude             1000
latitude              1000
housing_median_age    1000
total_rooms           1000
total_bedrooms        1000
population            1000
households            1000
median_income         1000
median_house_value    1000
ocean_proximity       1000
dtype: int64

## 1. Data Imputation

Data imputation is the process of replacing missing values in a dataset.

##### Mean Imputation
Mean imputation replaces missing values with the mean of the non-missing values in the same column.

- Mostly used for numerical data.

In [3]:
# Mean imputation for numerical columns
imputer_mean = SimpleImputer(strategy='mean')
d[['total_bedrooms','median_income']] = imputer_mean.fit_transform(d[['total_bedrooms','median_income']])

print('Dataset after mean imputation:')
d.head()

Dataset after mean imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,551.0,1392.0,359.0,1.6812,47700.0,
1,-119.46,35.14,30.0,2943.0,551.0,1565.0,584.0,2.5313,45800.0,
2,-122.44,37.8,52.0,3830.0,551.0,1310.0,963.0,3.4801,500001.0,
3,-118.72,34.28,17.0,3051.0,551.0,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,551.0,1063.0,428.0,3.725,278000.0,NEAR OCEAN


In [11]:
# Mean imputation for numerical columns
imputer_mean = SimpleImputer(strategy='mean')
data_mean_imputed = d.copy()
data_mean_imputed[['total_bedrooms','median_income']] = imputer_mean.fit_transform(d[['total_bedrooms','median_income']])

print('Dataset after mean imputation:')
data_mean_imputed.head()

Dataset after mean imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,551.0,1392.0,359.0,1.6812,47700.0,
1,-119.46,35.14,30.0,2943.0,551.0,1565.0,584.0,2.5313,45800.0,
2,-122.44,37.8,52.0,3830.0,551.0,1310.0,963.0,3.4801,500001.0,
3,-118.72,34.28,17.0,3051.0,551.0,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,551.0,1063.0,428.0,3.725,278000.0,NEAR OCEAN


##### Median Imputation
Median imputation replaces missing values with the median of the non-missing values in the same column.

- Mostly used for numerical data.

In [12]:
# Median imputation
imputer_median = SimpleImputer(strategy='median')
data_median_imputed = d.copy()
data_median_imputed[['total_bedrooms','median_income']] = imputer_median.fit_transform(d[['total_bedrooms','median_income']])

print('\nDataset after median imputation:')
data_median_imputed.head()


Dataset after median imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,446.5,1392.0,359.0,1.6812,47700.0,
1,-119.46,35.14,30.0,2943.0,446.5,1565.0,584.0,2.5313,45800.0,
2,-122.44,37.8,52.0,3830.0,446.5,1310.0,963.0,3.4801,500001.0,
3,-118.72,34.28,17.0,3051.0,446.5,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,446.5,1063.0,428.0,3.725,278000.0,NEAR OCEAN


##### Mode Imputation
Mode imputation replaces missing values with the mode (most frequent value) of the non-missing values in the same column.

- It is useful for categorical data.

In [16]:
imputer_mode = SimpleImputer(strategy='most_frequent')
data_mode_imputed = d.copy()
data_mode_imputed[['ocean_proximity']] = imputer_mode.fit_transform(d[['ocean_proximity']])

print('\nDataset after mode imputation:')
data_mode_imputed.head(20)


Dataset after mode imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,<1H OCEAN
1,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,<1H OCEAN
2,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,<1H OCEAN
3,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0,NEAR OCEAN
5,-117.61,34.08,12.0,4427.0,,2400.0,843.0,4.7147,158700.0,<1H OCEAN
6,-118.02,33.89,36.0,1375.0,,670.0,221.0,5.0839,198200.0,<1H OCEAN
7,-118.08,33.92,38.0,1335.0,,1011.0,269.0,3.6908,157500.0,<1H OCEAN
8,-122.08,37.39,4.0,2292.0,,1050.0,584.0,4.8036,340000.0,NEAR BAY
9,-118.23,34.18,45.0,2332.0,,943.0,339.0,8.1132,446600.0,<1H OCEAN


##### Forward Fill Imputation
Forward fill imputation replaces missing values with the last observed value in the same column.


In [25]:
# Forward fill imputation
data_ffill = d.copy()
data_ffill[['total_bedrooms','median_income','ocean_proximity']] = data_ffill[['total_bedrooms','median_income','ocean_proximity']].ffill()

print('\nDataset after forward fill imputation:')   

data_ffill.tail(20)
# print(data.tail(20))


Dataset after forward fill imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
980,-121.37,38.69,35.0,1093.0,192.0,590.0,190.0,2.7009,80200.0,INLAND
981,-122.42,37.75,52.0,2164.0,533.0,1122.0,469.0,3.2632,306000.0,NEAR BAY
982,-118.29,34.11,48.0,1448.0,295.0,681.0,287.0,3.2632,436400.0,<1H OCEAN
983,-117.78,33.69,16.0,4702.0,806.0,2529.0,814.0,5.1299,238900.0,<1H OCEAN
984,-117.83,33.88,18.0,2112.0,340.0,1048.0,315.0,5.1299,231700.0,<1H OCEAN
985,-118.19,34.05,47.0,1273.0,340.0,1193.0,260.0,2.4375,122900.0,<1H OCEAN
986,-118.11,33.82,37.0,1756.0,340.0,836.0,335.0,2.4375,218200.0,<1H OCEAN
987,-122.28,37.9,52.0,2003.0,250.0,658.0,244.0,10.0825,397000.0,NEAR BAY
988,-122.83,38.39,19.0,1765.0,394.0,868.0,388.0,2.462,260300.0,<1H OCEAN
989,-122.54,37.95,38.0,2310.0,394.0,971.0,386.0,5.697,435700.0,NEAR BAY


##### Backward Fill Imputation
Backward fill imputation replaces missing values with the next observed value in the same column.

In [26]:
# Backward fill imputation
data_bfill = d.copy()
data_bfill[['total_bedrooms','median_income','ocean_proximity']] = data_bfill[['total_bedrooms','median_income','ocean_proximity']].bfill()
# print('\nDataset before backward fill imputation:')

print('\nDataset after backward fill imputation:')
data_bfill.head(20)


Dataset after backward fill imputation:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,30.0,1392.0,359.0,1.6812,47700.0,<1H OCEAN
1,-119.46,35.14,30.0,2943.0,30.0,1565.0,584.0,2.5313,45800.0,<1H OCEAN
2,-122.44,37.8,52.0,3830.0,30.0,1310.0,963.0,3.4801,500001.0,<1H OCEAN
3,-118.72,34.28,17.0,3051.0,30.0,1705.0,495.0,5.7376,218600.0,<1H OCEAN
4,-121.93,36.62,34.0,2351.0,30.0,1063.0,428.0,3.725,278000.0,NEAR OCEAN
5,-117.61,34.08,12.0,4427.0,30.0,2400.0,843.0,4.7147,158700.0,<1H OCEAN
6,-118.02,33.89,36.0,1375.0,30.0,670.0,221.0,5.0839,198200.0,<1H OCEAN
7,-118.08,33.92,38.0,1335.0,30.0,1011.0,269.0,3.6908,157500.0,<1H OCEAN
8,-122.08,37.39,4.0,2292.0,30.0,1050.0,584.0,4.8036,340000.0,NEAR BAY
9,-118.23,34.18,45.0,2332.0,30.0,943.0,339.0,8.1132,446600.0,<1H OCEAN


## 2. Feature Scaling

Feature scaling is the process of normalizing or standardizing the range of independent variables or features of data. It is important for algorithms that compute distances between data points, such as KNN and SVM.

- **Standardization**: Rescales the feature to have a mean of 0 and a standard deviation of 1.
- **Normalization**: Rescales the feature to a range of [0, 1] or [-1, 1].
- **Min-Max Scaling**: Rescales the feature to a range of [0, 1] using the formula: (X - min(X)) / (max(X) - min(X)).
- **MaxAbs Scaling**: Rescales the feature to a range of [-1, 1] using the formula: X / max(abs(X)).

In [13]:
# Select numerical features for scaling
numerical_features = ['median_income', 'total_rooms', 'housing_median_age']
data_scaling = d[numerical_features].copy()

print('Original numerical features:')
data_scaling.head()


Original numerical features:


Unnamed: 0,median_income,total_rooms,housing_median_age
0,1.6812,1505.0,25.0
1,2.5313,2943.0,30.0
2,3.4801,3830.0,52.0
3,5.7376,3051.0,17.0
4,3.725,2351.0,34.0


In [14]:
data_scaling.shape


(1000, 3)

In [12]:
data_reshaped = data_scaling.reshape(data_scaling.shape[0], -1)
data_reshaped.head()

AttributeError: 'tuple' object has no attribute 'reshape'

### 2.1 StandardScaler and MaxAbsScaler

**StandardScaler**: Standardizes features by removing the mean and scaling to unit variance.
- Formula: X_scaled = (X - mean(X)) / std(X)




In [10]:
# Standardization
std_scaler = StandardScaler()
data_standardized = pd.DataFrame(
    std_scaler.fit_transform(data_scaling),
    columns = data_scaling.columns
)
    

print('Standardized dataset:')
data_standardized.head()

ValueError: Found array with dim 3. StandardScaler expected <= 2.

**MaxAbsScaler**: Scales each feature by its maximum absolute value, preserving the sparsity of the data.
- Formula: X_scaled = X / max(abs(X))

In [11]:
# Max Absolute scaling
max_abs_scaler = MaxAbsScaler()
data_maxabs = pd.DataFrame(
    max_abs_scaler.fit_transform(data_scaling),
    columns = data_scaling.columns
)

print('\nMax Absolute scaled dataset:')
data_maxabs.head()

ValueError: Found array with dim 3. MaxAbsScaler expected <= 2.

### 2.2 MinMaxScaler (Normalization)

**MinMaxScaler**: Scales features to a given range, usually [0, 1]. It is sensitive to outliers.
- Formula: X_scaled = (X - min(X)) / (max(X) - min(X))

In [30]:
# Min-Max scaling
min_max_scaler = MinMaxScaler()
data_normalized = pd.DataFrame(
    min_max_scaler.fit_transform(data_scaling),
    columns = data_scaling.columns
)

print('Normalized dataset:')
data_normalized.head()

Normalized dataset:


Unnamed: 0,median_income,total_rooms,housing_median_age
0,0.069714,0.067965,0.46
1,0.129091,0.13369,0.56
2,0.195362,0.174231,1.0
3,0.353042,0.138626,0.3
4,0.212468,0.106632,0.64


## 3. Categorical Encoding

Converting categorical variables into numerical format using the housing dataset's 'ocean_proximity' feature.

In [31]:
# Prepare categorical data
print('Original categorical variable distribution:')
print(d['ocean_proximity'].value_counts())
data_cat = d[['ocean_proximity']].copy()
print('\nSample of original data:')
data_cat.head()

Original categorical variable distribution:
ocean_proximity
<1H OCEAN     447
INLAND        299
NEAR OCEAN    144
NEAR BAY      101
Name: count, dtype: int64

Sample of original data:


Unnamed: 0,ocean_proximity
0,
1,
2,
3,<1H OCEAN
4,NEAR OCEAN


### 3.1 Ordinal Encoding

Ordinal encoding assigns a unique integer to each category in the feature. It is suitable for ordinal categorical variables where the order matters.
- Example: ['low', 'medium', 'high'] -> [0, 1, 2]
- It is not suitable for nominal categorical variables where the order does not matter.
- Example: ['red', 'green', 'blue'] -> [0, 1, 2] (not meaningful)

In [35]:


# Initialize the encoder
oe = OrdinalEncoder()

# Select the categorical columns (assuming data_cat contains only categorical columns)
# Fit and transform the data (sklearn requires 2D array)
data_ordinal = d.copy()


print('Ordinal encoded dataset:')
print(data_ordinal.head())

print('\nOrdinal mapping:')
# The categories_ attribute contains the mapping for each column
for i, column in enumerate(data_cat.columns):
    print(f"{column}: {list(enumerate(oe.categories[i]))}")

Ordinal encoded dataset:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -119.01     36.06                25.0       1505.0             NaN   
1    -119.46     35.14                30.0       2943.0             NaN   
2    -122.44     37.80                52.0       3830.0             NaN   
3    -118.72     34.28                17.0       3051.0             NaN   
4    -121.93     36.62                34.0       2351.0             NaN   

   population  households  median_income  median_house_value ocean_proximity  
0      1392.0       359.0         1.6812             47700.0             NaN  
1      1565.0       584.0         2.5313             45800.0             NaN  
2      1310.0       963.0         3.4801            500001.0             NaN  
3      1705.0       495.0         5.7376            218600.0       <1H OCEAN  
4      1063.0       428.0         3.7250            278000.0      NEAR OCEAN  

Ordinal mapping:
ocean_proximity: [(0, 'a')]


We can also give a custom order to the categories using the `categories` parameter in `OrdinalEncoder`.

In [36]:
# Binary encoding
be = ce.BinaryEncoder()
data_binary = be.fit_transform(data_cat)
print('Binary encoded dataset:')
print(data_binary.head(20))

Binary encoded dataset:
    ocean_proximity_0  ocean_proximity_1  ocean_proximity_2
0                   1                  0                  1
1                   1                  0                  1
2                   1                  0                  1
3                   0                  0                  1
4                   0                  1                  0
5                   1                  0                  1
6                   0                  0                  1
7                   0                  0                  1
8                   0                  1                  1
9                   0                  0                  1
10                  1                  0                  0
11                  0                  0                  1
12                  0                  0                  1
13                  0                  1                  0
14                  0                  0                  1
15              

### 3.3 One-Hot Encoding

**One-hot encoding** creates binary columns for each category in the feature. It is suitable for nominal categorical variables where the order does not matter.
- Example: ['red', 'green', 'blue'] -> [1, 0, 0], [0, 1, 0], [0, 0, 1]
- It is not suitable for ordinal categorical variables where the order matters.
- Example: ['low', 'medium', 'high'] -> [1, 0, 0], [0, 1, 0], [0, 0, 1] (not meaningful)
- It can lead to a high number of features if the categorical variable has many unique values, which can increase the dimensionality of the dataset and lead to the curse of dimensionality.

In [48]:
# One-hot encoding

data_onehot = pd.get_dummies(data_cat)
print('One-hot encoded dataset:')
data_onehot.head()




One-hot encoded dataset:


Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,True,False,False,False
4,False,False,False,True


In [37]:
# One-hot encoding
oh = ce.OneHotEncoder()
data_onehot = oh.fit_transform(data_cat)
print('One-hot encoded dataset:')
data_onehot.head()




One-hot encoded dataset:


Unnamed: 0,ocean_proximity_1,ocean_proximity_2,ocean_proximity_3,ocean_proximity_4,ocean_proximity_5
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,1,0,0,0,0
4,0,1,0,0,0


In [39]:

encoder = OneHotEncoder()

encoded = encoder.fit_transform(data_cat)
print(encoded)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1000 stored elements and shape (1000, 5)>
  Coords	Values
  (0, 4)	1.0
  (1, 4)	1.0
  (2, 4)	1.0
  (3, 0)	1.0
  (4, 3)	1.0
  (5, 4)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 2)	1.0
  (9, 0)	1.0
  (10, 1)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 3)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 2)	1.0
  (17, 1)	1.0
  (18, 3)	1.0
  (19, 1)	1.0
  (20, 2)	1.0
  (21, 0)	1.0
  (22, 1)	1.0
  (23, 3)	1.0
  (24, 0)	1.0
  :	:
  (975, 0)	1.0
  (976, 0)	1.0
  (977, 3)	1.0
  (978, 1)	1.0
  (979, 3)	1.0
  (980, 1)	1.0
  (981, 2)	1.0
  (982, 0)	1.0
  (983, 0)	1.0
  (984, 0)	1.0
  (985, 0)	1.0
  (986, 0)	1.0
  (987, 2)	1.0
  (988, 0)	1.0
  (989, 2)	1.0
  (990, 0)	1.0
  (991, 3)	1.0
  (992, 1)	1.0
  (993, 1)	1.0
  (994, 1)	1.0
  (995, 4)	1.0
  (996, 4)	1.0
  (997, 4)	1.0
  (998, 4)	1.0
  (999, 4)	1.0


### 3.4 Label Encoding

**Label encoding** does not care about the order of the categories and assigns a unique integer to each category. It is suitable for nominal categorical variables where the order does not matter.
- Example: ['red', 'green', 'blue'] -> [0, 1, 2]
- It is not suitable for ordinal categorical variables where the order matters.


In [50]:
# Label encoding
le = LabelEncoder()
data = d.copy()

data['ocean_proximity'] = le.fit_transform(data['ocean_proximity'])
print('Label encoded dataset:')
data.head()

Label encoded dataset:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,4
1,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,4
2,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,4
3,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,0
4,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0,3
