Data Preprocessing Techniques in ML Ops

Objective: To understand and apply different data preprocessing techniques, including data cleaning, transformation, reduction, compression, and normalization, to prepare a dataset for machine learning model training.

In [None]:
import pandas as pd

# Build a small in-memory employee table. Age, Salary and Date of Joining
# each contain one None so the cleaning cell has gaps to fill.
employee_columns = {
    'ID': [1, 2, 3, 4, 5, 6, 7],
    'Name': [
        'John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Brown',
        'Charlie Black', 'David White', 'Emma Green',
    ],
    'Age': [28, 35, 45, None, 32, 29, 26],
    'Department': ['HR', 'IT', 'HR', 'IT', 'Finance', 'Marketing', 'HR'],
    'Salary': [50000, 70000, 60000, None, 65000, 70000, 50000],
    'Date of Joining': [
        '2022-01-15', '2021-06-20', '2020-12-10', '2023-03-01',
        None, '2022-09-15', '2024-01-10',
    ],
}
data = pd.DataFrame(employee_columns)
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-15
1,2,Jane Smith,35.0,IT,70000.0,2021-06-20
2,3,Bob Johnson,45.0,HR,60000.0,2020-12-10
3,4,Alice Brown,,IT,,2023-03-01
4,5,Charlie Black,32.0,Finance,65000.0,
5,6,David White,29.0,Marketing,70000.0,2022-09-15
6,7,Emma Green,26.0,HR,50000.0,2024-01-10


In [None]:
# Impute the gaps column by column:
#   Age             -> column median
#   Salary          -> column mean, with the whole column rounded to 2 decimals
#   Date of Joining -> the placeholder string 'Unknown'
age_fill = data['Age'].median()
salary_fill = data['Salary'].mean()
data = data.assign(**{
    'Age': data['Age'].fillna(age_fill),
    'Salary': data['Salary'].fillna(salary_fill).round(2),
    'Date of Joining': data['Date of Joining'].fillna('Unknown'),
})

# Remove rows that exactly repeat an earlier row
data = data.drop_duplicates()

data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-15
1,2,Jane Smith,35.0,IT,70000.0,2021-06-20
2,3,Bob Johnson,45.0,HR,60000.0,2020-12-10
3,4,Alice Brown,30.5,IT,60833.33,2023-03-01
4,5,Charlie Black,32.0,Finance,65000.0,Unknown
5,6,David White,29.0,Marketing,70000.0,2022-09-15
6,7,Emma Green,26.0,HR,50000.0,2024-01-10


In [None]:
# Label-encode 'Department': every distinct department name is replaced
# by its integer category code
department_codes = pd.Categorical(data['Department']).codes
data['Department'] = department_codes

# Parse 'Date of Joining' into datetimes; errors='coerce' turns anything
# unparseable (e.g. the 'Unknown' placeholder) into NaT
joined_on = pd.to_datetime(data['Date of Joining'], errors='coerce')
data['Date of Joining'] = joined_on

# 'Salary' is left unscaled in this cell; min-max scaling of Age and Salary
# is done in the MinMaxScaler cell further down.
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,1,50000.0,2022-01-15
1,2,Jane Smith,35.0,2,70000.0,2021-06-20
2,3,Bob Johnson,45.0,1,60000.0,2020-12-10
3,4,Alice Brown,30.5,2,60833.33,2023-03-01
4,5,Charlie Black,32.0,0,65000.0,NaT
5,6,David White,29.0,3,70000.0,2022-09-15
6,7,Emma Green,26.0,1,50000.0,2024-01-10


In [None]:
# Feature reduction: 'Name' is not useful for model training, so leave it out
data_reduced = data.drop('Name', axis='columns')
data_reduced

Unnamed: 0,ID,Age,Department,Salary,Date of Joining
0,1,28.0,1,50000.0,2022-01-15
1,2,35.0,2,70000.0,2021-06-20
2,3,45.0,1,60000.0,2020-12-10
3,4,30.5,2,60833.33,2023-03-01
4,5,32.0,0,65000.0,NaT
5,6,29.0,3,70000.0,2022-09-15
6,7,26.0,1,50000.0,2024-01-10


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale with min-max normalization
numeric_columns = ['Age', 'Salary']

# Initialize the scaler
scaler = MinMaxScaler()

# Work on a copy so data_reduced keeps its original (unscaled) values
data_normalized = data_reduced.copy()
scaled_values = scaler.fit_transform(data_normalized[numeric_columns])
data_normalized[numeric_columns] = scaled_values
data_normalized


Unnamed: 0,ID,Age,Department,Salary,Date of Joining
0,1,0.105263,1,0.0,2022-01-15
1,2,0.473684,2,1.0,2021-06-20
2,3,1.0,1,0.5,2020-12-10
3,4,0.236842,2,0.541667,2023-03-01
4,5,0.315789,0,0.75,NaT
5,6,0.157895,3,1.0,2022-09-15
6,7,0.0,1,0.0,2024-01-10


In [None]:
import zlib

# Serialise the reduced frame to CSV text (row index omitted)
csv_data = data_reduced.to_csv(index=False)

# Encode the text to bytes and deflate it with zlib
csv_bytes = csv_data.encode()
compressed_data = zlib.compress(csv_bytes)

(csv_data, compressed_data)

('ID,Age,Department,Salary,Date of Joining\n1,28.0,1,50000.0,2022-01-15\n2,35.0,2,70000.0,2021-06-20\n3,45.0,1,60000.0,2020-12-10\n4,30.5,2,60833.33,2023-03-01\n5,32.0,0,65000.0,\n6,29.0,3,70000.0,2022-09-15\n7,26.0,1,50000.0,2024-01-10\n',
 b'x\x9c]\x8eM\x0e\x82@\x0c\x85\xf7=\x05\x07xC\xfa\xc3\x0c\xb04a\xa3[O0\x8b\x91\x90(\x18\xc3\xc6\xdb[p\x81\xb1i\xd2\x97\xbe\xbe\xf4;\x0f8\x8d\x05Cy\xe6\xd7\xfa(\xf3\x8ak\xbe\xe7\xd7\x1bC^K\xb5\xdc\xaa\xcb2\xcd\xd3<\x92@\xbb\x9a!\x88\xec\xe5JY5\xb0\x04\x89\xa4\xb0\xb8m\xd0\x1e\x9e\x04NA\x99\x0cM\xdcs\xe9\xf08\x88\x06aj`\\G\xcf%\xee\xccj\xb3\xcd\xb4\xc0\xdeB\x11\xa6~\xceH\xf1\x1b\xa4\x04\xed}\xda\xef\x1bG\xe87\x84\x16\x9a\xfe\xf1\x9a\x1d\x8f\xe9\x03,`0|')

In [None]:
# Print a summary of each preprocessing stage.
#
# The labels name the stage each frame is actually in at this point:
#   data            - cleaned, then label-encoded / date-parsed in place
#   data_reduced    - `data` with the 'Name' column dropped
#   data_normalized - `data_reduced` with Age and Salary min-max scaled
print("Cleaned and Transformed Data:")
print(data)

print("\nReduced Data:")
print(data_reduced)

# csv_data is a str, so len(csv_data) counts characters. Encode it first so
# the figure is a real byte count, comparable with len(compressed_data).
print("\n Before Compression Data Size (bytes):")
print(len(csv_data.encode()))

print("\nCompressed Data Size (bytes):")
print(len(compressed_data))

print("\nNormalized Data:")
print(data_normalized)



Cleaned Data:
   ID           Name   Age  Department    Salary Date of Joining
0   1       John Doe  28.0           1  50000.00      2022-01-15
1   2     Jane Smith  35.0           2  70000.00      2021-06-20
2   3    Bob Johnson  45.0           1  60000.00      2020-12-10
3   4    Alice Brown  30.5           2  60833.33      2023-03-01
4   5  Charlie Black  32.0           0  65000.00             NaT
5   6    David White  29.0           3  70000.00      2022-09-15
6   7     Emma Green  26.0           1  50000.00      2024-01-10

Transformed Data:
   ID   Age  Department    Salary Date of Joining
0   1  28.0           1  50000.00      2022-01-15
1   2  35.0           2  70000.00      2021-06-20
2   3  45.0           1  60000.00      2020-12-10
3   4  30.5           2  60833.33      2023-03-01
4   5  32.0           0  65000.00             NaT
5   6  29.0           3  70000.00      2022-09-15
6   7  26.0           1  50000.00      2024-01-10

 Before Compression Data Size (bytes):
228

Co

## Applying the Same Steps to the Iris Dataset

In [None]:
import pandas as pd

# Load the Iris dataset from the UCI repository URL. The file is read with
# header=None, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, header=None, names=columns)

# Initial data overview
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())



   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean

Data Cleaning

In [None]:
# Report missing values per column and the number of fully duplicated rows
print("Missing values:\n", df.isnull().sum())
print("Duplicate rows:", int(df.duplicated().sum()))

# Drop the duplicates and rebuild a contiguous 0..n-1 index. Without the
# reset, the dropped rows leave gaps in the index, and any later label-aligned
# assignment from `df` into a freshly indexed frame would produce NaN or
# shifted values.
df = df.drop_duplicates().reset_index(drop=True)


Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


Data Transformation

In [None]:
# Replace each species name with its integer category code
species_codes = df['species'].astype('category').cat.codes
df['species'] = species_codes

Dimensionality Reduction (PCA)

In [None]:
from sklearn.decomposition import PCA

# Apply PCA (2 components) to every column except the last one ('species')
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df.iloc[:, :-1])

# Create DataFrame for reduced features (this frame gets a fresh 0..n-1 index)
reduced_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Attach the labels by POSITION, not by index label. `df` may have gaps in
# its index after duplicates were dropped; assigning the Series directly would
# align on labels and yield NaN / shifted species values (and a float column).
reduced_df['species'] = df['species'].to_numpy()

print(" Reduced Data:")
print(reduced_df.head())



 Reduced Data:
        PC1       PC2  species
0 -2.710782  0.322125      0.0
1 -2.741763 -0.175061      0.0
2 -2.916691 -0.141509      0.0
3 -2.773363 -0.315205      0.0
4 -2.755418  0.330133      0.0


Data Compression

In [None]:
import zlib
import pickle

# Round trip: pickle the frame, deflate the bytes, then inflate and unpickle
pickled_df = pickle.dumps(df)
compressed_data = zlib.compress(pickled_df)
restored_bytes = zlib.decompress(compressed_data)
decompressed_data = pickle.loads(restored_bytes)

# Report the compressed size and confirm the restored frame equals the original
compressed_size = len(compressed_data)
round_trip_ok = df.equals(decompressed_data)
print("Compressed Data Size (bytes):", compressed_size)
print("Data Integrity Check:", round_trip_ok)

Compressed Data Size (bytes): 1710
Data Integrity Check: True


Data Normalization

In [None]:
# Min-max scale every column except the last one ('species'), then wrap the
# resulting array back into a labelled DataFrame
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df.iloc[:, :-1])
scaled_df = pd.DataFrame(data=scaled_values, columns=feature_names)

scaled_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
142,0.666667,0.416667,0.711864,0.916667
143,0.555556,0.208333,0.677966,0.750000
144,0.611111,0.416667,0.711864,0.791667
145,0.527778,0.583333,0.745763,0.916667


Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features are the scaled measurements; the target is the last column of df.
# NOTE(review): scaled_df was produced by fitting the scaler on all rows
# before this split, so the test rows influenced the scaling — consider
# splitting first and fitting the scaler on the training rows only.
features = scaled_df
target = df.iloc[:, -1]

# Hold out 30% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic regression model with default settings and score it on the
# held-out rows
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.98
