## Data Preprocessing Techniques in ML Ops


Objective: To understand and apply different data preprocessing techniques, including data cleaning, transformation, reduction, compression, and normalization, in order to prepare a dataset for machine learning model training.


In [None]:
import pandas as pd

# Build a small employee table by hand. Age, Salary and Date of Joining each
# contain one missing entry (None) so the cleaning steps have something to fix.
employee_columns = {
    'ID': list(range(1, 8)),
    'Name': [
        'John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Brown',
        'Charlie Black', 'David White', 'Emma Green',
    ],
    'Age': [28, 35, 45, None, 32, 29, 26],
    'Department': ['HR', 'IT', 'HR', 'IT', 'Finance', 'Marketing', 'HR'],
    'Salary': [50000, 70000, 60000, None, 65000, 70000, 50000],
    'Date of Joining': [
        '2022-01-15', '2021-06-20', '2020-12-10', '2023-03-01',
        None, '2022-09-15', '2024-01-10',
    ],
}
data = pd.DataFrame(employee_columns)


In [None]:
# Display the raw table as built above, including its missing entries
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-15
1,2,Jane Smith,35.0,IT,70000.0,2021-06-20
2,3,Bob Johnson,45.0,HR,60000.0,2020-12-10
3,4,Alice Brown,,IT,,2023-03-01
4,5,Charlie Black,32.0,Finance,65000.0,
5,6,David White,29.0,Marketing,70000.0,2022-09-15
6,7,Emma Green,26.0,HR,50000.0,2024-01-10


In [None]:
# Impute the numeric gaps: median for Age, mean for Salary
age_fill = data['Age'].median()
salary_fill = data['Salary'].mean()
data['Age'] = data['Age'].fillna(age_fill)
# The whole Salary column is rounded to 2 decimal places after imputation
data['Salary'] = data['Salary'].fillna(salary_fill).round(2)

# Mark a missing join date with an explicit placeholder string
data['Date of Joining'] = data['Date of Joining'].fillna('Unknown')

# Remove rows that are exact duplicates of an earlier row
data = data.drop_duplicates()


In [None]:
# Display the table after imputation and de-duplication
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-15
1,2,Jane Smith,35.0,IT,70000.0,2021-06-20
2,3,Bob Johnson,45.0,HR,60000.0,2020-12-10
3,4,Alice Brown,30.5,IT,60833.33,2023-03-01
4,5,Charlie Black,32.0,Finance,65000.0,Unknown
5,6,David White,29.0,Marketing,70000.0,2022-09-15
6,7,Emma Green,26.0,HR,50000.0,2024-01-10


In [None]:
# Encode each department label as an integer category code
department_codes = pd.Categorical(data['Department']).codes
data['Department'] = department_codes

# Parse the join dates; any value that is not a valid date (such as the
# 'Unknown' placeholder) is coerced to NaT instead of raising
joining_dates = pd.to_datetime(data['Date of Joining'], errors='coerce')
data['Date of Joining'] = joining_dates

In [None]:
# Display the table after encoding Department and parsing the join dates
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,1,50000.0,2022-01-15
1,2,Jane Smith,35.0,2,70000.0,2021-06-20
2,3,Bob Johnson,45.0,1,60000.0,2020-12-10
3,4,Alice Brown,30.5,2,60833.33,2023-03-01
4,5,Charlie Black,32.0,0,65000.0,NaT
5,6,David White,29.0,3,70000.0,2022-09-15
6,7,Emma Green,26.0,1,50000.0,2024-01-10


In [None]:
# 'Name' carries no signal for model training, so leave it out of the
# reduced table; `data` itself is not modified
data_reduced = data.drop('Name', axis='columns')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale into the [0, 1] range
numeric_cols = ['Age', 'Salary']

# Fit the min-max scaler on the numeric columns of the reduced table
scaler = MinMaxScaler().fit(data_reduced[numeric_cols])

# Work on a copy so data_reduced keeps its original, unscaled values
data_normalized = data_reduced.copy()
data_normalized[numeric_cols] = scaler.transform(data_reduced[numeric_cols])


In [None]:
import zlib

# Serialize the reduced table to CSV text (row index omitted)
csv_data = data_reduced.to_csv(index=False)

# zlib operates on bytes, so encode the text first and then compress it
csv_bytes = csv_data.encode()
compressed_data = zlib.compress(csv_bytes)


In [None]:
# Summarize each stage of the pipeline. `data` was updated in place by the
# cleaning and transformation cells, so it already holds the encoded
# departments and parsed dates at this point.
print("Cleaned Data:")
print(data)

print("\nTransformed Data:")
print(data_reduced)

# Uncompressed size: the byte length of the CSV payload that was fed to zlib.
# (Previously this printed len(compressed_data), i.e. the compressed size twice.)
print("\nBefore Compression Data Size (bytes):")
print(len(csv_data.encode()))

print("\nCompressed Data Size (bytes):")
print(len(compressed_data))

print("\nNormalized Data:")
print(data_normalized)


Cleaned Data:
   ID           Name   Age  Department    Salary Date of Joining
0   1       John Doe  28.0           1  50000.00      2022-01-15
1   2     Jane Smith  35.0           2  70000.00      2021-06-20
2   3    Bob Johnson  45.0           1  60000.00      2020-12-10
3   4    Alice Brown  30.5           2  60833.33      2023-03-01
4   5  Charlie Black  32.0           0  65000.00             NaT
5   6    David White  29.0           3  70000.00      2022-09-15
6   7     Emma Green  26.0           1  50000.00      2024-01-10

Transformed Data:
   ID   Age  Department    Salary Date of Joining
0   1  28.0           1  50000.00      2022-01-15
1   2  35.0           2  70000.00      2021-06-20
2   3  45.0           1  60000.00      2020-12-10
3   4  30.5           2  60833.33      2023-03-01
4   5  32.0           0  65000.00             NaT
5   6  29.0           3  70000.00      2022-09-15
6   7  26.0           1  50000.00      2024-01-10

Before Compression Data Size (bytes):
154

Com

Applying the same preprocessing steps to the Iris dataset

In [None]:
import pandas as pd

# Load the Iris dataset from the UCI repository. The file is read as a
# header-less CSV, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, header=None, names=columns)

# Initial data overview
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


Data Cleaning

In [None]:
# Report missing values per column and the number of fully duplicated rows
print("Missing values:\n", df.isnull().sum())
print("Duplicate rows:", df.duplicated().sum())

# Drop the duplicates and rebuild a contiguous 0..n-1 index. Without the reset,
# the index keeps gaps where rows were removed, and any later assignment of a
# df column into a freshly built (0..n-1 indexed) frame aligns on labels and
# silently introduces NaN.
df = df.drop_duplicates().reset_index(drop=True)


Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


Data Transformation

In [None]:
# Replace the species labels with integer category codes (one code per distinct label)
df['species'] = df['species'].astype('category').cat.codes

Dimensionality Reduction (PCA)

In [None]:
from sklearn.decomposition import PCA

# Project the four measurement columns (every column except the trailing
# 'species' column) onto the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df.iloc[:, :-1])

# Create DataFrame for reduced features
reduced_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Attach the labels by position, not by index label. df's index can contain
# gaps after drop_duplicates, while reduced_df is indexed 0..n-1; assigning the
# Series directly would align on labels, inject NaN for the missing labels
# (turning the integer codes into floats) and drop the trailing rows' labels.
reduced_df['species'] = df['species'].to_numpy()

print("Reduced Data:\n", reduced_df.head())


Reduced Data:
         PC1       PC2  species
0 -2.710782  0.322125      0.0
1 -2.741763 -0.175061      0.0
2 -2.916691 -0.141509      0.0
3 -2.773363 -0.315205      0.0
4 -2.755418  0.330133      0.0


Data Compression

In [None]:
import zlib
import pickle

# Round trip: pickle the DataFrame to bytes, compress them, then reverse both steps
pickled_df = pickle.dumps(df)
compressed_data = zlib.compress(pickled_df)

# pickle.loads is only applied to bytes produced in this cell; never unpickle
# data from an untrusted source
restored_bytes = zlib.decompress(compressed_data)
decompressed_data = pickle.loads(restored_bytes)

# Report the compressed size and confirm the restored frame equals the original
print("Compressed Data Size (bytes):", len(compressed_data))
print("Data Integrity Check:", df.equals(decompressed_data))


Compressed Data Size (bytes): 1710
Data Integrity Check: True


Data Normalization

In [None]:
# Rescale every feature column (all columns except the trailing 'species'
# column) into the [0, 1] range. Despite its name, scaled_df is the NumPy array
# returned by fit_transform, not a DataFrame.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so test-set min/max values influence the scaling - consider
# fitting on the training rows only.
iris_features = df.iloc[:, :-1]
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(iris_features)


In [None]:
# Preview only the first rows; echoing the whole array floods the notebook output
scaled_df[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features are the scaled measurements; the target is the last column of df
# (the encoded species)
features = scaled_df
target = df.iloc[:, -1]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=2025,
)

# Fit a logistic regression classifier with default settings and score it on
# the held-out rows
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.89
