In [None]:
# Machine Learning Data Preprocessing Lab - Week 3
# Topics: Missing Data Handling, Feature Scaling, Encoding, Binning, Normalization, Standardization

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris, load_wine
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, normalize
# Install a blanket 'ignore' filter so library warnings do not clutter the lab output.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so every random draw below is repeatable
np.random.seed(42)

In [None]:
# NOTE(review): no cell above this one assigns `df`, so on a fresh kernel this
# line raises NameError — confirm which frame was meant, or remove the cell.
df

In [None]:
# NOTE(review): `df_missing` is first assigned in the next cell ("Create dataset"),
# so this display only works after that cell has run; under Restart & Run All it
# raises NameError. Consider moving this cell below the dataset-creation cell.
df_missing

In [None]:
# Create dataset
def create_sample_data_with_missing() -> pd.DataFrame:
    """Return a small employee table with deliberately missing entries.

    Stand-in definition: the helper was called here without being defined or
    imported anywhere above, so the notebook failed on a fresh kernel. The
    values are illustrative; the column names are the ones the later
    missing-data cells operate on.

    Layout choices that keep every later method meaningful:
    - 'experience' has no gaps, so dropping columns with NaN leaves a column.
    - The first and last rows are complete, so forward/backward fill can
      resolve every gap.
    - Three rows are fully populated, so dropping rows with NaN leaves data.
    """
    return pd.DataFrame({
        'age': [25, 30, np.nan, 35, 28, np.nan, 45, 50, 23, 41],
        'salary': [50000, np.nan, 62000, 75000, np.nan, 58000, 90000, 98000, 42000, 81000],
        'experience': [2, 5, 7, 10, 3, 4, 20, 25, 1, 15],
        'performance_score': [8.5, 7.2, np.nan, 9.1, 6.8, 7.9, np.nan, 8.8, 7.0, 8.2],
        'department': ['IT', 'HR', 'Finance', np.nan, 'IT', 'Finance', 'HR', np.nan, 'IT', 'Finance'],
    })


df_missing = create_sample_data_with_missing()
print("Original dataset with missing values:")
print(df_missing)
print(f"\nMissing values count:\n{df_missing.isnull().sum()}")

In [None]:
# Method 1: Drop rows with missing values
# Keep only the rows in which every column is populated
df_drop_rows = df_missing.dropna(axis='index', how='any')
remaining_shape = df_drop_rows.shape
print(f"\nAfter dropping rows with missing values: {remaining_shape}")

In [None]:
# Show the rows that survived the row-wise dropna in the previous cell
df_drop_rows

In [None]:
# Method 2: Drop columns with missing values
# Any column containing at least one NaN is removed; the rows are untouched
df_drop_cols = df_missing.dropna(axis='columns')
df_drop_cols

In [None]:
# Method 3: Fill missing values with mean/mode
# Work on a copy so df_missing stays available for the other methods
df_fill_mean = df_missing.copy()
# Fill numerical columns with mean
numerical_cols = ['age', 'salary', 'experience', 'performance_score']
# Assign the filled columns back explicitly. The previous form,
# `df[col].fillna(..., inplace=True)`, acts on a temporary column object:
# it warns on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
# Passing the Series of column means fills each column with its own mean.
column_means = df_fill_mean[numerical_cols].mean()
df_fill_mean[numerical_cols] = df_fill_mean[numerical_cols].fillna(column_means)

In [None]:
# Inspect the frame after the numeric mean fill; 'department' is handled in the next cell
df_fill_mean

In [None]:
# Fill categorical columns with mode
# mode() returns a Series (there can be ties); [0] takes its first entry.
# The result is assigned back instead of using `inplace=True` on the selected
# column, which does not reliably update the parent frame (no effect under
# pandas Copy-on-Write).
department_mode = df_fill_mean['department'].mode()[0]
df_fill_mean['department'] = df_fill_mean['department'].fillna(department_mode)
print("\nAfter filling with mean/mode:")
df_fill_mean

In [None]:
# Method 4: Forward fill and backward fill
# ffill copies the last valid value downward; bfill copies the next valid value upward.
# The dedicated methods replace `fillna(method=...)`, which is deprecated in
# pandas 2.x and removed in pandas 3.
df_ffill = df_missing.ffill()
df_bfill = df_missing.bfill()
df_ffill

In [None]:
# Show the backward-filled frame built in the previous cell
df_bfill

In [None]:
# Method 5: Using SimpleImputer
# One imputer per column type: column mean for numbers, most frequent value for categories
imputer_mean = SimpleImputer(strategy='mean')
imputer_mode = SimpleImputer(strategy='most_frequent')

df_imputed = df_missing.copy()
# Each imputer is fitted on, and applied to, only its own group of columns
for fitted_imputer, target_cols in (
    (imputer_mean, numerical_cols),
    (imputer_mode, ['department']),
):
    df_imputed[target_cols] = fitted_imputer.fit_transform(df_imputed[target_cols])
print("\nAfter SimpleImputer:")
df_imputed

In [None]:
# Method 6: KNN Imputer (for numerical data)
# Only the numeric columns are passed in; 'department' is left as it was in df_missing
df_knn = df_missing.copy()
knn_imputer = KNNImputer(n_neighbors=2)
knn_filled = knn_imputer.fit_transform(df_knn[numerical_cols])
df_knn[numerical_cols] = knn_filled
print("\nAfter KNN Imputer (numerical columns only):")
print(df_knn[numerical_cols])

In [None]:
# Sample dataset for the scaling section: synthetic housing data, used in
# place of the deprecated Boston Housing loader
np.random.seed(42)
n_samples = 100
# (column name, sampler) pairs. The samplers run in list order, so the seeded
# sequence of random draws is rooms -> age -> distance -> tax_rate -> price.
column_samplers = [
    ('rooms', lambda size: np.random.normal(6, 1.5, size)),
    ('age', lambda size: np.random.uniform(1, 100, size)),
    ('distance', lambda size: np.random.exponential(3, size)),
    ('tax_rate', lambda size: np.random.uniform(200, 800, size)),
    ('price', lambda size: np.random.normal(25, 10, size)),
]
housing_data = pd.DataFrame(
    {name: sampler(n_samples) for name, sampler in column_samplers}
)

print("Original housing dataset (first 5 rows):")
print(housing_data.head())
print("\nDataset statistics:")
print(housing_data.describe())

In [None]:
# Spot-check five randomly chosen rows of the unscaled data
housing_data.sample(5)

In [None]:
# Method 1: Min-Max Scaling (Normalization)
# The scaler is fitted on all columns of housing_data, 'price' included
scaler_minmax = MinMaxScaler()
minmax_values = scaler_minmax.fit_transform(housing_data)
# fit_transform returns a bare array, so the column names are re-attached here
housing_minmax = pd.DataFrame(minmax_values, columns=housing_data.columns)
print("\nAfter Min-Max Scaling (0-1 range):")
print(housing_minmax.describe())

In [None]:
# Spot-check five randomly chosen rows of the min-max scaled data
housing_minmax.sample(5)

In [None]:
# Method 2: Standardization (Z-score normalization)
# The scaler is fitted on all columns of housing_data, 'price' included
scaler_standard = StandardScaler()
standard_values = scaler_standard.fit_transform(housing_data)
# fit_transform returns a bare array, so the column names are re-attached here
housing_standard = pd.DataFrame(standard_values, columns=housing_data.columns)
print("\nAfter Standardization (mean=0, std=1):")
print(housing_standard.describe())

In [None]:
# Spot-check five randomly chosen rows of the standardized data
housing_standard.sample(5)

In [None]:
# Method 3: Robust Scaling (less sensitive to outliers)
# The scaler is fitted on all columns of housing_data, 'price' included
scaler_robust = RobustScaler()
robust_values = scaler_robust.fit_transform(housing_data)
# fit_transform returns a bare array, so the column names are re-attached here
housing_robust = pd.DataFrame(robust_values, columns=housing_data.columns)
print("\nAfter Robust Scaling:")
print(housing_robust.describe())

In [None]:
# Method 5: Unit Vector Scaling (L2 normalization)
# `normalize` is imported with the other sklearn.preprocessing names in the
# notebook's import cell rather than mid-notebook.
# Unlike the scalers above, normalize works per row: each sample is divided by
# its own L2 norm, so every row (not every column) ends up with length 1.
unit_values = normalize(housing_data, norm='l2')
housing_unit = pd.DataFrame(unit_values, columns=housing_data.columns)
print("\nAfter Unit Vector Scaling:")
print(housing_unit.describe())

In [None]:
# Preview the first five L2-normalized rows
housing_unit.head(5)

In [None]:
# Sample dataset with categorical variables, written one employee per row
categorical_columns = ['city', 'education', 'experience_level', 'salary_range', 'performance']
employee_rows = [
    ('Mumbai', 'Graduate', 'Junior', 'Low', 85),
    ('Delhi', 'Post-Graduate', 'Senior', 'High', 92),
    ('Bangalore', 'Graduate', 'Mid', 'Medium', 78),
    ('Chennai', 'High School', 'Junior', 'Low', 65),
    ('Mumbai', 'Post-Graduate', 'Senior', 'High', 95),
    ('Delhi', 'Graduate', 'Mid', 'Medium', 82),
    ('Pune', 'High School', 'Junior', 'Low', 70),
    ('Bangalore', 'Graduate', 'Senior', 'High', 88),
]
# Transpose the rows back into one list per column before building the frame
categorical_data = pd.DataFrame(
    {name: list(values) for name, values in zip(categorical_columns, zip(*employee_rows))}
)

print("Original categorical dataset:")
categorical_data.head(5)

In [None]:
# Method 1: Label Encoding
# Caution: LabelEncoder numbers the classes in sorted (alphabetical) order, so
# the codes do NOT follow the real-world ranking. Here 'High' -> 0, 'Low' -> 1,
# 'Medium' -> 2, and 'High School' (1) lands above 'Graduate' (0). When the
# order matters, use OrdinalEncoder with an explicit category list instead.
le_education = LabelEncoder()
le_experience = LabelEncoder()
le_salary = LabelEncoder()

# Work on a copy so categorical_data keeps its original columns
categorical_label = categorical_data.copy()
# (encoder, source column, new encoded column); one encoder per column so each
# keeps its own fitted classes
encoding_plan = [
    (le_education, 'education', 'education_encoded'),
    (le_experience, 'experience_level', 'experience_encoded'),
    (le_salary, 'salary_range', 'salary_encoded'),
]
for encoder, source_col, encoded_col in encoding_plan:
    categorical_label[encoded_col] = encoder.fit_transform(categorical_label[source_col])

print("\nAfter Label Encoding:")
print(categorical_label[['education', 'education_encoded', 'experience_level', 'experience_encoded']])

In [None]:
# Full view of the label-encoded frame, including 'salary_encoded' (not in the printout above)
categorical_label

In [None]:
# Method 2: One-Hot Encoding (for nominal data)
# Map each column to expand onto the prefix used for its indicator columns
onehot_prefixes = {'city': 'city', 'education': 'edu'}
categorical_onehot = pd.get_dummies(
    categorical_data,
    columns=list(onehot_prefixes),
    prefix=onehot_prefixes,
)
print("\nAfter One-Hot Encoding:")
print(categorical_onehot.head())

In [None]:
# Rich (table) rendering of the same first rows that were printed as text above
categorical_onehot.head()

In [None]:
# Method 5: Ordinal Encoding (when order matters)
# `OrdinalEncoder` is imported with the other sklearn.preprocessing names in
# the notebook's import cell rather than mid-notebook.
# The explicit category list fixes the codes: 'Low' -> 0, 'Medium' -> 1, 'High' -> 2.
salary_order = ['Low', 'Medium', 'High']
ordinal_encoder = OrdinalEncoder(categories=[salary_order])
# The encoder expects 2-D input, hence the double brackets (a one-column frame)
salary_ordinal = ordinal_encoder.fit_transform(categorical_data[['salary_range']])
print("\nOrdinal Encoding for 'salary_range':")
print(f"Original: {categorical_data['salary_range'].tolist()}")
print(f"Encoded: {salary_ordinal.flatten().tolist()}")

In [None]:
# Replace the text labels in 'salary_range' with their ordinal codes.
# Guarded so the cell can be re-run: once the column is numeric, encoding it
# again would raise, because the numbers are not among the encoder's
# categories ('Low', 'Medium', 'High').
# NOTE(review): this still overwrites the original labels, so earlier cells
# that read the text values cannot be re-run after this one.
if not pd.api.types.is_numeric_dtype(categorical_data['salary_range']):
    # ravel() flattens the (n, 1) encoder output into a plain 1-D column
    categorical_data['salary_range'] = ordinal_encoder.fit_transform(
        categorical_data[['salary_range']]
    ).ravel()

In [None]:
# Display the frame after 'salary_range' was overwritten with ordinal codes in the previous cell
categorical_data

In [None]:
# Sample dataset for binning, written as (age, income) pairs
age_income_pairs = [
    (22, 30000), (25, 45000), (30, 60000), (35, 75000), (40, 80000),
    (45, 90000), (50, 95000), (55, 100000), (60, 85000), (65, 70000),
    (70, 60000), (18, 25000), (28, 50000), (33, 70000), (38, 78000),
    (42, 85000), (48, 92000), (52, 98000), (58, 88000), (63, 75000),
]
# Split the pairs back into one list per column before building the frame
ages, incomes = (list(column) for column in zip(*age_income_pairs))
age_data = pd.DataFrame({'age': ages, 'income': incomes})

In [None]:
# Show all 20 rows of the binning dataset
age_data

In [None]:
# Method 1: Equal-width binning
# bins=4 splits the observed age range into four intervals of equal width,
# labelled from lowest to highest
equal_width_labels = ['Young', 'Adult', 'Middle-aged', 'Senior']
age_data['age_bins_equal'] = pd.cut(age_data['age'], bins=4, labels=equal_width_labels)
print("\nEqual-width binning for age:")
print(age_data.loc[:, ['age', 'age_bins_equal']].iloc[:10])

In [None]:
# Preview the first five rows, now including the 'age_bins_equal' column
age_data.head(5)

In [None]:
# Method 2: Equal-frequency binning (quantile-based)
# q=4 cuts at the quartiles, so each bin receives roughly the same number of rows
quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
age_data['age_bins_quantile'] = pd.qcut(age_data['age'], q=4, labels=quartile_labels)
print("\nEqual-frequency binning for age:")
print(age_data.loc[:, ['age', 'age_bins_quantile']].iloc[:10])

In [None]:
# Preview the first five rows, now including the 'age_bins_quantile' column
age_data.head(5)

In [None]:
# Method 3: Custom binning with hand-picked edges
# pd.cut's default intervals are closed on the right: (0, 30], (30, 50],
# (50, 70], (70, 100] — so an age of exactly 30 is labelled 'Youth'.
custom_bins = [0, 30, 50, 70, 100]
custom_labels = ['Youth', 'Young Adult', 'Middle Age', 'Senior']
custom_binned = pd.cut(age_data['age'], bins=custom_bins, labels=custom_labels)
age_data['age_bins_custom'] = custom_binned
print("\nCustom binning for age:")
print(age_data.loc[:, ['age', 'age_bins_custom']].iloc[:10])

In [None]:
# Preview the first five rows, now including the 'age_bins_custom' column
age_data.head(5)