# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization and Standardization

In [None]:
# Fetch Iris and wrap its feature matrix in a DataFrame so that columns can
# be addressed by feature name in the cells that follow.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Peek at the raw features; these are the values the scalers will be
# compared against later (original vs. normalized vs. standardized).
print("Original data (first 5 rows):", X.head(), sep="\n")

# Scatter of the two sepal measurements, coloured by `iris.target`.
plt.figure(figsize=(10, 6))
plt.scatter(
    x=X['sepal length (cm)'],
    y=X['sepal width (cm)'],
    s=100,
    c=iris.target,
    cmap='viridis',
    edgecolor='k',
)
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.title('Iris Dataset: Sepal Length vs Sepal Width')
plt.colorbar(label='Species')
plt.grid()
plt.show()


In [None]:
# Normalization: min-max scaling of every feature (target range [0, 1]).
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X)
X_norm = pd.DataFrame(minmax_scaler.transform(X), columns=X.columns)

# Standardization: centre and scale every feature (target mean=0, std=1).
std_scaler = StandardScaler()
std_scaler.fit(X)
X_std = pd.DataFrame(std_scaler.transform(X), columns=X.columns)

# One feature is enough to show how each scaler changes the value range.
feature = 'sepal length (cm)'

plt.figure(figsize=(15, 4))

# (source frame, bar colour, panel title) for each of the three histograms,
# listed left to right.
panels = [
    (X, 'skyblue', 'Original'),
    (X_norm, 'lightgreen', 'Normalized (0-1)'),
    (X_std, 'salmon', 'Standardized (mean=0, std=1)'),
]
for position, (frame, fill, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.hist(frame[feature], bins=20, color=fill, edgecolor='black')
    plt.title(heading)

plt.suptitle(f'Distribution of "{feature}"', fontsize=16)
plt.tight_layout()
plt.show()

## Outliers in Normalization and Standardization

In [None]:
from sklearn.preprocessing import RobustScaler

# Seed NumPy's global RNG so the synthetic sample is the same on every run.
np.random.seed(42)

# 1000 draws from a normal distribution (mean=50, std=10), one column wide.
data = np.random.normal(loc=50, scale=10, size=(1000, 1))

# Five extreme values appended below the sample to act as outliers.
outliers = np.array([[300], [310], [320], [330], [340]])
data_with_outliers = np.vstack((data, outliers))

df = pd.DataFrame(data_with_outliers, columns=["value"])

# Fit each scaler on the raw column only; the scaled copies are stored next
# to it so all four versions can be plotted from the same frame.
minmax = MinMaxScaler().fit(df[["value"]])
standard = StandardScaler().fit(df[["value"]])
robust = RobustScaler().fit(df[["value"]])

fitted_scalers = (("minmax", minmax), ("standard", standard), ("robust", robust))
for column, scaler in fitted_scalers:
    df[column] = scaler.transform(df[["value"]])

# 2x2 grid of histograms: raw data first, then one panel per scaler.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (column, bar colour, panel title), in row-major grid order.
panels = (
    ("value", 'gray', "Original Data"),
    ("minmax", 'lightblue', "Min-Max Normalized"),
    ("standard", 'orange', "Standardized"),
    ("robust", 'green', "Robust Scaled"),
)
for ax, (column, fill, heading) in zip(axes.flat, panels):
    ax.hist(df[column], bins=50, color=fill, edgecolor='black')
    ax.set_title(heading)

plt.tight_layout()
plt.show()


## Example of Applying Normalization and Standardization in an ML model

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and pull out features and labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Named-column view of the same feature matrix, for inspection only.
breast_cancer_df = pd.DataFrame(X, columns=data.feature_names)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Last expression of the cell, so the notebook renders the table.
breast_cancer_df

In [None]:

# -----------------------------
# Logistic Regression WITHOUT scaling
# -----------------------------
# Baseline: the classifier is fitted on the features exactly as loaded.
model_no_scaling = LogisticRegression(max_iter=1000, penalty=None)
model_no_scaling.fit(X_train, y_train)
y_pred_ns = model_no_scaling.predict(X_test)
acc_ns = accuracy_score(y_test, y_pred_ns)

# -----------------------------
# Logistic Regression WITH StandardScaler in Pipeline
# -----------------------------
# Putting the scaler inside the Pipeline means it is fitted on X_train only;
# the same fitted transform is then applied to X_test inside predict().
pipeline_std = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, penalty=None)),
])
pipeline_std.fit(X_train, y_train)
y_pred_scaled = pipeline_std.predict(X_test)
acc_scaled = accuracy_score(y_test, y_pred_scaled)

# -----------------------------
# Logistic Regression WITH Normalization in Pipeline
# -----------------------------
# Same setup, swapping in min-max normalization for the scaling step.
pipeline_norm = Pipeline([
    ("scaler", MinMaxScaler()),
    ("logreg", LogisticRegression(max_iter=1000, penalty=None)),
])
pipeline_norm.fit(X_train, y_train)
y_pred_norm = pipeline_norm.predict(X_test)
acc_norm = accuracy_score(y_test, y_pred_norm)

# Each variant now has its own name so the fitted standardized pipeline is
# not thrown away; `pipeline` stays bound to the last variant built, which
# is what it referred to before.
pipeline = pipeline_norm

# -----------------------------
# Compare results
# -----------------------------
# The first row is the no-scaling baseline, so it is labelled as such.
print(f"Accuracy without scaling:         {acc_ns:.4f}")
print(f"Accuracy with standardization:    {acc_scaled:.4f}")
print(f"Accuracy with normalization:      {acc_norm:.4f}")


## Class imbalance

In [None]:
from sklearn.datasets import make_classification

from collections import Counter

# Build a synthetic 5-class problem whose classes are deliberately unequal
# in size (see `weights`), with no label noise and a fixed seed.
X, y = make_classification(
    n_samples=5000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=5,
    n_clusters_per_class=1,
    # Requested share of samples per class, in class order.
    weights=[0.13, 0.5, 0.1, 0.07, 0.2],
    flip_y=0,
    random_state=42,
)

# Tally how many samples landed in each class.
class_counts = Counter(y)
print("Class distribution:", class_counts)

# One bar per class, bar height = number of samples.
plt.bar(list(class_counts), list(class_counts.values()))
plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Number of Samples")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

# Both splits use the same fixed seed (42, as elsewhere in this notebook) so
# the comparison below is reproducible from run to run and the only thing
# that differs between the two splits is the `stratify` argument.

# Split the dataset without stratification
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X, y, test_size=0.2, stratify=None, random_state=42
)

# Split the dataset with stratification (class proportions of `y` are
# preserved in both the train and the test part)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Function to calculate class distribution and percentages
def calculate_distribution(y_data):
    """Summarise how often each label occurs in *y_data*.

    Args:
        y_data: Iterable of hashable class labels.

    Returns:
        A DataFrame with one row per distinct label and the columns
        ``Class``, ``Count`` and ``Percentage`` (share of all labels,
        0-100), sorted by ``Class`` with a fresh 0-based index.
    """
    tally = Counter(y_data)
    n_labels = sum(tally.values())

    # Walk the labels once, in the Counter's own order, so the three
    # columns stay aligned row by row.
    labels = list(tally)
    counts = [tally[label] for label in labels]
    shares = [(count / n_labels) * 100 for count in counts]

    table = pd.DataFrame({
        "Class": labels,
        "Count": counts,
        "Percentage": shares,
    })
    table = table.sort_values(by="Class")
    return table.reset_index(drop=True)

# Per-class counts and percentages for each part of each split
# (NS = non-stratified, S = stratified).
df_train_ns = calculate_distribution(y_train_ns)
df_test_ns = calculate_distribution(y_test_ns)
df_train_s = calculate_distribution(y_train_s)
df_test_s = calculate_distribution(y_test_s)

# Combine the four summaries side by side. Each one is indexed by "Class"
# first, so that:
#   * rows are matched on the class label rather than on row position
#     (a class missing from one split shows up as NaN instead of silently
#     shifting the rows below it), and
#   * the result has a single "Class" index instead of four duplicate
#     "Class" columns.
comparison_df = pd.concat(
    [
        df_train_ns.set_index("Class").rename(
            columns={"Count": "Train Count (NS)", "Percentage": "Train % (NS)"}
        ),
        df_test_ns.set_index("Class").rename(
            columns={"Count": "Test Count (NS)", "Percentage": "Test % (NS)"}
        ),
        df_train_s.set_index("Class").rename(
            columns={"Count": "Train Count (S)", "Percentage": "Train % (S)"}
        ),
        df_test_s.set_index("Class").rename(
            columns={"Count": "Test Count (S)", "Percentage": "Test % (S)"}
        ),
    ],
    axis=1,
).sort_index()

# Display the comparison DataFrame
print(comparison_df)