# 📊 Binning & Binarization in Machine Learning

This notebook contains Python examples of **Binning**, **Binarization**, and **K-Means Binning** techniques for data preprocessing.


In [None]:
# ----------------------------------------
# 📌 1. BINNING EXAMPLES
# ----------------------------------------
# Demonstrates three ways of turning the numeric `age` column into ordered
# categories: equal-width bins, equal-frequency (quantile) bins, and bins
# with hand-picked edges. Each result is stored as a new column of `data`.

import pandas as pd
import numpy as np

data = pd.DataFrame({"age": [5, 17, 25, 32, 45, 62, 70]})

# Equal-width binning (3 bins): the observed age range is split into three
# intervals of equal length.
data['age_bin_width'] = pd.cut(data['age'], bins=3, labels=["Young", "Middle", "Old"])

# Equal-frequency (quantile) binning (3 bins): edges are placed at the
# tertiles so each bin holds roughly the same number of rows.
data['age_bin_quantile'] = pd.qcut(data['age'], q=3, labels=["Low", "Medium", "High"])

# Custom binning with explicit edges; there is one label per interval
# (len(labels) == len(bins) - 1).
bins = [0, 12, 19, 59, 100]
labels = ["Child", "Teen", "Adult", "Senior"]
# Intervals are right-closed by default, so without include_lowest=True the
# first edge is exclusive and an age of exactly 0 would silently become NaN.
data['age_bin_custom'] = pd.cut(data['age'], bins=bins, labels=labels, include_lowest=True)

print("Binning Results:")
print(data)


In [None]:
# ----------------------------------------
# 📌 2. BINARIZATION EXAMPLES
# ----------------------------------------
# Demonstrates three ways of producing 0/1 features: thresholding a numeric
# column, one-hot encoding a categorical column, and binarizing class labels.

import numpy as np
import pandas as pd  # used below for the one-hot example; imported here so the cell runs on its own
from sklearn.preprocessing import Binarizer, LabelBinarizer

# Threshold Binarization: values strictly greater than the threshold map to 1,
# everything else maps to 0.
data_num = np.array([[1.5], [2.5], [0.5]])
binarizer = Binarizer(threshold=1.0)
# Binarizer is stateless, but scikit-learn validates parameters only in fit(),
# so fit_transform() is preferred over calling transform() on an unfitted estimator.
print("Threshold Binarization:\n", binarizer.fit_transform(data_num))

# One-Hot Encoding: one indicator column per distinct category.
df = pd.DataFrame({"Gender": ["Male", "Female", "Male"]})
# dtype=int yields 0/1 indicators on every pandas version (pandas 2.x would
# otherwise return True/False, older versions uint8).
df_one_hot = pd.get_dummies(df, columns=["Gender"], dtype=int)
print("\nOne-Hot Encoding:\n", df_one_hot)

# Label Binarization (multi-class): each class becomes its own 0/1 column.
# Named `class_labels` so it does not overwrite the `labels` list of age-bin
# names defined in the binning cell.
class_labels = [1, 2, 3, 1]
lb = LabelBinarizer()
print("\nLabel Binarization:\n", lb.fit_transform(class_labels))


In [None]:
# ----------------------------------------
# 📌 3. K-MEANS BINNING EXAMPLE
# ----------------------------------------
# Groups incomes into three bins with 1-D k-means, then renumbers the bins so
# that bin 0 has the lowest centre and bin 2 the highest.
# Relies on `pd` and `np` imported in the earlier cells.

from sklearn.cluster import KMeans

# Toy income values to be binned
data_income = pd.DataFrame(
    {"income": [1500, 1800, 2200, 3500, 4000, 4200, 8000, 12000, 15000]}
)

# KMeans expects a 2-D (n_samples, n_features) array, so turn the single
# column into a column vector.
X = data_income['income'].to_numpy()[:, np.newaxis]

# Fit 3 clusters; random_state pins the initialisation so reruns give the same result
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
raw_labels = kmeans.fit_predict(X)

# Cluster ids come out in arbitrary order: rank the centres and build a
# lookup from original cluster id -> rank of its centre (0 = smallest).
centers = kmeans.cluster_centers_.ravel()
sorted_idx = np.argsort(centers)
mapping = dict(zip(sorted_idx, range(len(sorted_idx))))

# Store the re-ordered bin id for every row
data_income['income_bin_kmeans'] = pd.Series(raw_labels, index=data_income.index).map(mapping)

print("K-Means Cluster Centers (bin representatives):", centers.take(sorted_idx))
print("K-Means Binning Results:\n", data_income)
