In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
class LabelEncoder:
    """Encode target labels as integer codes ``0 .. n_classes - 1``.

    Codes follow the sorted order of the distinct labels seen by ``fit``
    (the order returned by ``np.unique``).

    Attributes
    ----------
    classes_ : ndarray or None
        Sorted distinct labels learned by ``fit``; ``None`` until fitted.
    """

    def __init__(self):
        self.classes_ = None

    def _check_is_fitted(self):
        """Raise ``ValueError`` when ``fit`` has not been called yet."""
        if self.classes_ is None:
            raise ValueError("This LabelEncoder instance is not fitted yet.")

    def fit(self, y):
        """Learn the distinct labels in ``y``.

        Parameters
        ----------
        y : array-like
            One-dimensional collection of labels.

        Returns
        -------
        LabelEncoder
            ``self``, so calls can be chained.
        """
        y = pd.Series(y)
        self.classes_ = np.unique(y)
        return self

    def transform(self, y):
        """Map every label in ``y`` to its integer code.

        Parameters
        ----------
        y : array-like
            Labels to encode; each one must have been seen by ``fit``.

        Returns
        -------
        ndarray
            Integer codes, one per element of ``y``.

        Raises
        ------
        ValueError
            If the encoder is not fitted, or ``y`` contains a label that was
            not present when ``fit`` was called.
        """
        self._check_is_fitted()
        y = pd.Series(y)
        class_to_index = {cls: idx for idx, cls in enumerate(self.classes_)}
        codes = y.map(class_to_index)

        # Series.map yields NaN for keys missing from the dict; surface that
        # as an error instead of returning a float array with NaN "codes".
        unseen = codes.isna()
        if unseen.any():
            unseen_labels = y[unseen].unique().tolist()
            raise ValueError(f"y contains previously unseen labels: {unseen_labels}")

        return codes.to_numpy()

    def fit_transform(self, y):
        """Fit on ``y`` and return its integer codes in one step."""
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : iterable of int
            Codes previously produced by ``transform``.

        Returns
        -------
        ndarray
            The label stored at each code's position in ``classes_``.

        Raises
        ------
        ValueError
            If the encoder is not fitted.
        """
        self._check_is_fitted()
        return np.array([self.classes_[idx] for idx in y])


In [4]:
labels = ["cat", "dog", "cat", "mouse", "dog"]
encoder = LabelEncoder()

# Learn the label vocabulary first, then show the integer codes for the
# same labels as the cell's output.
encoder.fit(labels)
encoder.transform(labels)

array([0, 1, 0, 2, 1])

In [5]:
class OrdinalEncoder:
    """Encode each column of categorical features as ordinal float codes.

    Parameters
    ----------
    categories : 'auto' or sequence of sequences, default='auto'
        With ``'auto'`` the categories of each column are its distinct
        non-missing values in order of first appearance. Otherwise one
        sequence of categories per column, whose positions become the codes.
    handle_unknown : {'error', 'use_encoded_value'}, default='error'
        What ``transform`` does with a value that is not a known category.
    unknown_value : int, float or None, default=None
        Code emitted for unknown categories when
        ``handle_unknown='use_encoded_value'``.
    encoded_missing_value : scalar, default=np.nan
        Code emitted for missing (NaN/None) inputs.

    Attributes
    ----------
    categories_ : list of list, or None
        Categories per column learned by ``fit``; ``None`` until fitted.
    """

    def __init__(self, categories='auto', handle_unknown='error', unknown_value=None, encoded_missing_value=np.nan):
        self.categories = categories
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value

        self.categories_ = None

    def _check_unknown_params(self):
        """Validate ``handle_unknown`` / ``unknown_value`` before fitting."""
        if self.handle_unknown not in ('error', 'use_encoded_value'):
            raise ValueError(f"Invalid handle_unknown={self.handle_unknown}")
        if self.handle_unknown == 'use_encoded_value' and not isinstance(
            self.unknown_value, (int, float, np.integer, np.floating)
        ):
            # A non-numeric placeholder cannot be stored in the float output
            # and would be indistinguishable from a missing value.
            raise TypeError(
                "unknown_value should be a number or np.nan when "
                f"handle_unknown is 'use_encoded_value', got {self.unknown_value}."
            )

    def _encode_value(self, val, mapping, col_idx):
        """Return the code for one cell of column ``col_idx``."""
        if pd.isna(val):
            return self.encoded_missing_value
        if val in mapping:
            return mapping[val]
        if self.handle_unknown == 'error':
            raise ValueError(f"Unknown category {val} in column {col_idx}")
        if self.handle_unknown == 'use_encoded_value':
            return self.unknown_value
        # Only reachable when handle_unknown was changed after fitting.
        raise ValueError(f"Invalid handle_unknown={self.handle_unknown}")

    def _decode_value(self, val, cats):
        """Return the category for one code, NaN for missing, None otherwise."""
        if pd.isna(val) or val == self.encoded_missing_value:
            return np.nan
        if self.unknown_value is not None and val == self.unknown_value:
            return None  # preserve "unknown"
        code = float(val)
        # Only exact integer codes inside the category range are valid;
        # fractional or infinite codes must not be truncated to a real index.
        if code.is_integer() and 0 <= code < len(cats):
            return cats[int(code)]
        return None  # safeguard against invalid indices

    def fit(self, X):
        """Learn (or adopt) the categories of every column of ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Training data; converted with ``pd.DataFrame(X)``.

        Returns
        -------
        OrdinalEncoder
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``handle_unknown`` is invalid, if explicit ``categories`` do not
            match the number of columns, or if ``unknown_value`` coincides
            with a code already used for a known category.
        TypeError
            If ``handle_unknown='use_encoded_value'`` and ``unknown_value`` is
            not a number.
        """
        self._check_unknown_params()
        X = pd.DataFrame(X)

        # The isinstance guard keeps array-valued `categories` from being
        # compared elementwise against the string 'auto'.
        if isinstance(self.categories, str) and self.categories == 'auto':
            # Positional access keeps this working with duplicate column names.
            categories = [
                pd.Series(X.iloc[:, i].dropna().unique()).tolist()
                for i in range(X.shape[1])
            ]
        else:
            if len(self.categories) != X.shape[1]:
                raise ValueError("Length of categories must match number of features")
            categories = [list(cats) for cats in self.categories]

        if self.handle_unknown == 'use_encoded_value':
            for cats in categories:
                if 0 <= self.unknown_value < len(cats):
                    raise ValueError(
                        f"The used value for unknown_value {self.unknown_value} is one of "
                        "the values already used for encoding the seen categories."
                    )

        # Assign only after every check passed so a failed fit leaves no
        # half-populated state behind.
        self.categories_ = categories
        return self

    def transform(self, X):
        """Encode ``X`` column by column into a float array of codes.

        Parameters
        ----------
        X : array-like or DataFrame
            Data with the same number of columns as seen during ``fit``.

        Returns
        -------
        ndarray of float, shape ``pd.DataFrame(X).shape``
            Category positions, ``encoded_missing_value`` for missing cells
            and ``unknown_value`` for unknown categories (when enabled).

        Raises
        ------
        ValueError
            If the encoder is not fitted, the column count differs from the
            fitted one, or an unknown category is met with
            ``handle_unknown='error'``.
        """
        if self.categories_ is None:
            raise ValueError("This OrdinalEncoder instance is not fitted yet.")

        X = pd.DataFrame(X)
        if X.shape[1] != len(self.categories_):
            raise ValueError(
                f"X has {X.shape[1]} features, but OrdinalEncoder was fitted "
                f"with {len(self.categories_)} features"
            )
        X_out = np.empty(X.shape, dtype=float)

        for i in range(X.shape[1]):
            mapping = {cat: idx for idx, cat in enumerate(self.categories_[i])}
            encoded = X.iloc[:, i].apply(
                lambda val, m=mapping, c=i: self._encode_value(val, m, c)
            )
            X_out[:, i] = encoded.to_numpy()

        return X_out

    def fit_transform(self, X):
        """Fit on ``X`` and return its encoded form in one step."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        """Decode an array of codes back to categories.

        Parameters
        ----------
        X : array-like, 2D
            Codes as produced by ``transform``.

        Returns
        -------
        ndarray of object, same shape as ``X``
            The category for each valid code, ``np.nan`` for missing codes
            and ``None`` for the unknown placeholder or any invalid code.

        Raises
        ------
        ValueError
            If the encoder is not fitted, ``X`` is not 2D, or its column
            count differs from the fitted one.
        """
        if self.categories_ is None:
            raise ValueError("This OrdinalEncoder instance is not fitted yet.")

        X = np.array(X)
        if X.ndim != 2:
            raise ValueError(f"Expected a 2D array of codes, got {X.ndim}D input")
        if X.shape[1] != len(self.categories_):
            raise ValueError(
                f"X has {X.shape[1]} features, but OrdinalEncoder was fitted "
                f"with {len(self.categories_)} features"
            )

        X_out = np.empty(X.shape, dtype=object)
        for i, cats in enumerate(self.categories_):
            # Element-wise assignment stores each decoded object untouched.
            for row, val in enumerate(X[:, i]):
                X_out[row, i] = self._decode_value(val, cats)

        return X_out


In [6]:
# Toy frame: each column contains exactly one missing entry.
X = pd.DataFrame({
    "color": ["red", "blue", "green", np.nan, "yellow", "blue"],
    "shape": ["circle", "square", "triangle", "circle", np.nan, "hexagon"]
})

# Categories not seen during fit are encoded as -1 instead of raising.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Learn the per-column categories and list them.
enc.fit(X)
print("Categories learned:")
for i, cats in enumerate(enc.categories_):
    print(f"  Column {i}: {cats}")

# Encode the training frame itself.
Xt = enc.transform(X)
print("\nTransformed values:", Xt, sep="\n")

# A second frame holding categories the encoder has never seen.
X_new = pd.DataFrame({
    "color": ["red", "purple", np.nan],
    "shape": ["circle", "pentagon", "triangle"]
})
Xt_new = enc.transform(X_new)
print("\nTransformed new values (with unknowns):", Xt_new, sep="\n")

# Decode the codes of the second frame back to categories.
X_inv = enc.inverse_transform(Xt_new)
print("\nInverse transformed back:", X_inv, sep="\n")


Categories learned:
  Column 0: ['red', 'blue', 'green', 'yellow']
  Column 1: ['circle', 'square', 'triangle', 'hexagon']

Transformed values:
[[ 0.  0.]
 [ 1.  1.]
 [ 2.  2.]
 [nan  0.]
 [ 3. nan]
 [ 1.  3.]]

Transformed new values (with unknowns):
[[ 0.  0.]
 [-1. -1.]
 [nan  2.]]

Inverse transformed back:
[['red' 'circle']
 [None None]
 [nan 'triangle']]
