 Build a system that takes raw data and applies a series of transformations (normalization, handling missing values, encoding categorical variables, scaling). You should be able to chain these transformations together and apply the same pipeline to both training and test data. The pipeline should track what transformations were applied during training so it can apply identical transformations to new data.

In [1]:
import pandas as pd
import numpy as np
from abc import ABC,abstractmethod

In [2]:
class BaseTransformer(ABC):
    """Abstract interface for fit/transform style preprocessing steps.

    A concrete transformer learns whatever state it needs from a training
    frame in ``fit`` and applies that stored state to any frame in
    ``transform``. ``fit_transform`` runs both on the same frame.

    Attributes:
        is_fitted: ``False`` on construction; concrete ``fit``
            implementations are expected to set it to ``True``.
    """

    def __init__(self):
        # Must live on the instance: a bare ``is_fitted = False`` would only
        # create a throwaway local and leave the attribute undefined.
        self.is_fitted = False

    @abstractmethod
    def fit(self, X: pd.DataFrame, features: list):
        """Learn transformation state for ``features`` from ``X``."""

    @abstractmethod
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the learned transformation to ``X`` and return the result."""

    def fit_transform(self, X: pd.DataFrame, features: list) -> pd.DataFrame:
        """Fit on ``X`` for ``features``, then return ``transform(X)``."""
        self.fit(X, features)
        return self.transform(X)




In [3]:
class ImputeMissingValues(BaseTransformer):
    """Fill missing values with statistics learned from a training frame.

    ``fit`` picks one strategy per requested column and stores the resulting
    fill value so that ``transform`` can apply the identical value to any
    frame:

    * numeric column with skewness in ``[-0.5, 0.5]``: mean
    * any other numeric column (including undefined skewness): median
    * every other column: most frequent value (mode)

    Attributes:
        skew: column name -> skewness measured during ``fit`` (numeric
            columns only).
        impute_method: column name -> ``"mean"``, ``"median"`` or ``"mode"``.
        impute_values: column name -> value used to fill missing entries.
        is_fitted: whether ``fit`` has completed successfully.
    """

    def __init__(self):
        super().__init__()
        # Set here as well so the fitted check in ``transform`` never depends
        # on what the parent initialiser happens to define.
        self.is_fitted = False
        self.skew = {}
        self.impute_method = {}
        self.impute_values = {}

    def fit(self, X: pd.DataFrame, features: list):
        """Learn one fill value per column in ``features`` from ``X``.

        Args:
            X: training frame.
            features: names of the columns to impute.

        Returns:
            The fitted transformer (``self``), so calls can be chained.

        Raises:
            ValueError: if a requested column is absent from ``X`` or holds
                no non-missing values to learn from.
        """
        for feature in features:
            if feature not in X.columns:
                raise ValueError(f"The {feature} is not in the dataframe")

        # Collect into locals and publish only at the end: a failed or repeated
        # fit must not leave stale or half-updated state behind.
        skew = {}
        methods = {}
        values = {}

        for feature in features:
            column = X[feature]

            if not column.notna().any():
                raise ValueError(
                    f"Cannot learn an imputation value for {feature}: "
                    "the column has no non-missing values"
                )

            # Recognise every numeric dtype (small ints, unsigned, nullable,
            # ...) rather than a fixed list of names. Booleans are excluded so
            # they keep being imputed by mode.
            is_numeric = pd.api.types.is_numeric_dtype(
                column
            ) and not pd.api.types.is_bool_dtype(column)

            if is_numeric:
                column_skew = column.skew()
                skew[feature] = column_skew
                # A NaN skew (too few observations) fails this comparison and
                # therefore falls through to the median.
                if -0.5 <= column_skew <= 0.5:
                    methods[feature] = "mean"
                    values[feature] = column.mean()
                else:
                    methods[feature] = "median"
                    values[feature] = column.median()
            else:
                methods[feature] = "mode"
                # mode() ignores missing values and is sorted, so ties resolve
                # to the smallest value.
                values[feature] = column.mode()[0]

        self.skew = skew
        self.impute_method = methods
        self.impute_values = values
        self.is_fitted = True
        print("The data is fitted")

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with missing values filled from fitted state.

        Only the columns seen during ``fit`` are touched; ``X`` itself is not
        modified.

        Raises:
            RuntimeError: if called before ``fit``.
            KeyError: if a fitted column is absent from ``X``.
        """
        if not self.is_fitted:
            raise RuntimeError(
                "ImputeMissingValues must be fitted before calling transform"
            )

        X_transformed = X.copy()

        for feature, value in self.impute_values.items():
            X_transformed[feature] = X_transformed[feature].fillna(value)

        return X_transformed




In [4]:
# Small hand-made frames with gaps in 'age', 'salary' and 'city' so the
# imputer has something to fill; 'experience' is complete in both.
X_train = pd.DataFrame(
    dict(
        age=[25, 30, 35, np.nan, 45, 50, np.nan, 28],
        salary=[50000, 60000, np.nan, 75000, 85000, np.nan, 70000, 55000],
        experience=[2, 5, 7, 10, 12, 15, 8, 3],
        city=['NYC', 'LA', 'NYC', np.nan, 'Chicago', 'LA', 'NYC', 'Chicago'],
    )
)

X_test = pd.DataFrame(
    dict(
        age=[26, np.nan, 48],
        salary=[52000, 65000, np.nan],
        experience=[3, 6, 13],
        city=[np.nan, 'LA', 'NYC'],
    )
)

# Show both frames, separated by a blank-padded 80-character rule.
print("TRAINING DATA:", X_train, sep="\n")
print(f"\n{'=' * 80}\n")
print("TEST DATA:", X_test, sep="\n")

TRAINING DATA:
    age   salary  experience     city
0  25.0  50000.0           2      NYC
1  30.0  60000.0           5       LA
2  35.0      NaN           7      NYC
3   NaN  75000.0          10      NaN
4  45.0  85000.0          12  Chicago
5  50.0      NaN          15       LA
6   NaN  70000.0           8      NYC
7  28.0  55000.0           3  Chicago


TEST DATA:
    age   salary  experience city
0  26.0  52000.0           3  NaN
1   NaN  65000.0           6   LA
2  48.0      NaN          13  NYC


In [5]:
# Learn the fill values from the training frame only, then apply those same
# values to both frames.
imputer = ImputeMissingValues().fit(X_train, ['age', 'salary', 'city'])
X_train_transformed, X_test_transformed = (
    imputer.transform(frame) for frame in (X_train, X_test)
)

# Each entry is (heading, object to print beneath it), in display order.
report = (
    ("\nTRAINED IMPUTER VALUES:", imputer.impute_values),
    ("\nTRAINED IMPUTER METHODS:", imputer.impute_method),
    ("\nTRANSFORMED TRAINING DATA:", X_train_transformed),
    ("\nTRANSFORMED TEST DATA:", X_test_transformed),
)
for heading, payload in report:
    print(heading)
    print(payload)

The data is fitted

TRAINED IMPUTER VALUES:
{'age': 32.5, 'salary': np.float64(65833.33333333333), 'city': 'NYC'}

TRAINED IMPUTER METHODS:
{'age': 'median', 'salary': 'mean', 'city': 'mode'}

TRANSFORMED TRAINING DATA:
    age        salary  experience     city
0  25.0  50000.000000           2      NYC
1  30.0  60000.000000           5       LA
2  35.0  65833.333333           7      NYC
3  32.5  75000.000000          10      NYC
4  45.0  85000.000000          12  Chicago
5  50.0  65833.333333          15       LA
6  32.5  70000.000000           8      NYC
7  28.0  55000.000000           3  Chicago

TRANSFORMED TEST DATA:
    age        salary  experience city
0  26.0  52000.000000           3  NYC
1  32.5  65000.000000           6   LA
2  48.0  65833.333333          13  NYC


In [18]:
class OrdinalEncoder(BaseTransformer):
    """Replace category values with integer codes learned from training data.

    For each requested column, ``fit`` sorts the distinct non-missing values
    and numbers them from 0. ``transform`` applies that stored mapping to any
    frame.

    Attributes:
        encodings: column name -> ``{category: integer code}``.
        is_fitted: whether ``fit`` has completed successfully.
    """

    def __init__(self):
        super().__init__()
        # Set here as well so the fitted check in ``transform`` never depends
        # on what the parent initialiser happens to define.
        self.is_fitted = False
        self.encodings = {}

    def fit(self, X: pd.DataFrame, feature: list):
        """Learn an integer code for every category of each listed column.

        Args:
            X: training frame.
            feature: names of the columns to encode.

        Returns:
            The fitted transformer (``self``), so calls can be chained.

        Raises:
            ValueError: if a requested column is absent from ``X``.
        """
        for column in feature:
            if column not in X.columns:
                raise ValueError("The column does not exist")

        # Built locally and assigned once so a repeated fit replaces, rather
        # than extends, the mappings from an earlier fit.
        encodings = {}
        for column in feature:
            # Missing values are dropped first: NaN is a float and cannot be
            # ordered against strings, and it is not a category to encode.
            categories = sorted(X[column].dropna().unique().tolist())
            encodings[column] = {
                value: index for index, value in enumerate(categories)
            }

        self.encodings = encodings
        self.is_fitted = True

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the fitted columns integer-encoded.

        Values that have no learned code (categories unseen during ``fit``
        and missing values) come out as NaN. ``X`` itself is not modified.

        Raises:
            RuntimeError: if called before ``fit``.
            KeyError: if a fitted column is absent from ``X``.
        """
        if not self.is_fitted:
            raise RuntimeError(
                "OrdinalEncoder must be fitted before calling transform"
            )

        X_transformed = X.copy()

        for column, mapping in self.encodings.items():
            X_transformed[column] = X_transformed[column].map(mapping)

        return X_transformed
                

In [19]:
if __name__ == "__main__":

    def _print_banner(title, blank_line_before=True):
        """Print ``title`` framed above and below by a 70-character rule."""
        rule = "=" * 70
        print("\n" + rule if blank_line_before else rule)
        print(title)
        print(rule)

    # Demo frames: two categorical columns to encode plus a numeric column
    # that is not passed to the encoder.
    X_train = pd.DataFrame(
        dict(
            city=['NYC', 'LA', 'NYC', 'Chicago', 'LA', 'NYC'],
            state=['NY', 'CA', 'NY', 'IL', 'CA', 'NY'],
            age=[25, 30, 35, 40, 45, 50],
        )
    )
    X_test = pd.DataFrame(
        dict(
            city=['LA', 'Chicago', 'NYC'],
            state=['CA', 'IL', 'NY'],
            age=[28, 38, 48],
        )
    )

    _print_banner("TRAINING DATA:", blank_line_before=False)
    print(X_train)

    _print_banner("TEST DATA:")
    print(X_test)

    _print_banner("FITTING ENCODER ON TRAINING DATA")

    # The mapping is learned from the training frame only.
    encoder = OrdinalEncoder().fit(X_train, ['city', 'state'])

    print("\nLearned Encodings:")
    for column, codes in encoder.encodings.items():
        print(f"   {column}: {codes}")

    _print_banner("TRANSFORMED TRAINING DATA:")
    X_train_transformed = encoder.transform(X_train)
    print(X_train_transformed)

    _print_banner("TRANSFORMED TEST DATA:")
    X_test_transformed = encoder.transform(X_test)
    print(X_test_transformed)



TRAINING DATA:
      city state  age
0      NYC    NY   25
1       LA    CA   30
2      NYC    NY   35
3  Chicago    IL   40
4       LA    CA   45
5      NYC    NY   50

TEST DATA:
      city state  age
0       LA    CA   28
1  Chicago    IL   38
2      NYC    NY   48

FITTING ENCODER ON TRAINING DATA

Learned Encodings:
   city: {'Chicago': 0, 'LA': 1, 'NYC': 2}
   state: {'CA': 0, 'IL': 1, 'NY': 2}

TRANSFORMED TRAINING DATA:
   city  state  age
0     2      2   25
1     1      0   30
2     2      2   35
3     0      1   40
4     1      0   45
5     2      2   50

TRANSFORMED TEST DATA:
   city  state  age
0     1      0   28
1     0      1   38
2     2      2   48
