In [25]:

import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



In [27]:
# Import the dataset.
# The folder holding the survey CSV can be overridden with the
# HAPPY_CUSTOMERS_DATA_DIR environment variable; the original location is
# kept as the fallback so existing setups keep working unchanged.
DATA_DIR = Path(
    os.environ.get("HAPPY_CUSTOMERS_DATA_DIR", r"C:\Users\USER\Documents\Happy customers")
)
file_path = DATA_DIR / "ACME-HappinessSurvey2020.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,0,3,3,3,4,2,4
1,0,3,2,3,5,4,3
2,1,5,3,3,3,3,5
3,0,5,4,3,3,3,5
4,0,5,4,3,3,3,5


In [19]:
# Develop a pipeline to preprocess the dataset.

# Custom transformers 

# 1. Convert all columns to float
class ToFloatTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that casts its input to ``float``."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` cast to ``float`` via ``X.astype``."""
        as_float = X.astype(dtype=float)
        return as_float

# 2. Drop exact duplicate rows
class DropDuplicatesTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that removes rows repeating an earlier row exactly."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` with only the first occurrence of each identical row."""
        # keep="first" is the pandas default, spelled out here for the reader.
        deduplicated = X.drop_duplicates(keep="first")
        return deduplicated

# 3. Winsorization transformer
class Winsorizer(BaseEstimator, TransformerMixin):
    """Winsorize one column using clipping bounds learned during ``fit``.

    ``fit`` records the lower and upper cut-off values of ``column`` and
    ``transform`` clips that column to them, so data seen later (including
    batches too small to estimate tails from) is treated with the same
    thresholds as the data the transformer was fitted on.

    Parameters
    ----------
    column : hashable
        Label of the column to winsorize.
    limits : None, float, or (low, high) tuple of float/None, default (0.05, 0.05)
        Fraction of observations to pull in on each tail. ``None`` or ``0``
        on a side leaves that tail untouched.

    Attributes
    ----------
    lower_, upper_ : scalar or None
        Clipping bounds set by ``fit``; ``None`` means "do not clip this side".
    """

    def __init__(self, column, limits=(0.05, 0.05)):
        self.column = column
        self.limits = limits

    def _learn_bounds(self, X):
        """Return the ``(lower, upper)`` clipping values implied by ``limits`` for ``X``.

        Raises
        ------
        ValueError
            If a limit lies outside ``[0, 1]``.
        """
        if self.limits is None:
            return None, None
        limits = self.limits
        if np.isscalar(limits):
            # A single fraction applies symmetrically to both tails.
            limits = (limits, limits)
        low_frac, up_frac = limits
        for side, frac in (("lower", low_frac), ("upper", up_frac)):
            if frac is not None and not 0.0 <= frac <= 1.0:
                raise ValueError(
                    f"The proportion to cut from the {side} tail must be "
                    f"between 0 and 1, got {frac!r}."
                )

        # Missing values carry no rank information, so they are ignored when
        # locating the cut-offs (and pass through ``clip`` untouched later).
        ordered = np.sort(X[self.column].dropna().to_numpy())
        n = ordered.size
        if n == 0:
            return None, None

        # The int(frac * n) smallest / largest observations are replaced by
        # their nearest retained neighbour; the min/max guards keep the index
        # in range when a fraction is 1.
        lower = ordered[min(int(low_frac * n), n - 1)] if low_frac else None
        upper = ordered[max(n - int(up_frac * n) - 1, 0)] if up_frac else None
        return lower, upper

    def fit(self, X, y=None):
        """Learn the clipping bounds from ``X[column]`` and return ``self``."""
        self.lower_, self.upper_ = self._learn_bounds(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``column`` clipped to the learned bounds.

        An instance that was never fitted (for example one restored from a
        pickle written before bounds were stored) falls back to deriving the
        bounds from ``X`` itself, which is what this transformer used to do.
        """
        if hasattr(self, "lower_"):
            lower, upper = self.lower_, self.upper_
        else:
            lower, upper = self._learn_bounds(X)
        X = X.copy()
        X[self.column] = X[self.column].clip(lower=lower, upper=upper)
        return X

# 4. Standardization transformer (z-score)
class ZScoreStandardizer(BaseEstimator, TransformerMixin):
    """Standardize a single named column with a ``StandardScaler``.

    Only ``column`` is rewritten; everything else in the input is returned
    as it came in (on a copy).
    """

    def __init__(self, column):
        self.scaler = StandardScaler()
        self.column = column

    def fit(self, X, y=None):
        """Fit the internal scaler on ``X[[column]]`` and return ``self``."""
        # Double brackets select a one-column 2-D slice rather than a 1-D one.
        target = X[[self.column]]
        self.scaler.fit(target)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``column`` holds the scaler's output."""
        scaled = X.copy()
        scaled[self.column] = self.scaler.transform(scaled[[self.column]])
        return scaled

#  Build the pipeline 

# Step order: cast to float, drop duplicate rows, standardize X1 as-is, then
# for each of X2..X6 winsorize with its own limits and standardize the result.
pipeline = Pipeline(
    [
        ("to_float", ToFloatTransformer()),
        ("drop_duplicates", DropDuplicatesTransformer()),
        ("std_X1", ZScoreStandardizer("X1")),
    ]
    + [
        step
        for col, tail_limits in (
            ("X2", (0.05, 0.05)),
            ("X3", (0.10, 0.10)),
            ("X4", (0.05, 0.05)),
            ("X5", (0.10, 0.10)),
            ("X6", (0.05, 0.05)),
        )
        # Each feature contributes a "winso_<col>" step followed by "std_<col>".
        for step in (
            (f"winso_{col}", Winsorizer(col, limits=tail_limits)),
            (f"std_{col}", ZScoreStandardizer(col)),
        )
    ]
)

# Apply pipeline
# fit_transform fits the pipeline on df and returns the transformed data in
# a single call.
df_transformed = pipeline.fit_transform(df)

# Save pipeline for reuse
# The file name is relative, so it is written to the current working directory.
# NOTE(review): the custom transformer classes are defined inside this
# notebook; loading the pickle from another process presumably needs the same
# class definitions to be available there - confirm before sharing the file.
joblib.dump(pipeline, "preprocessing_pipeline.pkl")
print("Pipeline saved as preprocessing_pipeline.pkl")

Pipeline saved as preprocessing_pipeline.pkl


In [21]:
# Preview the first rows of the preprocessed data. In the output below the
# index jumps from 3 to 5: raw row 4 was identical to row 3, so the
# duplicate-removal step dropped it and the original index labels were kept.
df_transformed.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,0.0,-1.623831,0.5012,-0.361376,0.290794,-1.552892,-0.310228
1,0.0,-1.623831,-0.4833,-0.361376,1.475509,0.32423,-1.622731
2,1.0,0.857022,0.5012,-0.361376,-0.893921,-0.614331,1.002275
3,0.0,0.857022,1.4857,-0.361376,-0.893921,-0.614331,1.002275
5,1.0,0.857022,1.4857,-0.361376,1.475509,1.262791,1.002275


In [23]:
# Save DataFrame as pickle

# Resolve the relative file name against the current working directory so the
# location printed below is unambiguous.
# NOTE: this rebinds `file_path`, which earlier in the notebook held the
# location of the source CSV.
file_path = os.path.abspath("df_transformed.pkl")
df_transformed.to_pickle(file_path)

# Print the full path
print(f"DataFrame saved at: {file_path}")

DataFrame saved at: C:\Users\USER\df_transformed.pkl
