In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Read the passenger table from titanic.csv in the working directory.
titanic = pd.read_csv('titanic.csv')

# Create FamilySize feature: SibSp + Parch + 1
# (the extra 1 presumably counts the passenger themselves -- confirm).
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch']).add(1)

# Show the two inputs beside the derived column, then the derived column alone.
family_preview = titanic.loc[:, ['SibSp', 'Parch', 'FamilySize']].head()
print(family_preview)
print(titanic.loc[:, ['FamilySize']])

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
     FamilySize
0             2
1             2
2             1
3             2
4             1
..          ...
886           1
887           1
888           4
889           1
890           1

[891 rows x 1 columns]


Column Transformer

In [39]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# ColumnTransformer was used below without being imported anywhere up to this
# point, so the cell failed with NameError on a fresh kernel.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the raw table from titanic.csv in the working directory.
df = pd.read_csv('titanic.csv')

# Select only relevant columns
df = df[['Age', 'Fare', 'SibSp', 'Parch', 'Sex', 'Embarked']]

numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Embarked']

# Mean-impute the numeric columns and one-hot encode the categorical ones.
# Note: the step named 'age_imputer' is applied to every numeric feature,
# not only to Age.
preprocessor = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), numeric_features),
    ('ohe', OneHotEncoder(), categorical_features)
], remainder='passthrough')


transformed_data = preprocessor.fit_transform(df)

# No column names are supplied here, so the frame gets positional labels.
transformed_df = pd.DataFrame(transformed_data)


print(transformed_df)

             0        1    2    3    4    5    6    7    8    9
0    22.000000   7.2500  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0
1    38.000000  71.2833  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0
2    26.000000   7.9250  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0
3    35.000000  53.1000  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0
4    35.000000   8.0500  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0
..         ...      ...  ...  ...  ...  ...  ...  ...  ...  ...
886  27.000000  13.0000  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0
887  19.000000  30.0000  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0
888  29.699118  23.4500  1.0  2.0  1.0  0.0  0.0  0.0  1.0  0.0
889  26.000000  30.0000  0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0
890  32.000000   7.7500  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0

[891 rows x 10 columns]


In [40]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the raw table from titanic.csv in the working directory.
df = pd.read_csv('titanic.csv')

numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Embarked']

# Select only relevant columns
df = df[numeric_features + categorical_features]

# Mean-impute the numeric columns; one-hot encode the categorical columns.
transformer_specs = [
    ('age_imputer', SimpleImputer(strategy='mean'), numeric_features),
    ('ohe', OneHotEncoder(), categorical_features),
]
preprocessor = ColumnTransformer(transformers=transformer_specs, remainder='passthrough')

transformed_data = preprocessor.fit_transform(df)

numeric_columns = numeric_features
fitted_ohe = preprocessor.named_transformers_['ohe']
ohe_columns = fitted_ohe.get_feature_names_out(categorical_features)

# Create DataFrame with proper column names: numeric columns first, then the
# one-hot columns, then any column handled by neither transformer.
handled_columns = numeric_features + categorical_features
passthrough_columns = [col for col in df.columns if col not in handled_columns]
all_columns = [*numeric_features, *ohe_columns, *passthrough_columns]
transformed_df = pd.DataFrame(transformed_data, columns=all_columns)

print(transformed_df)

           Age     Fare  SibSp  Parch  Sex_female  Sex_male  Embarked_C  \
0    22.000000   7.2500    1.0    0.0         0.0       1.0         0.0   
1    38.000000  71.2833    1.0    0.0         1.0       0.0         1.0   
2    26.000000   7.9250    0.0    0.0         1.0       0.0         0.0   
3    35.000000  53.1000    1.0    0.0         1.0       0.0         0.0   
4    35.000000   8.0500    0.0    0.0         0.0       1.0         0.0   
..         ...      ...    ...    ...         ...       ...         ...   
886  27.000000  13.0000    0.0    0.0         0.0       1.0         0.0   
887  19.000000  30.0000    0.0    0.0         1.0       0.0         0.0   
888  29.699118  23.4500    1.0    2.0         1.0       0.0         0.0   
889  26.000000  30.0000    0.0    0.0         0.0       1.0         1.0   
890  32.000000   7.7500    0.0    0.0         0.0       1.0         0.0   

     Embarked_Q  Embarked_S  Embarked_nan  
0           0.0         1.0           0.0  
1          

In [55]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

# Read titanic.csv (path relative to the working directory) into `data`,
# the frame passed to the ColumnTransformer later in this cell.
data = pd.read_csv('titanic.csv')

def impute_embarked(X):
    """Fill missing 'Embarked' values with the column's most frequent value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains an 'Embarked' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which missing 'Embarked' entries are replaced by
        the mode of that column. When the column has no non-missing values
        there is nothing to impute with, so the copy is returned unchanged.
    """
    # Work on a copy so the caller's frame is never modified in place.
    X = X.copy()
    modes = X['Embarked'].mode()
    # mode() is empty when every value is missing; indexing it would fail.
    if not modes.empty:
        X['Embarked'] = X['Embarked'].fillna(modes.iloc[0])
    return X

# Three branches: mean-impute Age, mode-fill Embarked, one-hot encode
# Sex/Embarked. Each ColumnTransformer branch receives the original input
# columns, so the 'ohe' branch does not see the output of 'embarked_imputer'.
ohe_inputs = ['Sex', 'Embarked']
preprocessor = ColumnTransformer(
    transformers=[
        ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
        ('embarked_imputer', FunctionTransformer(impute_embarked), ['Embarked']),
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_inputs),
    ],
    remainder='passthrough',
)

transformed_data = preprocessor.fit_transform(data)

# Extract column names
ohe = preprocessor.named_transformers_['ohe']
ohe_features = ohe.get_feature_names_out(ohe_inputs)

# Combine all column names, in the order the transformers are listed above
consumed_columns = ('Age', 'Embarked', 'Sex')
leftover_columns = [col for col in data.columns if col not in consumed_columns]  # remainder='passthrough'
all_columns = ['Age', 'Embarked', *ohe_features, *leftover_columns]

transformed_df = pd.DataFrame(transformed_data, columns=all_columns)

print("\nTransformed DataFrame with Proper Column Names:")
print(transformed_df.head())


Transformed DataFrame with Proper Column Names:
    Age Embarked Sex_female Sex_male Embarked_C Embarked_Q Embarked_S  \
0  22.0        S        0.0      1.0        0.0        0.0        1.0   
1  38.0        C        1.0      0.0        1.0        0.0        0.0   
2  26.0        S        1.0      0.0        0.0        0.0        1.0   
3  35.0        S        1.0      0.0        0.0        0.0        1.0   
4  35.0        S        0.0      1.0        0.0        0.0        1.0   

  Embarked_nan PassengerId Survived Pclass  \
0          0.0           1        0      3   
1          0.0           2        1      1   
2          0.0           3        1      3   
3          0.0           4        1      1   
4          0.0           5        0      3   

                                                Name SibSp Parch  \
0                            Braund, Mr. Owen Harris     1     0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...     1     0   
2                             H

Pipeline

In [60]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Read titanic.csv (path relative to the working directory) into `data`,
# the frame passed to the ColumnTransformer later in this cell.
data = pd.read_csv('titanic.csv')

def impute_embarked(X):
    """Fill missing 'Embarked' values with the column's most frequent value.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains an 'Embarked' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which missing 'Embarked' entries are replaced by
        the mode of that column. When the column has no non-missing values
        there is nothing to impute with, so the copy is returned unchanged.
    """
    # Work on a copy so the caller's frame is never modified in place.
    X = X.copy()
    modes = X['Embarked'].mode()
    # mode() is empty when every value is missing; indexing it would fail.
    if not modes.empty:
        X['Embarked'] = X['Embarked'].fillna(modes.iloc[0])
    return X

# Embarked goes through a two-step pipeline: fill missing values first, then
# one-hot encode the filled column.
embarked_steps = [
    ('imputer', FunctionTransformer(impute_embarked)),
    ('onehot', OneHotEncoder()),
]

preprocessor = ColumnTransformer(
    transformers=[
        ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
        ('embarked_encoder', Pipeline(steps=embarked_steps), ['Embarked']),
        ('ohe', OneHotEncoder(), ['Sex']),
    ],
    remainder='passthrough',
)

# Transform the data
transformed_data = preprocessor.fit_transform(data)

# No column names are supplied, so the frame gets positional labels.
transformed_df = pd.DataFrame(transformed_data)

print("\nTransformed DataFrame:")
print(transformed_df)


Transformed DataFrame:
            0    1    2    3    4    5    6  7  8   \
0         22.0  0.0  0.0  1.0  0.0  1.0    1  0  3   
1         38.0  1.0  0.0  0.0  1.0  0.0    2  1  1   
2         26.0  0.0  0.0  1.0  1.0  0.0    3  1  3   
3         35.0  0.0  0.0  1.0  1.0  0.0    4  1  1   
4         35.0  0.0  0.0  1.0  0.0  1.0    5  0  3   
..         ...  ...  ...  ...  ...  ...  ... .. ..   
886       27.0  0.0  0.0  1.0  0.0  1.0  887  0  2   
887       19.0  0.0  0.0  1.0  1.0  0.0  888  1  1   
888  29.699118  0.0  0.0  1.0  1.0  0.0  889  0  3   
889       26.0  1.0  0.0  0.0  0.0  1.0  890  1  1   
890       32.0  0.0  1.0  0.0  0.0  1.0  891  0  3   

                                                    9  10 11  \
0                              Braund, Mr. Owen Harris  1  0   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  1  0   
2                               Heikkinen, Miss. Laina  0  0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  1  0   
4      

### **Transformers on Heart Disease Dataset**

In [72]:
import pandas as pd

# Read heart_disease.csv and list its column labels.
heart_df = pd.read_csv('heart_disease.csv')
heart_columns = heart_df.columns
print(heart_columns)


Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')


In [69]:
# Read heart_disease.csv and report the dtype pandas inferred for each column.
heart_df = pd.read_csv('heart_disease.csv')
heart_dtypes = heart_df.dtypes
print(heart_dtypes)

State                         object
Sex                           object
GeneralHealth                 object
PhysicalHealthDays           float64
MentalHealthDays             float64
LastCheckupTime               object
PhysicalActivities            object
SleepHours                   float64
RemovedTeeth                  object
HadHeartAttack                object
HadAngina                     object
HadStroke                     object
HadAsthma                     object
HadSkinCancer                 object
HadCOPD                       object
HadDepressiveDisorder         object
HadKidneyDisease              object
HadArthritis                  object
HadDiabetes                   object
DeafOrHardOfHearing           object
BlindOrVisionDifficulty       object
DifficultyConcentrating       object
DifficultyWalking             object
DifficultyDressingBathing     object
DifficultyErrands             object
SmokerStatus                  object
ECigaretteUsage               object
C

Column Transformer

In [73]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load dataset
df = pd.read_csv('heart_disease.csv')

# Select relevant columns
numeric_features = [
    'PhysicalHealthDays', 'MentalHealthDays', 'SleepHours',
    'HeightInMeters', 'WeightInKilograms', 'BMI',
]
categorical_features = ['Sex', 'GeneralHealth', 'RaceEthnicityCategory', 'SmokerStatus']

selected_columns = numeric_features + categorical_features
df = df[selected_columns]

# Mean-impute the numeric columns; one-hot encode the categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_imputer', SimpleImputer(strategy='mean'), numeric_features),
        ('ohe', OneHotEncoder(), categorical_features),
    ],
    remainder='passthrough',
)

transformed_data = preprocessor.fit_transform(df)

# Column labels: numeric names first, then the encoder's generated names,
# matching the order of the transformers above.
fitted_ohe = preprocessor.named_transformers_['ohe']
ohe_columns = fitted_ohe.get_feature_names_out(categorical_features)

all_columns = [*numeric_features, *ohe_columns]
transformed_df = pd.DataFrame(transformed_data, columns=all_columns)

print(transformed_df.head())


   PhysicalHealthDays  MentalHealthDays  SleepHours  HeightInMeters  \
0                 0.0               0.0         8.0        1.702691   
1                 0.0               0.0         6.0        1.600000   
2                 2.0               3.0         5.0        1.570000   
3                 0.0               0.0         7.0        1.650000   
4                 2.0               0.0         9.0        1.570000   

   WeightInKilograms        BMI  Sex_Female  Sex_Male  \
0           83.07447  28.529842         1.0       0.0   
1           68.04000  26.570000         1.0       0.0   
2           63.50000  25.610000         1.0       0.0   
3           63.50000  23.300000         1.0       0.0   
4           53.98000  21.770000         1.0       0.0   

   GeneralHealth_Excellent  GeneralHealth_Fair  ...  \
0                      0.0                 0.0  ...   
1                      1.0                 0.0  ...   
2                      0.0                 0.0  ...   
3         

Function Transformer


In [91]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

# Read heart_disease.csv (path relative to the working directory) into `data`,
# the frame passed to the ColumnTransformer later in this cell.
data = pd.read_csv('heart_disease.csv')

def impute_race(X):
    """Fill missing 'RaceEthnicityCategory' values with the column's mode.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains a 'RaceEthnicityCategory' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which missing 'RaceEthnicityCategory' entries are
        replaced by the most frequent value of that column. When the column
        has no non-missing values there is nothing to impute with, so the
        copy is returned unchanged.
    """
    # Work on a copy so the caller's frame is never modified in place.
    X = X.copy()
    modes = X['RaceEthnicityCategory'].mode()
    # mode() is empty when every value is missing; indexing it would fail.
    if not modes.empty:
        X['RaceEthnicityCategory'] = X['RaceEthnicityCategory'].fillna(modes.iloc[0])
    return X

# Three branches: mean-impute SleepHours, mode-fill RaceEthnicityCategory,
# one-hot encode Sex/RaceEthnicityCategory. Each ColumnTransformer branch
# receives the original input columns, so the 'ohe' branch does not see the
# output of 'race_imputer'.
ohe_inputs = ['Sex', 'RaceEthnicityCategory']
preprocessor = ColumnTransformer(
    transformers=[
        ('sleep_imputer', SimpleImputer(strategy='mean'), ['SleepHours']),
        ('race_imputer', FunctionTransformer(impute_race), ['RaceEthnicityCategory']),
        ('ohe', OneHotEncoder(), ohe_inputs),
    ],
    remainder='passthrough',
)

transformed_data = preprocessor.fit_transform(data)

ohe = preprocessor.named_transformers_['ohe']
ohe_features = ohe.get_feature_names_out(ohe_inputs)

# Combine column names, in the order the transformers are listed above
consumed_columns = ('SleepHours', 'RaceEthnicityCategory', 'Sex')
leftover_columns = [col for col in data.columns if col not in consumed_columns]
all_columns = ['SleepHours', 'RaceEthnicityCategory', *ohe_features, *leftover_columns]

transformed_df = pd.DataFrame(transformed_data, columns=all_columns)
print("\nTransformed DataFrame with Proper Column Names:")
print(transformed_df.head())



Transformed DataFrame with Proper Column Names:
  SleepHours     RaceEthnicityCategory Sex_Female Sex_Male  \
0        8.0  White only, Non-Hispanic        1.0      0.0   
1        6.0  White only, Non-Hispanic        1.0      0.0   
2        5.0  White only, Non-Hispanic        1.0      0.0   
3        7.0  White only, Non-Hispanic        1.0      0.0   
4        9.0  White only, Non-Hispanic        1.0      0.0   

  RaceEthnicityCategory_Black only, Non-Hispanic  \
0                                            0.0   
1                                            0.0   
2                                            0.0   
3                                            0.0   
4                                            0.0   

  RaceEthnicityCategory_Hispanic  \
0                            0.0   
1                            0.0   
2                            0.0   
3                            0.0   
4                            0.0   

  RaceEthnicityCategory_Multiracial, Non-Hispani

Pipeline

In [88]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Read heart_disease.csv (path relative to the working directory).
data = pd.read_csv('heart_disease.csv')

# Columns that get mean-imputed by the preprocessor defined later in this cell.
numeric_columns = [
    'SleepHours',
    'HeightInMeters',
    'WeightInKilograms',
    'BMI',
    'PhysicalHealthDays',
    'MentalHealthDays',
]
# Columns that get one-hot encoded.
categorical_columns = [
    'RaceEthnicityCategory',
    'Sex',
]

def impute_race(X):
    """Fill missing 'RaceEthnicityCategory' values with the column's mode.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains a 'RaceEthnicityCategory' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` in which missing 'RaceEthnicityCategory' entries are
        replaced by the most frequent value of that column. When the column
        has no non-missing values there is nothing to impute with, so the
        copy is returned unchanged.
    """
    # Work on a copy so the caller's frame is never modified in place.
    X = X.copy()
    modes = X['RaceEthnicityCategory'].mode()
    # mode() is empty when every value is missing; indexing it would fail.
    if not modes.empty:
        X['RaceEthnicityCategory'] = X['RaceEthnicityCategory'].fillna(modes.iloc[0])
    return X

# RaceEthnicityCategory goes through a two-step pipeline: fill missing values
# first, then one-hot encode the filled column.
race_steps = [
    ('imputer', FunctionTransformer(impute_race)),
    ('onehot', OneHotEncoder()),
]

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_imputer', SimpleImputer(strategy='mean'), numeric_columns),
        ('race_encoder', Pipeline(steps=race_steps), ['RaceEthnicityCategory']),
        ('sex_encoder', OneHotEncoder(), ['Sex']),
    ],
    remainder='passthrough',
)

transformed_data = preprocessor.fit_transform(data)

# No column names are supplied, so the frame gets positional labels.
transformed_df = pd.DataFrame(transformed_data)

print("\nTransformed DataFrame:")
print(transformed_df.head())



Transformed DataFrame:
    0         1         2          3    4    5    6    7    8    9   ...  \
0  8.0  1.702691  83.07447  28.529842  0.0  0.0  0.0  0.0  0.0  0.0  ...   
1  6.0       1.6     68.04      26.57  0.0  0.0  0.0  0.0  0.0  0.0  ...   
2  5.0      1.57      63.5      25.61  2.0  3.0  0.0  0.0  0.0  0.0  ...   
3  7.0      1.65      63.5       23.3  0.0  0.0  0.0  0.0  0.0  0.0  ...   
4  9.0      1.57     53.98      21.77  2.0  0.0  0.0  0.0  0.0  0.0  ...   

                                          35   36               37   38  39  \
0                     Not at all (right now)   No  Age 80 or older   No  No   
1  Never used e-cigarettes in my entire life   No  Age 80 or older   No  No   
2  Never used e-cigarettes in my entire life   No     Age 55 to 59   No  No   
3  Never used e-cigarettes in my entire life  Yes              NaN   No  No   
4  Never used e-cigarettes in my entire life  Yes     Age 40 to 44  Yes  No   

    40   41                                 