In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Create a sample DataFrame
data = {'Category': ['A', 'B', 'A', 'C'],
        'Value': [10, 20, 30, 40],
        'Status': ['Active', 'Inactive', 'Active', 'Active']}

df = pd.DataFrame(data)

print(df)

# Define which columns to encode
columns_to_encode = ['Category', 'Status']

# Create a ColumnTransformer
# The transformers parameter takes a list of tuples where each tuple consists of a name, a transformer, and a list of columns
# In this case, we apply OneHotEncoder to the specified columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), columns_to_encode)
    ],
    remainder='passthrough'  # This means that the remaining columns will not be affected
)

# Fit and transform the data
transformed_data = preprocessor.fit_transform(df)

transformed_data = pd.DataFrame(transformed_data)

# Print the transformed data
print(transformed_data)


  Category  Value    Status
0        A     10    Active
1        B     20  Inactive
2        A     30    Active
3        C     40    Active
     0    1    2    3    4     5
0  1.0  0.0  0.0  1.0  0.0  10.0
1  0.0  1.0  0.0  0.0  1.0  20.0
2  1.0  0.0  0.0  1.0  0.0  30.0
3  0.0  0.0  1.0  1.0  0.0  40.0


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

# Create a sample DataFrame
data = {'Category': ['A', 'B', 'A', 'C'],
        'Value': [10, 20, 30, 40],
        'Status': ['Active', 'Inactive', 'Active', 'Active']}

df = pd.DataFrame(data)

# print(df.columns)
cat_features = ['Category', 'Status']

# separate numerical and categorical columns
df_cat = df[cat_features].copy()
df_num = df.drop(cat_features, axis=1)

# get the column names
cat_feature_names = df_cat.columns
num_feature_names = df_num.columns

# print(df_cat)
# print(df_num)
# print("Cat names: ", df_cat.columns)

# Define which columns to encode
columns_to_encode = ['Category', 'Status']

#fit encoder
enc = OneHotEncoder()
enc.fit(df_cat)

#transform categorical features
X_encoded = enc.transform(df_cat).toarray()

#create feature matrix
new_cat_feature_names = enc.get_feature_names_out(cat_feature_names)

df_cat_encoded = pd.DataFrame(X_encoded, columns= new_cat_feature_names)

print(f"Values for df_cat_encoded:\n {df_cat_encoded}")

# Initialize the StandardScaler for df_cat_encoded
scaler_df_cat_encoded = StandardScaler()

# Fit and transform the data
scaled_data = scaler_df_cat_encoded.fit_transform(df_cat_encoded)

# Convert the scaled data back to a dataframe
scaled_df_cat = pd.DataFrame(scaled_data, columns=new_cat_feature_names)

print(f"Values for scaled_df_cat:\n {scaled_df_cat}")


# Initialize the StandardScaler for df_num
scaler_df_num = StandardScaler()

# Fit and transform the data
scaled_data = scaler_df_num.fit_transform(df_num)

# Convert the scaled data back to a dataframe
scaled_df_num = pd.DataFrame(scaled_data, columns=num_feature_names)

print(f"Values for scaled_df_num:\n {scaled_df_num}")

combined_df = pd.concat([scaled_df_cat, scaled_df_num], axis=1)

print(f"Values for combined_df:\n {combined_df}")



# combined_df = pd.concat([X, df_num], axis=1)

# print(f"Values for combined_df:\n {combined_df}")

# # Extract the column names
# columns = combined_df.columns

# # Initialize the StandardScaler
# scaler = StandardScaler()

# # Fit and transform the data
# scaled_data = scaler.fit_transform(combined_df)

# # Convert the scaled data back to a dataframe
# scaled_df = pd.DataFrame(scaled_data, columns=columns)

# print(f"Values for scaled_df:\n {scaled_df}")


Values for df_cat_encoded:
    Category_A  Category_B  Category_C  Status_Active  Status_Inactive
0         1.0         0.0         0.0            1.0              0.0
1         0.0         1.0         0.0            0.0              1.0
2         1.0         0.0         0.0            1.0              0.0
3         0.0         0.0         1.0            1.0              0.0
Values for scaled_df_cat:
    Category_A  Category_B  Category_C  Status_Active  Status_Inactive
0         1.0   -0.577350   -0.577350       0.577350        -0.577350
1        -1.0    1.732051   -0.577350      -1.732051         1.732051
2         1.0   -0.577350   -0.577350       0.577350        -0.577350
3        -1.0   -0.577350    1.732051       0.577350        -0.577350
Values for scaled_df_num:
       Value
0 -1.341641
1 -0.447214
2  0.447214
3  1.341641
Values for combined_df:
    Category_A  Category_B  Category_C  Status_Active  Status_Inactive  \
0         1.0   -0.577350   -0.577350       0.577350        