# Combine ColumnTransformer and Pipeline

## First Example: ChatGPT (adjusted), simple hand-written data

In [7]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [34]:
# Example dataset with one numerical feature (age) and two categorical
# features (gender, profession).
# NOTE: OneHotEncoder's default handle_unknown='error' raises at transform
# time when the test split contains a category that was absent from the
# training split. That is easy to hit on a dataset this small, and it is the
# actual reason behind the original remark that categorical features "only
# work if the value exists more than once".
records = [
    [25, 'Male', 'Engineer'],
    [30, 'Female', 'Teacher'],
    [35, 'Male', 'Doctor'],
    [40, 'Female', 'Engineer'],
    [40, 'Female', 'Teacher'],
    [40, 'Female', 'Engineer'],
]

# Build the frame in one expression (features plus the binary label `y`)
# instead of mutating it after construction, so re-running the cell is
# idempotent. The raw rows get their own name so that `X` is only ever the
# feature DataFrame defined in the next cell.
df = pd.DataFrame(records, columns=['age', 'gender', 'profession']).assign(
    y=[0, 1, 1, 0, 1, 0]
)

df

Unnamed: 0,age,gender,profession,y
0,25,Male,Engineer,0
1,30,Female,Teacher,1
2,35,Male,Doctor,1
3,40,Female,Engineer,0
4,40,Female,Teacher,1
5,40,Female,Engineer,0


In [35]:
# Feature matrix: the three input columns, in their original order
X = df.loc[:, ['age', 'gender', 'profession']]
# Target vector: the binary label column
y = df.loc[:, 'y']
y

0    0
1    1
2    1
3    0
4    1
5    0
Name: y, dtype: int64

In [36]:
# Define the column transformer: standardise the numeric column and one-hot
# encode the categorical columns.
# handle_unknown='ignore' makes the encoder output all zeros for that
# feature's one-hot columns when it meets a category it did not see during
# fit, instead of raising (the default is handle_unknown='error'). On a tiny
# train/test split a category can easily end up only in the test rows.
preprocessor = ColumnTransformer([
    ('numeric', StandardScaler(), ['age']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['gender', 'profession'])
])

In [37]:
# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at predict time
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [38]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Fit the whole pipeline on the training rows, then predict labels for the
# held-out rows (fit returns the fitted pipeline, so the calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [40]:
# Fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5


## Second Example: ChatGPT (not adjusted) with randomly generated data

In [41]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Seed NumPy's global random state so the synthetic data is reproducible
np.random.seed(42)

# Number of synthetic rows to generate
n_instances = 100

# Draw every column from the global random state. The order of these calls
# (age, gender, occupation, target) determines the values obtained for a
# given seed, so it must not be changed.
age = np.random.randint(low=18, high=65, size=n_instances)
gender = np.random.choice(['Male', 'Female'], size=n_instances)
occupation = np.random.choice(['Engineer', 'Doctor', 'Teacher'], size=n_instances)
target = np.random.randint(low=0, high=2, size=n_instances)

# Assemble the columns into a DataFrame
data = dict(age=age, gender=gender, occupation=occupation, target=target)
df = pd.DataFrame(data)

# Features are every column except the label; the label is `target`
X = df.drop(columns='target')
y = df.loc[:, 'target']

# Define the column transformer: standardise the numeric column and one-hot
# encode the categorical columns.
# handle_unknown='ignore' makes the encoder output all zeros for that
# feature's one-hot columns when it meets a category it did not see during
# fit, instead of raising (the default is handle_unknown='error'). This keeps
# prediction from failing if a category is missing from the training split.
preprocessor = ColumnTransformer([
    ('numeric', StandardScaler(), ['age']),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['gender', 'occupation'])
])

# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at predict time
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training rows, then predict labels for the
# held-out rows (fit returns the fitted pipeline, so the calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
# The target above is drawn at random, independently of the features, so
# there is no real signal to learn and a score around chance level is expected.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.3


Unnamed: 0,age,gender,occupation,target
0,56,Female,Engineer,0
1,46,Female,Teacher,1
2,32,Male,Doctor,0
3,60,Male,Engineer,0
4,25,Male,Engineer,0
