# Feature Engineering II
putting things together

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline


### 1. Load the data

In [2]:
df = pd.read_csv('penguins_simple.csv', sep=';')
df.shape

(333, 6)

### 2. Train-Test Split

In [3]:
X = df.iloc[:, 1:]
y = df['Species']

In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((249, 5), (84, 5), (249,), (84,))

### 3. Define a ColumnTransformer

In [37]:
pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'), 
    KBinsDiscretizer(n_bins=5, *, encode='onehot', strategy='uniform'),
)

In [38]:
trans = ColumnTransformer([
    ('kristians_onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Sex']),
    ('kristians_scale', MinMaxScaler(), ['Body Mass (g)', 'Culmen Depth (mm)']),
    ('impute_then_scale', pipeline, ['Flipper Length (mm)']),
    ('do_nothing', 'passthrough', ['Culmen Length (mm)']),
])

### 4. fit + transform training data

In [39]:
trans.fit(Xtrain)

Xtrain_transformed = trans.transform(Xtrain)  # result is a single numpy array
Xtrain_transformed.shape

(249, 6)

### 5. fit a LogReg model

In [40]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(Xtrain_transformed, ytrain)

LogisticRegression(max_iter=1000)

### 6. transform test data

In [41]:
Xtest_transform = trans.transform(Xtest)
Xtest_transform.shape

(84, 6)

### 7. predict

In [42]:
ypred = model.predict(Xtest_transform)
ypred[:5]

array(['Adelie', 'Gentoo', 'Adelie', 'Chinstrap', 'Adelie'], dtype=object)