In [27]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression




## Exercise 1: Imputer 1


In [None]:
train_data = [[7, 6, 5],
              [4, np.nan, 5],
              [1, 20, 8]]


imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit(train_data)

X_imputed.statistics_

In [None]:
train_data_imputed = imputer.transform(train_data)

train_data_imputed


In [None]:
test_data = [[np.nan, 1, 2],
             [7, np.nan, 9],
             [np.nan, 2, 4]]

test_data_imputed = imputer.transform(test_data)

test_data_imputed

## Exercise 2: Scaler

In [None]:
X_train = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_train_scaled

In [None]:
X_test = np.array([[ 2., -1.,  1.],
                     [ 3.,  3.,  -1.],
                     [ 1.,  1., 1.]])

X_text_scaled = scaler.transform(X_test)

X_text_scaled

## Exercise 3: One hot Encoder

In [None]:
X_train = [['Python'], ['Java'], ['Java'], ['C++']]


encoder = OneHotEncoder(handle_unknown='ignore') 

X_train_encoded = encoder.fit_transform(X_train).toarray()

categories = encoder.categories_[0]  # Extracting the list of categories for the first (and only) feature

# Creating the DataFrame
X_train_df_categories = pd.DataFrame(X_train_encoded, columns=categories)

X_train_df_categories


In [None]:
X_test = [['Python'], ['Java'], ['C'], ['C++']]

X_test_encoded = encoder.transform(X_test).toarray()

test_categories = encoder.categories_[0]

X_test_df_categories = pd.DataFrame(X_test_encoded, columns=test_categories)

X_test_df_categories


## Exercise 4: Ordinal Encoder

In [None]:
X_train = [['good'], ['bad'], ['neutral']]

encoder = OrdinalEncoder(categories=[['bad', 'neutral', 'good']])

X_train_encoded = encoder.fit_transform(X_train)

X_train_encoded

In [None]:
X_test = [['good'], ['good'], ['bad']]

X_test_encoded = encoder.transform(X_test)

X_test_encoded

## Exercise 5: Categorical variables

In [8]:
column_names = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 
                'deg-malig', 'breast', 'breast-quad', 'irradiat', 'Class']


df = pd.read_csv('breast-cancer.csv', names=column_names)

df = df.drop(columns=['Class'])
# Drop NaN values
df = df.dropna()

# Split the data into train and test sets
X_train, X_test = train_test_split(df, test_size=0.20, random_state=43)

# Count the number of unique values per feature in the train set
unique_values_per_feature = X_train.nunique()

unique_values_per_feature

age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64

In [9]:
ohe = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

ohe_cols = ['node-caps','breast', 'breast-quad','irradiat']

ohe.fit(X_train[ohe_cols])

X_test_encoded = ohe.transform(X_test[ohe_cols])

# Display the first 10 rows of the transformed test set and the feature names
X_test_encoded[:10], ohe.get_feature_names_out(ohe_cols)


(array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
        [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
        [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
        [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
        [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
        [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.]]),
 array(['node-caps_no', 'node-caps_yes', 'breast_left', 'breast_right',
        'breast-quad_central', 'breast-quad_left_low',
        'breast-quad_left_up', 'breast-quad_right_low',
        'breast-quad_right_up', 'irradiat_no', 'irradiat_yes'],
       dtype=object))

In [24]:
# Specifying the order for each ordinal feature
categories_order = [
    ['lt40', 'premeno', 'ge40'],  # menopause
    ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99'],  # age
    ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59'],  # tumor-size
    ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23', '24-26', '27-29', '30-32', '33-35', '36-39'],  # inv-nodes
    [1, 2, 3]  # deg-malig
]

# Create an OrdinalEncoder with the specified categories
oe = OrdinalEncoder(categories=categories_order)

# Specifying the ordinal columns
ordinal_cols = ["menopause", "age", "tumor-size", "inv-nodes", "deg-malig"]

# Fit the encoder on the train set
oe.fit(X_train[ordinal_cols])

# Transform the test set
X_test_ordinal_encoded = oe.transform(X_test[ordinal_cols])

# Display the first 5 rows of the transformed test set
X_test_ordinal_encoded[:10]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [14]:
# Combine the two encoders using make_column_transformer
# Note: We use the previously defined OneHotEncoder `ohe` and OrdinalEncoder `oe_corrected`
column_transformer = make_column_transformer(
    (ohe, ohe_cols),
    (oe, ordinal_cols),
    remainder='passthrough'
)

# Fit the column transformer on the train set
column_transformer.fit(X_train)

# Transform the test set
X_test_transformed = column_transformer.transform(X_test)

# Display the first 5 rows of the transformed test set
X_test_transformed[:5]



array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 4., 2., 0., 1.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 4., 2., 0., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 2., 4., 4., 5., 2.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 3., 5., 1., 1.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 2., 4., 5., 0., 2.]])

## Exercise 6: Pipeline


In [21]:
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris['data'], iris['target']

#add missing values
X[[1,20,50,100,135], 0] = np.nan
X[[2,5,88,135], 1] = np.nan
X[[4,15], 2] = np.nan
X[[40,135], 3] = np.nan

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=43)

In [28]:
# Create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

In [30]:
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
score = pipeline.score(X_test, y_test)

score

0.98