In [2]:
#import libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling(std)
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.DataFrame({
    'B': [100, 30, np.nan, 80, 50, np.nan ,89],
    'C': ['white', 'yellow', 'black', 'black', 'white',np.nan ,'white'],
    'D': [np.nan, 0.07, 0.9, 0.22,0.57, np.nan, 0.2],
    'target': ['dog', 'cat', 'dog', 'dog', 'cat', 'dog' , "dog"]
    
})

df

Unnamed: 0,B,C,D,target
0,100.0,white,,dog
1,30.0,yellow,0.07,cat
2,,black,0.9,dog
3,80.0,black,0.22,dog
4,50.0,white,0.57,cat
5,,,,dog
6,89.0,white,0.2,dog


In [4]:
df.isna().any().any()

True

In [8]:
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent') 


preprocessor = ColumnTransformer(transformers=[
    ('num', num_imputer, ['B', 'D']),
    ('cat', cat_imputer, ['C'])
])


df[['B', 'D', 'C']] = preprocessor.fit_transform(df[['B', 'D', 'C']])
df

0    100.0
1     30.0
2     69.8
3     80.0
4     50.0
5     69.8
6     89.0
Name: B, dtype: object

In [5]:
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(transformers = [('num',num_imputer,['B','D']),('cat',cat_imputer,['C'])])

df[['B','D','C']] = preprocessor.fit_transform(df[['B','D','C']])
df

Unnamed: 0,B,C,D,target
0,100.0,white,0.392,dog
1,30.0,yellow,0.07,cat
2,69.8,black,0.9,dog
3,80.0,black,0.22,dog
4,50.0,white,0.57,cat
5,69.8,white,0.392,dog
6,89.0,white,0.2,dog


In [26]:
le = LabelEncoder()

df['C'] = le.fit_transform(df['C'])
df['target'] = le.fit_transform(df['target'])

In [27]:
df

Unnamed: 0,B,C,D,target
0,1.0,1,3.4,1
1,2.0,2,1.0,0
2,4.0,0,2.0,1
3,4.0,0,7.0,1
4,5.0,1,3.0,0
5,4.0,1,3.4,1
6,8.0,1,4.0,1


In [22]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[['C', 'target']])

feature_names = encoder.get_feature_names_out(['C', 'target'])

df_encoded = pd.DataFrame(encoded_features, columns=feature_names)


df_final = pd.concat([df.drop(['C', 'target'], axis=1), df_encoded], axis=1)

X = df_final.drop(['target_dog', 'target_cat'], axis=1)
y = df_final[['target_dog', 'target_cat']]
df_final

# Split the data into a training set and a test set


Unnamed: 0,B,D,C_black,C_white,C_yellow,target_cat,target_dog
0,100.0,0.392,0.0,1.0,0.0,0.0,1.0
1,30.0,0.07,0.0,0.0,1.0,1.0,0.0
2,69.8,0.9,1.0,0.0,0.0,0.0,1.0
3,80.0,0.22,1.0,0.0,0.0,0.0,1.0
4,50.0,0.57,0.0,1.0,0.0,1.0,0.0
5,69.8,0.392,0.0,1.0,0.0,0.0,1.0
6,89.0,0.2,0.0,1.0,0.0,0.0,1.0


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
X_train

Unnamed: 0,B,D,C_black,C_white,C_yellow
5,69.8,0.392,0.0,1.0,0.0
2,69.8,0.9,1.0,0.0,0.0
4,50.0,0.57,0.0,1.0,0.0
3,80.0,0.22,1.0,0.0,0.0
6,89.0,0.2,0.0,1.0,0.0


In [26]:
scaler = StandardScaler()
X_train_sk = scaler.fit_transform(X_train)
X_test_sk = scaler.transform(X_test)
X_train_sk

array([[-0.14757988, -0.24862856, -0.81649658,  0.81649658,  0.        ],
       [-0.14757988,  1.71260294,  1.22474487, -1.22474487,  0.        ],
       [-1.66949734,  0.4385746 , -0.81649658,  0.81649658,  0.        ],
       [ 0.63643821, -0.91266757,  1.22474487, -1.22474487,  0.        ],
       [ 1.32821888, -0.98988141, -0.81649658,  0.81649658,  0.        ]])