**Q1. You are working on a machine learning project where you have a dataset containing numerical and
categorical features. You have identified that some of the features are highly correlated and there are
missing values in some of the columns. You want to build a pipeline that automates the feature
engineering process and handles the missing values?**

In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor


# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', RandomForestRegressor())])


pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

NameError: name 'X' is not defined

**Q2. Build a pipeline that includes a random forest classsfier and a logistic regression classifier, and then
use a voting classfier to combine their predictions. Train the pipeline on the iris dataset and evaluate its
accuracy.**

In [16]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
X,y = datasets.load_iris(return_X_y=True)
print(X.shape)
print(y.shape)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
preprocessor = StandardScaler()

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

rfc_p = Pipeline(steps=[('preprocess',preprocessor),('classifier',rf_classifier)])
lrc_p = Pipeline(steps=[('preprocess',preprocessor),('classifier',lr_classifier)])

vtc = VotingClassifier(estimators=[('rf', rfc_p), ('lr', lrc_p)], voting='soft')

vtc.fit(X_train,y_train)

preds = vtc.predict(X_test)
print(accuracy_score(y_test,preds))

(150, 4)
(150,)
0.9333333333333333
