# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Q1: Pipeline with Feature Engineering and Handling Missing Values

# Load dataset (iris is used as a stand-in; replace it with your own dataset
# containing numerical and categorical columns if needed).
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# Column groups routed to the numerical / categorical sub-pipelines.
numerical_features = list(iris.feature_names)
categorical_features = ['petal size']

# Feature engineering: build a categorical column by binning a measurement.
# It must NOT be derived from the target (e.g. the species name), otherwise the
# label leaks into the features and the reported accuracy is meaningless.
X['petal size'] = pd.cut(
    X['petal length (cm)'],
    bins=3,
    labels=['short', 'medium', 'long'],
).astype(object)  # plain strings so that np.nan can mark missing entries

# Simulate missing values in both numerical and categorical features.
# Note: .loc slices by label and includes both endpoints (11 rows each).
X.loc[0:10, 'sepal length (cm)'] = np.nan
X.loc[15:25, 'sepal width (cm)'] = np.nan
X.loc[30:40, 'petal size'] = np.nan

# Numerical pipeline: mean imputation followed by standardization.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Standardize the numerical features
])

# Categorical pipeline: most-frequent imputation followed by one-hot encoding.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Categories unseen during fit encode as all zeros
])

# Combine the numerical and categorical pipelines using ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Create a Random Forest Classifier model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Final pipeline: preprocessing and model are fitted together, so imputation
# and scaling statistics are learned from the training split only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Random Forest Classifier pipeline:", accuracy)

# Q2: Pipeline with Voting Classifier (Random Forest + Logistic Regression)

# Create individual models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(max_iter=200, random_state=42)

# Create a voting classifier that combines both models.
# voting='hard' predicts the majority class label among the base estimators.
voting_clf = VotingClassifier(
    estimators=[('rf', rf_model), ('lr', lr_model)],
    voting='hard',
)

# Create a pipeline that includes preprocessing and the voting classifier.
# Pipeline.fit fits its steps in place, so passing the same `preprocessor`
# object here would refit the transformer that the first pipeline also holds.
# clone() gives this pipeline its own unfitted copy with identical parameters.
voting_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', voting_clf)
])

# Train the voting classifier model
voting_pipeline.fit(X_train, y_train)

# Make predictions and evaluate the voting classifier
y_pred_voting = voting_pipeline.predict(X_test)
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print("\nAccuracy of the Voting Classifier pipeline:", accuracy_voting)

# Interpretation and Suggestions:
# - Both pipelines handle preprocessing (imputing missing values, scaling, and one-hot encoding) automatically as part of fit/predict.
# - In the Random Forest pipeline, imputation and encoding are applied per column type (numerical/categorical) via the ColumnTransformer.
# - The Voting Classifier combines two models (Random Forest and Logistic Regression) by majority vote; this can improve performance, but it is not guaranteed, so compare the two accuracies printed above.
# - For improvements, you could explore hyperparameter tuning (e.g., using GridSearchCV) or trying different imputation strategies (e.g., KNN imputation).
