In [None]:
## Q1. Automated Feature Engineering and Model Pipeline

#### Step-by-Step Solution

# 1. Import necessary libraries and load the dataset:
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.feature_selection import SelectFromModel
    from sklearn.metrics import accuracy_score

    # Read the raw table; the label is expected in a column named 'target'.
    df = pd.read_csv('your_dataset.csv')
    y = df['target']
    X = df.drop(columns=['target'])
    # Keep 20% of the rows aside for evaluation; the seed fixes the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    

# 2. Automated Feature Selection:
    feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    X_test_selected = feature_selector.transform(X_test)
    

# 3. Numerical Pipeline:

    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    

# 4. Categorical Pipeline:

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    

# 5. Combine Pipelines using ColumnTransformer:

    from sklearn.compose import make_column_selector as selector

    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, selector(dtype_exclude='object')),
        ('cat', categorical_pipeline, selector(dtype_include='object'))
    ])
    

# 6. Build the final pipeline with Random Forest Classifier:

    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    model_pipeline.fit(X_train, y_train)
    

# 7. Evaluate the model:

    # Score the fitted pipeline on the held-out split.
    y_pred = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Model Accuracy: {accuracy:.4f}')
    

#### Interpretation of the Results and Possible Improvements
- The model accuracy indicates how well the Random Forest Classifier performs on the test set.
- If the accuracy is satisfactory, the pipeline can be used for predictions. If not, consider the following improvements:
  - *Hyperparameter tuning*: Use GridSearchCV or RandomizedSearchCV to find the best parameters for the Random Forest Classifier.
  - *Feature Engineering*: Create new features based on domain knowledge.
  - *Handling Correlated Features*: Use PCA or remove highly correlated features before model training.


In [None]:
## Q2. Voting Classifier Pipeline on Iris Dataset

#### Step-by-Step Solution

# 1. Import necessary libraries and load the Iris dataset:
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import accuracy_score

    # Fetch the bundled Iris data and separate features from labels.
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Reserve 20% of the samples for testing; seeded so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    

# 2. Create Pipelines for Random Forest and Logistic Regression:
    def scaled_pipeline(classifier):
        """Return a Pipeline that standardizes features, then applies `classifier`."""
        return Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', classifier),
        ])

    # Both base models share the same two-step layout; only the estimator differs.
    rf_pipeline = scaled_pipeline(RandomForestClassifier(n_estimators=100, random_state=42))
    lr_pipeline = scaled_pipeline(LogisticRegression(random_state=42))
    

# 3. Combine the pipelines using a Voting Classifier:
    # Soft voting combines the members' predicted class probabilities.
    ensemble_members = [('rf', rf_pipeline), ('lr', lr_pipeline)]
    voting_classifier = VotingClassifier(
        estimators=ensemble_members,
        voting='soft',
    ).fit(X_train, y_train)
    

# 4. Evaluate the Voting Classifier:

    # Score the fitted ensemble on the held-out split.
    y_pred = voting_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Voting Classifier Accuracy: {accuracy:.4f}')
    

# Interpretation of the Results
# - The accuracy of the voting classifier indicates how well the combined predictions of the Random Forest and Logistic Regression models perform.
# - If the accuracy is satisfactory, the model can be used for predictions.
# - If not, consider the following improvements:
#  - Hyperparameter tuning: Optimize the parameters for each individual classifier.
#  - Different classifiers: Experiment with other classifiers to see if they improve the ensemble's performance.
#  - Feature Engineering: Enhance the dataset with additional relevant features or transformations.