# Answer1

In [1]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd
# Load seaborn's "tips" example dataset into a DataFrame.
df = sns.load_dataset(name='tips')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
# Treat 'smoker' as the label to predict; every remaining column is a feature.
y = df['smoker']
X = df.drop(columns=['smoker'])
X

Unnamed: 0,total_bill,tip,sex,day,time,size
0,16.99,1.01,Female,Sun,Dinner,2
1,10.34,1.66,Male,Sun,Dinner,3
2,21.01,3.50,Male,Sun,Dinner,3
3,23.68,3.31,Male,Sun,Dinner,2
4,24.59,3.61,Female,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,Sat,Dinner,3
240,27.18,2.00,Female,Sat,Dinner,2
241,22.67,2.00,Male,Sat,Dinner,2
242,17.82,1.75,Male,Sat,Dinner,2


In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Column groups: each group is sent through its own preprocessing branch.
# Numeric features
num_cols = [
    'total_bill',
    'tip',
    'size',
]
# Categorical features
cat_cols = [
    'sex',
    'time',
    'day',
]

## Feature Selection Pipeline:

We create a pipeline for feature selection using SelectFromModel with a RandomForestClassifier. This step is optional and can be adjusted based on your specific needs.

In [11]:
# Stand-alone feature-selection stage: SelectFromModel wrapped around a seeded
# 100-tree random forest.
# NOTE(review): no later cell in view references this object -- confirm whether
# it was meant to be a step of the final model pipeline.
feature_selection_pipeline = Pipeline(
    steps=[
        (
            'feature_selection',
            SelectFromModel(
                estimator=RandomForestClassifier(n_estimators=100, random_state=42)
            ),
        ),
    ]
)

## Numerical Pipeline:

We create a pipeline for numerical features, including imputation with the median and standard scaling.

In [12]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical Pipeline:

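We create a pipeline for categorical features, including imputation with the most frequent value and one-hot encoding. No category is dropped here (the encoder's `drop` argument is left unset); pass `drop='first'` to `OneHotEncoder` if you need to avoid the dummy variable trap, e.g. when feeding a linear model.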

In [13]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' makes the encoder emit all zeros for a category that
# was not present when the encoder was fitted, instead of raising at
# transform/predict time (the encoder's default is handle_unknown='error').
# Output is unchanged whenever every category was seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehotencoder',OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [14]:
# Route each column group through its own branch: the columns in num_cols go
# through num_pipeline and the columns in cat_cols go through cat_pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [20]:
# End-to-end model: the column preprocessor followed by a seeded 100-tree
# random forest classifier.
# Note: these are the only two steps -- feature_selection_pipeline is not part
# of this pipeline.
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [21]:
# Fit the preprocessing steps and the classifier on the training split.
full_pipeline.fit(X=X_train, y=y_train)

In [23]:
# Predict labels for the held-out rows.
y_pred = full_pipeline.predict(X=X_test)

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.67


## Interpretation:
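The pipeline handles missing values, scales numerical features, encodes categorical features, and trains a Random Forest Classifier. Note that the optional feature-selection pipeline defined earlier is not one of the steps of `full_pipeline`, so no feature selection is applied to this model. The accuracy on the test set gives an indication of how well the model performs.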

## Possible Improvements:

- Fine-tune hyperparameters for both the imputers and the classifier.
- Experiment with different feature selection methods.
- Explore other models or ensemble methods.
- Handle class imbalance if present in the target variable.
- Perform cross-validation for a more robust evaluation.

# Answer2

In [26]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Iris serves as the example data here (swap in your own dataset).
data = load_iris()
X = data.data
y = data.target

# Keep 20% of the samples aside for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base learners that will be combined by the ensemble.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
logistic_regression_classifier = LogisticRegression(random_state=42)

# 'hard' voting combines the predicted class labels of the base learners
# ('soft' voting is the alternative).
# NOTE(review): with only two base learners a disagreement is a tie -- confirm
# the tie-breaking behavior is acceptable.
voting_classifier = VotingClassifier(
    estimators=[
        ('rf', random_forest_classifier),
        ('lr', logistic_regression_classifier),
    ],
    voting='hard',
)

# Single-step pipeline that wraps the ensemble.
pipeline = Pipeline(
    steps=[
        ('voting_classifier', voting_classifier),
    ]
)

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = pipeline.predict(X_test)

# Report the fraction of correct predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
