Q1. You are working on a machine learning project where you have a dataset containing numerical and
categorical features. You have identified that some of the features are highly correlated and there are
missing values in some of the columns. You want to build a pipeliine that automates the feature
engineering process and handles the missing valuesD

Design a p#pel#ne that includes the following steps"
Use an automated feature selection method to identify the important features in the datasetC
Create a numer#cal pipeline that includes the following steps"
Impute the missing values in the numerical columns using the mean of the column valuesC
Scale the numerical columns using standardisationC
Create a categorical pipeline that includes the following steps"
Impute the missing values in the categorical columns using the most frequent value of the columnC
One-hot encode the categorical columnsC
Combine the numerical and categorical pipelines using a ColumnTransformerC
Use a Random Forest Classifier to build the final modelC
Evaluate the accuracy of the model on the test datasetD

In [164]:
#import libraries
import seaborn as sns
import pandas as pd

In [165]:
#load dataset
df=sns.load_dataset('penguins')

In [166]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [167]:
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [168]:
##As there are 3 categories in dependent feature remove the nan values
df.dropna(inplace=True)

In [169]:
##Label encode the dependent feature
encoder=LabelEncoder()

In [170]:
df['sex']=encoder.fit_transform(df['sex'])

In [171]:
df.sex.unique()

array([1, 0])

In [172]:
##seperate categorical and numeric features
categorical_features=['species','island']
numerical_features=['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']

In [173]:
##seperate depenedent and independent data
X=df.drop(columns=['sex'])
y=df['sex']

In [174]:
X

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0


In [175]:
y

0      1
1      0
2      0
4      0
5      1
      ..
338    0
340    0
341    1
342    0
343    1
Name: sex, Length: 333, dtype: int64

In [122]:
from sklearn.model_selection import train_test_split

In [176]:
##split train and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42)

In [177]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [178]:
num_pipline=Pipeline(
    steps=[
        ('Imputer',SimpleImputer(strategy='mean')),##for handdling missing value with mean
        ('Scaler',StandardScaler())## to scale down the data
    ]
)

cat_pipline=Pipeline(
    steps=[
        ('Imputer',SimpleImputer(strategy='most_frequent')),##for handdling missing value with most frequent
        ('onehot',OneHotEncoder())##to convert categorrical data to numeric
    ]
)

In [179]:
##make preprocessor and column trasnform
preprocessor=ColumnTransformer([
    ('numpipeline',num_pipline,numerical_features),
    ('catpipeline',cat_pipline,categorical_features)
])

In [180]:
##fit and transform X_test and X_train
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [181]:
X_train

array([[-1.34704007, -0.10695772, -1.15587483, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.80331803, -1.39483957,  0.14266465, ...,  1.        ,
         0.        ,  0.        ],
       [-1.13954938, -0.31301882, -0.21804076, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.83774473,  0.30516447, -0.57874617, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.25629711, -1.08574793,  1.36906305, ...,  1.        ,
         0.        ,  0.        ],
       [-1.13954938, -0.05544245, -1.44443916, ...,  1.        ,
         0.        ,  0.        ]])

In [182]:
X_test

array([[-0.87547031, -0.20998827, -1.66086241, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.52037617, -1.29180902,  1.51334522, ...,  1.        ,
         0.        ,  0.        ],
       [-0.38503777,  1.02637831, -0.43446401, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.80331803, -1.29180902,  0.86407548, ...,  1.        ,
         0.        ,  0.        ],
       [-1.36590286,  0.76880194, -0.8673105 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.35033895,  0.56274084, -0.29018184, ...,  0.        ,
         1.        ,  0.        ]])

In [185]:
#import random forest and do prediction
from sklearn.ensemble import RandomForestClassifier

In [186]:
random=RandomForestClassifier()

In [187]:
random.fit(X_train,y_train)

In [188]:
y_pred=random.predict(X_test)

In [189]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [190]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[47  6]
 [ 4 53]]
0.9090909090909091
              precision    recall  f1-score   support

           0       0.92      0.89      0.90        53
           1       0.90      0.93      0.91        57

    accuracy                           0.91       110
   macro avg       0.91      0.91      0.91       110
weighted avg       0.91      0.91      0.91       110



Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then
use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its
accuracy.

In [193]:
##laod dataset
from sklearn.datasets import load_iris

In [194]:
dataset=load_iris()

In [195]:
df=pd.DataFrame(data=dataset.data,columns=dataset.feature_names)

In [196]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [197]:
df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [198]:
##seperate independent and dependent features
X=df
y=dataset.target

In [199]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [200]:
from sklearn.linear_model import LogisticRegressionCV


In [201]:
##make a dictionary of models for automation
models={
    "Logistic Classifier":LogisticRegressionCV(),
    "Random forest Classifier":RandomForestClassifier()
}

In [202]:
from sklearn.ensemble import VotingClassifier

In [203]:
##use the votinf classifier for voting models
vc=VotingClassifier(estimators=[
    ("Logistic Classifier",models["Logistic Classifier"]),
    ("Random forest Classifier",models["Random forest Classifier"]),
],voting='hard')

In [204]:
vc.fit(X_train,y_train)

In [205]:
y_pred=vc.predict(X_test)

In [207]:
##check the accuracy of voting classifier
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[50  3]
 [ 6 51]]
0.9181818181818182
              precision    recall  f1-score   support

           0       0.89      0.94      0.92        53
           1       0.94      0.89      0.92        57

    accuracy                           0.92       110
   macro avg       0.92      0.92      0.92       110
weighted avg       0.92      0.92      0.92       110

