Q1. You are work#ng on a machine learning project where you have a dataset containing numerical and
categorical features. You have identified that some of the features are highly correlated and there are
missing values in some of the columns. You want to build a pipel#ne that automates the feature
engineering process and handles the missing values

#### Des#gn a pipeline that includes the following steps:

- Use an automated feature select#on method to identify the important features in the dataset
- Create a numerical pipeline that includes the following steps"
- Impute the missing values in the numerical columns us#ng the mean of the column values
- Scale the numerical columns us#ng standardisationC
- Create a categorical pipeline that includes the following steps"
- Impute the missing values in the categorical columns using the most frequent value of the column
- One-hot encode the categorical columns
- Combine the numerical and categorical pipelines using a ColumnTransformer
- Use a Random Forest Classifier to build the final model
- Evaluate the accuracy of the model on the test dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
titanic=sns.load_dataset('titanic')

In [3]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
titanic.isnull().sum()/len(titanic)*100

survived        0.000000
pclass          0.000000
sex             0.000000
age            19.865320
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.224467
class           0.000000
who             0.000000
adult_male      0.000000
deck           77.216611
embark_town     0.224467
alive           0.000000
alone           0.000000
dtype: float64

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

In [7]:
titanic.nunique()

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64

In [8]:
df=titanic.copy()

In [9]:
X=df.drop('survived',axis=1)
y=df['survived']

In [10]:
for col in X.columns:
    if X[col].dtypes==bool:
        X[col]=X[col].map({True:1,False:0})

In [11]:
X.dtypes

pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male        int64
deck           category
embark_town      object
alive            object
alone             int64
dtype: object

In [12]:
num_col=[]
cat_col=[]
for col in X.columns:
    if X[col].dtypes in (int,float):
        num_col.append(col)
    else:
        cat_col.append(col)

In [13]:
#num_col=X.select_dtypes(include=[int,float]).columns

In [14]:
num_col

['pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male', 'alone']

In [15]:
#cat_col=X.select_dtypes(exclude=[int,float]).columns

In [16]:
cat_col

['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive']

In [17]:
num_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='median')),
           ('scaler',StandardScaler())])
cat_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='most_frequent')),
           ('One hot Encoding',OneHotEncoder())])

In [18]:
preprocessing=ColumnTransformer([('numerical Pipeline',num_pipeline,num_col),('Categorical Pipeline',cat_pipeline,cat_col)])

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=34)

In [21]:
x_train=preprocessing.fit_transform(X_train)
x_test=preprocessing.transform(X_test)

## Model Dictionary

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
models={'Random Forest':RandomForestClassifier(),'Decision Tree':DecisionTreeClassifier(),'Naive Bayes':BernoulliNB()}

In [25]:
def evaluate_models(X_train,X_test,y_train,y_test,models):
    report={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        accuracy=accuracy_score(y_test,y_pred)
        report[list(models.keys())[i]]=accuracy
    return report

In [26]:
evaluate_models(x_train,x_test,y_train,y_test,models)

{'Random Forest': 1.0, 'Decision Tree': 1.0, 'Naive Bayes': 1.0}

In [27]:
x_train

array([[-0.36785928,  0.11590575, -0.47874848, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.83358937, -0.03564399, -0.47874848, ...,  0.        ,
         0.        ,  1.        ],
       [-0.36785928, -0.18719373,  0.45770459, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.83358937, -0.11141886,  0.45770459, ...,  1.        ,
         1.        ,  0.        ],
       [-0.36785928,  0.22956806,  0.45770459, ...,  0.        ,
         1.        ,  0.        ],
       [-0.36785928, -0.86916756, -0.47874848, ...,  1.        ,
         0.        ,  1.        ]])

# Train on iris dataset
Q2. Bu#ld a p#pel#ne that #ncludes a random forest class#f#er and a log#st#c regress#on class#f#er, and then
use a vot#ng class#f#er to comb#ne the#r pred#ct#ons. Tra#n the p#pel#ne on the #r#s dataset and evaluate #ts
accuracy.

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [29]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
iris=sns.load_dataset('iris')

In [32]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [33]:
iris['species']=iris['species'].map({'setosa':0,'versicolor':1,'virginica':2})

In [34]:
X=iris.drop('species',axis=1)
y=iris['species']

In [35]:
for col in X.columns:
    if X[col].dtypes==bool:
        X[col]=X[col].map({True:1,False:0})

In [36]:
num_col=[]
cat_col=[]
for col in X.columns:
    if X[col].dtypes in (int,float):
        num_col.append(col)
    else:
        cat_col.append(col)

In [37]:
num_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='median')),
           ('scaler',StandardScaler())])
cat_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='most_frequent')),
           ('One hot Encoding',OneHotEncoder())])

In [38]:
preprocessing=ColumnTransformer([('numerical Pipeline',num_pipeline,num_col),('Categorical Pipeline',cat_pipeline,cat_col)])

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=34)

In [40]:
num_col

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [41]:
cat_col

[]

In [42]:
x_train=preprocessing.fit_transform(X_train)
x_test=preprocessing.transform(X_test)

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
models={'Random Forest':RandomForestClassifier(),"Logistic Regression":LogisticRegression(),'Decision Tree':DecisionTreeClassifier(),'Naive Bayes':BernoulliNB()}

In [46]:
def evaluate_models(X_train,X_test,y_train,y_test,models):
    report={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        accuracy=accuracy_score(y_test,y_pred)
        report[list(models.keys())[i]]=accuracy
    return report

In [47]:
evaluate_models(x_train,x_test,y_train,y_test,models)

{'Random Forest': 0.9333333333333333,
 'Logistic Regression': 1.0,
 'Decision Tree': 0.9333333333333333,
 'Naive Bayes': 0.7}