<h2 style="font-size:28px;font-family:Consolas;">A pipeline in machine learning is like a step-by-step process that connects all tasks (cleaning data → changing data → training model → testing model) into one chain.</h2>


<h2 style="font-size:18px; font-family:Consolas;">➡️Think of it like cooking a recipe:<br><br>


1.Wash vegetables (data cleaning)

2.Cut them (feature transformation)

3.Cook them (model training)

4.Taste and serve (prediction)

Instead of doing these steps separately each time, a pipeline remembers the order and does everything automatically.</h2>


In [1]:
import pandas as pd 
import numpy as np

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import RandomForestClassifier

In [3]:
df=pd.read_csv("Titanic-Dataset.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df.sample()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
825,0,3,male,,0,0,6.95,Q


In [5]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [6]:
X_train.shape,X_test.shape

((712, 7), (179, 7))

In [7]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [8]:
X_test.isnull().sum()

Pclass       0
Sex          0
Age         37
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

In [12]:
X_train.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

## First Missing Value Handling

In [22]:
impute=ColumnTransformer([
    ('age',SimpleImputer(),[2]),
    ('embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

## Secondly OHE

In [23]:
ohe=ColumnTransformer([
    ('gender',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

## Thirdly Scaling

In [24]:
scale=ColumnTransformer([
    ('allcolumns',MinMaxScaler(),slice(0,10))
])

## Model Building

In [25]:
model=RandomForestClassifier()

## Pipeline Building

In [31]:
pipe=Pipeline([
    ('imputed',impute),
    ('ohencoded',ohe), 
    ('scaled',scale),
    ('RF',model)
     
])

<h2 style="font-size:20px;font-family:Consolas;">
  In scikit-learn, both <code>Pipeline</code> and <code>make_pipeline</code> are used to create a pipeline, but they differ slightly<br><br>

✅In <code>Pipeline</code> <br>

⏺️You manually name each step.<br>

⏺️Useful if you want custom names<br><br>

✅ <code>make_pipeline</code><br>

⏺️You don’t need to give names.

⏺️Names are automatically created from the class names in lowercase.

</h2>


In [32]:
from sklearn import set_config
set_config(display='diagram')

In [33]:
pipe.fit(X_train,y_train)

0,1,2
,steps,"[('imputed', ...), ('ohencoded', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('age', ...), ('embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('gender', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('allcolumns', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [34]:
pipe.named_steps

{'imputed': ColumnTransformer(remainder='passthrough',
                   transformers=[('age', SimpleImputer(), [2]),
                                 ('embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'ohencoded': ColumnTransformer(remainder='passthrough',
                   transformers=[('gender',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'scaled': ColumnTransformer(transformers=[('allcolumns', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'RF': RandomForestClassifier()}

In [35]:
y_pred=pipe.predict(X_test)

In [36]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [38]:
from sklearn.metrics import accuracy_score

accuracy=accuracy_score(y_test,y_pred)

accuracy

0.6256983240223464

In [41]:
from sklearn.model_selection import GridSearchCV

params = {
    'RF__n_estimators': [100, 200, 300],
    'RF__max_depth': [None, 10, 20],
    'RF__min_samples_split': [2, 5, 10],
    'RF__min_samples_leaf': [1, 2, 4],
    'RF__max_features': ['sqrt', 'log2'],
    'RF__class_weight': [None, 'balanced']
}


grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)



0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'RF__class_weight': [None, 'balanced'], 'RF__max_depth': [None, 10, ...], 'RF__max_features': ['sqrt', 'log2'], 'RF__min_samples_leaf': [1, 2, ...], ...}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('age', ...), ('embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('gender', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('allcolumns', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [42]:
grid.best_score_

0.6391214419383433

In [43]:
grid.best_params_

{'RF__class_weight': None,
 'RF__max_depth': None,
 'RF__max_features': 'sqrt',
 'RF__min_samples_leaf': 1,
 'RF__min_samples_split': 2,
 'RF__n_estimators': 100}