<a href="https://colab.research.google.com/github/2003Yash/sklearn-pipelines/blob/main/Sklearn_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING LIBRARIES

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

MAKING PIPELINE

In [3]:
# Pipeline steps are kept in a list; each entry is a (name, estimator) tuple.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [4]:
# Chain the named steps into a single estimator.
pipe = Pipeline(steps=steps)

In [5]:
# Ask scikit-learn to render estimators as diagrams when they are displayed
# in the notebook.
from sklearn import set_config

set_config(display="diagram")

In [6]:
# Bare last expression: the notebook displays the pipeline (diagram mode was enabled above).
pipe

CREATING A CUSTOM DATASET FOR CLASSIFICATION

In [7]:
# Build a synthetic classification dataset with 1000 samples.
# random_state is pinned so that a fresh kernel + "Run All" regenerates exactly
# the same X and y; without it every run yields different data, and therefore
# a different split, fitted model and metrics.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, random_state=42)

In [8]:
# Dimensions of the feature matrix: (number of samples, number of features).
X.shape

(1000, 20)

In [9]:
# The target vector holds one label per sample.
y.shape

(1000,)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# the same on every run for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

TRAINING THE MODEL USING PIPELINE

In [11]:
# Fit the whole pipeline on the training split: the scaler step runs first,
# then the classifier step is trained on the scaled features.
pipe.fit(X_train,y_train)

PREDICTING USING PIPELINE

In [12]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred=pipe.predict(X_test)

In [13]:
# Display the predicted labels (one entry per test sample).
y_pred

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute every metric first, comparing the true test labels with the predictions.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Then print them: overall accuracy, the per-class report, and the confusion matrix.
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(matrix)


Accuracy: 0.9066666666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       153
           1       0.93      0.87      0.90       147

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300


Confusion Matrix:
[[144   9]
 [ 19 128]]


SECOND EXAMPLE: PIPELINE WITH SCALING, PCA AND SVC

In [15]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [16]:
# Three-step pipeline: scale the features, reduce them to 3 principal
# components, then classify with a support vector classifier.
steps2 = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]
pipe2 = Pipeline(steps=steps2)

In [17]:
# Display the second pipeline.
pipe2

In [18]:
# A single step can be accessed by the name it was given in the steps list,
# without running the data through the whole pipeline.
# Here only the "scaling" step is fitted on X_train and its scaled output is returned.
pipe2['scaling'].fit_transform(X_train)

array([[-0.09400001,  0.18845024,  0.08714207, ..., -0.88681304,
         0.11256714,  0.97737764],
       [-1.57670628, -1.3743323 , -1.14011463, ..., -0.71181171,
        -0.63320122,  1.87304204],
       [ 0.27326038, -1.69811759,  0.92727381, ..., -0.33010203,
        -0.00993114, -0.36074609],
       ...,
       [-0.32186098, -0.20915274,  0.31488704, ...,  0.4325281 ,
         0.13299569, -0.7978075 ],
       [ 0.21272819,  0.89391855, -1.04211228, ..., -0.54180283,
        -0.36899046,  1.05064318],
       [-0.89056574, -1.01297841,  0.17161884, ...,  0.52469633,
        -0.65242543, -1.07164125]])

In [19]:
# Fitting the pipeline itself runs all of its steps, in order, on the training data.
pipe2.fit(X_train,y_train)

In [20]:
# Predict labels for the test split with the fitted second pipeline.
pipe2.predict(X_test)

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

PIPELINE WITH COLUMN TRANSFORMER

In [21]:
import numpy as np
from sklearn.impute import SimpleImputer

# Numeric preprocessing pipeline: fill NaN values with the mean, then standardise.
numeric_pipe = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [22]:
# Display the numeric preprocessing pipeline.
numeric_pipe

In [23]:
from sklearn.preprocessing import OneHotEncoder

# Categorical preprocessing pipeline:
#   1. replace missing values with the constant string "missing"
#   2. one-hot encode, ignoring categories that were not seen during fitting
# NOTE(review): the first step name is misspelled ("consatnt"); it is kept as-is
# because renaming a step changes the name used to address it elsewhere.
categorical_pipe = Pipeline(
    steps=[
        ("imputation_consatnt", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [24]:
# Display the categorical preprocessing pipeline.
categorical_pipe

In [26]:
## Combine the numeric and categorical pipelines

from sklearn.compose import ColumnTransformer

# Each entry is (name, transformer, columns): the transformer is applied only
# to the listed columns.
#   - "gender" and "City" go through the categorical pipeline
#   - "age" and "height" go through the numeric pipeline
# Columns are selected by name, so the data passed in is expected to be a
# DataFrame that contains all four of them.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipe, ["gender", "City"]),
        ("numerical", numeric_pipe, ["age", "height"]),
    ]
)

In [27]:
# Display the combined column transformer.
preprocessor

In [29]:
from sklearn.pipeline import make_pipeline

# make_pipeline builds a Pipeline without explicit step names:
# preprocessing first, then a logistic-regression classifier.
final_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(),
)

In [30]:
# Display the full pipeline (preprocessing + classifier); it is not fitted in this notebook.
final_pipe