# ***PIPELINE***

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
data = pd.read_csv('../files/train.csv')
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# keeping only required columns
data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

In [5]:
# checking null values
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["Survived"]),
    data["Survived"],
    test_size=0.2,
    random_state=10,
)

**Steps**<br>
    *1. First we will handle the missing data using SimpleImputer*<br>
    *2. The output from above will be the input and we will perform a OHE*<br>
    *3. The output from above will be the input and we will perform scaling*<br>
    *4. The output from above will be the input and we will perform Feature Selection*<br>
    *5. The output from above will be the input and we will train the model*

In [7]:
# Step1 --> SimpleImputer

# we need to impute two columns: Age, Embarked
# we will use ColumnTransformer

transformer_SI = ColumnTransformer(
    transformers=[
        ("impute_age", SimpleImputer(strategy="mean"), [2]),
        ("impute_embarked", SimpleImputer(strategy="most_frequent"), [6]),
    ],
    remainder="passthrough",
)

In [8]:
# Step2 --> OHE

# we need to encode two columns: Sex, Embarked

transformer_OHE = ColumnTransformer(
    transformers=[
        (
            "ohe_sex_embarked",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            [1, 6],
        ),
    ],
    remainder="passthrough",
)

In [9]:
# Step3 --> Scaling
# We are using MinMaxScaler because, we are doing feature selection

transformer_Scaler = ColumnTransformer(
    [
        (
            "scaler",
            MinMaxScaler(),
            slice(0, 10),
        )
    ],
)

In [10]:
# Step4 --> Feature Selection

transformer_Feat = SelectKBest(score_func=chi2, k=5)

In [11]:
# Step5 --> Model training

model = DecisionTreeClassifier()

**Pipeline**

In [12]:
pipe = Pipeline(
    [
        ("transformer_SI", transformer_SI),
        ("transformer_OHE", transformer_OHE),
        ("transformer_Scaler", transformer_Scaler),
        ("transformer_Feat", transformer_Feat),
        ("model", model),
    ],
)

**Pipleline vs make_pipeline**<br>
*Pipepline requires naming of steps, make_pipeline does not*<br>
*Same for ColumnTransformer and make_column_transformer*

In [13]:
# pipe = make_pipeline(
#     transformer_SI,
#     transformer_OHE,
#     transformer_Scaler,
#     transformer_Feat,
#     model,
# )

In [14]:
pipe.fit(X_train, y_train)

In [15]:
# when training the model using pipeline call .fit()
# when not training the model using pipeline call .fit_transform()

In [18]:
pipe.named_steps

{'transformer_SI': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'transformer_OHE': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'transformer_Scaler': ColumnTransformer(transformers=[('scaler', MinMaxScaler(), slice(0, 10, None))]),
 'transformer_Feat': SelectKBest(k=5, score_func=<function chi2 at 0x00000286F0DD1940>),
 'model': DecisionTreeClassifier()}

In [21]:
pipe.named_steps['transformer_OHE']

In [22]:
pipe.named_steps['transformer_OHE'].transformers_

[('ohe_sex_embarked',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  [1, 6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 2, 3, 4, 5])]

In [23]:
pipe.named_steps['transformer_OHE'].transformers_[0]

('ohe_sex_embarked',
 OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 [1, 6])

In [24]:
pipe.named_steps['transformer_OHE'].transformers_[0][1]

In [25]:
y_pred = pipe.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.6759776536312849

**Cross VAlidation using PIPELINE**

In [28]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv = 5, scoring='accuracy').mean()

np.float64(0.6251058800354574)

**We will learn GridSearch Using pipeline later**

In [29]:
# exporting
import pickle

In [31]:
pickle.dump(pipe, open("../Concept_18/pipe.pkl", "wb"))