## Without Pipelines

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer # for handling missing vlues 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.tree import DecisionTreeClassifier

In [25]:
df=pd.read_csv('Titanic-Dataset.csv')

In [26]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [27]:
df=df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [28]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,1,female,38.0,1,0,71.2833,C
1,1,3,female,26.0,0,0,7.925,S
2,1,1,female,35.0,1,0,53.1,S
3,0,3,male,35.0,0,0,8.05,S
4,0,3,male,,0,0,8.4583,Q


In [29]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),
                                                   df['Survived'],
                                                   test_size=0.2,
                                                   random_state=42)

In [30]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
708,3,male,,1,1,15.2458,C
239,3,female,,1,0,14.4542,C
381,3,male,32.0,0,0,7.925,S
792,1,male,,0,0,30.6958,C
672,2,male,31.0,0,0,13.0,S


In [31]:
print(x_train.shape)
print(x_test.shape)

(712, 7)
(178, 7)


In [32]:
print(y_train.shape)
print(y_test.shape)

(712,)
(178,)


In [33]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### Handling Missing values

In [34]:
imp_age=SimpleImputer()  # in default it fills the value with the mean 
imp_emb=SimpleImputer(strategy='most_frequent')  # hair i replace it with most frequent

*fitting and transforming*

In [35]:
x_train_age=imp_age.fit_transform(x_train[['Age']])
x_train_emb=imp_emb.fit_transform(x_train[['Embarked']])

In [36]:
x_test_age=imp_age.transform(x_train[['Age']])
x_test_emb=imp_emb.transform(x_train[['Embarked']])

### Handling Catagorical values

In [41]:
ohe_sex=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_emb=OneHotEncoder(sparse_output=False,handle_unknown='ignore')

In [44]:
x_train_sax=ohe_sex.fit_transform(x_train[['Sex']])
x_train_emb1=ohe_emb.fit_transform(x_train[['Embarked']])

In [45]:
x_test_sex=ohe_sex.transform(x_test[['Sex']])
x_test_emb1=ohe_emb.transform(x_test[['Embarked']])

## With Pipeline

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2

In [53]:
df=pd.read_csv('Titanic-Dataset.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [54]:
df=df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [55]:
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,1,female,38.0,1,0,71.2833,C
1,1,3,female,26.0,0,0,7.925,S
2,1,1,female,35.0,1,0,53.1,S


In [56]:
x_train, x_test, y_train, y_test=train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                 random_state=40)

In [57]:
print(y_train.shape)
print(y_test.shape)

(712,)
(178,)


In [58]:
print(x_train.shape)
print(x_test.shape)

(712, 7)
(178, 7)


In [59]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
457,2,female,50.0,0,0,10.5,S
661,1,male,47.0,0,0,25.5875,S
393,3,female,24.0,0,2,16.7,S
266,3,male,25.0,1,0,7.775,S
144,2,male,19.0,1,1,36.75,S


In [60]:
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
246,2,female,24.0,0,2,14.5,S
588,3,male,,0,0,8.05,S
472,2,female,23.0,0,0,13.7917,C
71,2,male,21.0,0,0,73.5,S
654,2,male,24.0,2,0,73.5,S


In [61]:
type(y_train)

pandas.core.series.Series

In [62]:
y_test.head()

246    1
588    0
472    1
71     0
654    0
Name: Survived, dtype: int64

In [63]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

*So we have missing values in Age and Embarked column*

**Steps in Pipeline**
1. Handling Missing values
2. OneHotEncoding
3. Scaling
4. Feature Selection (drop this to improve)
5. Train model

*1. Handling missing values*

In [64]:
# for SimpleEmpute
trf1=ColumnTransformer([
    ('imp_age',SimpleImputer(),[2]),
    ('imp_emb',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

*2. OneHotEncoding*

In [65]:
# for onehotencodiong
trf2=ColumnTransformer([
    ('ohe_sex_emb',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

*After applying the transformation we got new columns 2 from Age and 3 from Embrked*

*3. Scaling*

In [66]:
trf3=ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

*4. Model Training*

In [67]:
trf4=DecisionTreeClassifier()

## Creating the Pipeline

In [68]:
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4)
])

In [69]:
# we can also use make_pipeline
pipe1=make_pipeline(trf1,trf2,trf3,trf4)

In [70]:
# Display the pipeline
from sklearn import set_config
set_config(display='diagram')

In [71]:
# if your process include model traning as well we use fit method 
pipe.fit(x_train,y_train)

In [72]:
# All the info about the Pipeline
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('imp_age', SimpleImputer(), [2]),
                                 ('imp_emb',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_emb',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': DecisionTreeClassifier()}

In [73]:
# Pridict the model 
y_pred=pipe.predict(x_test)

In [74]:
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0], dtype=int64)

In [75]:
from sklearn.metrics import accuracy_score

In [76]:
accuracy_score(y_test,y_pred)

0.6404494382022472

*For production*

In [39]:
import pickle
pickle.dump(pipe,open('pipe.pkl1','wb'))