# Piplines chains together multiple steps so that the output of each step is used as input to the next step.
# Piplines make it easy to apply the same preprocessing to train and test

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_csv("titanic.csv")

In [5]:
df.shape

(891, 12)

In [6]:
df.sample(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
743,744,0,3,"McNamee, Mr. Neal",male,24.0,1,0,376566,16.1,,S
82,83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q
361,362,0,2,"del Carlo, Mr. Sebastiano",male,29.0,1,0,SC/PARIS 2167,27.7208,,C
432,433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide ...",female,42.0,1,0,SC/AH 3085,26.0,,S
106,107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.65,,S
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.225,,C
315,316,1,3,"Nilsson, Miss. Helmina Josefina",female,26.0,0,0,347470,7.8542,,S
778,779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q


In [7]:
df.drop(columns=["PassengerId","Name","Ticket","Cabin"],inplace=True)

In [8]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'], test_size=0.2, random_state=42)

In [9]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [10]:
df.isnull().sum()  # there is null value in age nad embarked 

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [11]:
si_age = SimpleImputer()      # age null values is replaced by mean 
si_embarked = SimpleImputer(strategy="most_frequent")   # embarked null value is replace by most_frequent value


X_train_age = si_age.fit_transform(X_train[["Age"]])
X_train_embarked = si_embarked.fit_transform(X_train[["Embarked"]])


X_test_age = si_age.fit_transform(X_test[["Age"]])
X_test_embarked = si_embarked.fit_transform(X_test[["Embarked"]])

X_train_age


array([[45.5       ],
       [23.        ],
       [32.        ],
       [26.        ],
       [ 6.        ],
       [24.        ],
       [45.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [42.        ],
       [36.        ],
       [33.        ],
       [17.        ],
       [29.        ],
       [50.        ],
       [35.        ],
       [38.        ],
       [34.        ],
       [17.        ],
       [11.        ],
       [61.        ],
       [30.        ],
       [ 7.        ],
       [63.        ],
       [20.        ],
       [29.49884615],
       [29.        ],
       [36.        ],
       [29.49884615],
       [50.        ],
       [27.        ],
       [30.        ],
       [33.        ],
       [29.49884615],
       [29.49884615],
       [ 2.        ],
       [25.        ],
       [51.        ],
       [25.        ],
       [29.49884615],
       [29.49884615],
       [24.        ],
       [18.        ],
       [29.49884615],
       [25

In [12]:
# OneHotEncoder 

ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown="ignore")
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown="ignore")

X_train_sex = ohe_sex.fit_transform(X_train[["Sex"]])
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)

X_test_sex = ohe_sex.fit_transform(X_test[["Sex"]])
X_test_embarked = ohe_embarked.fit_transform(X_test_embarked)

X_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [13]:
X_train_remain = X_train.drop(columns=["Age","Sex","Embarked"])
X_test_remain = X_test.drop(columns=["Age","Sex","Embarked"])


In [14]:
X_train_transform = np.concatenate((X_train_remain,X_train_age,X_train_sex,X_train_embarked),axis=1)
X_test_transform = np.concatenate((X_test_remain,X_test_age,X_test_sex,X_test_embarked),axis=1)

In [15]:
X_train_transform.shape

(712, 10)

In [16]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transform,y_train)

In [17]:
y_pred = clf.predict(X_test_transform)
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.7541899441340782

In [19]:
import pickle

In [20]:

pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))

FileNotFoundError: [Errno 2] No such file or directory: 'models/ohe_sex.pkl'