In [53]:
!pip install ydata_profiling



In [54]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from ydata_profiling import ProfileReport


In [55]:
# Load the Titanic passenger table from the CSV file into a DataFrame.
# NOTE(review): the path points into a mounted Drive folder - presumably Colab;
# the mount step is not part of this notebook.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Titanic/Titanic-Dataset.csv',
)

In [56]:
# Peek at five randomly chosen rows of the raw data (no seed, so rows vary per run).
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
626,627,0,2,"Kirkland, Rev. Charles Leonard",male,57.0,0,0,219533,12.35,,Q
664,665,1,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.925,,S
828,829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q
435,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S


**Pandas Profiling of dataset**


In [57]:
# Build the automated profiling report for the raw frame and write it
# to an HTML file next to the notebook.
prof = ProfileReport(df)
prof.to_file('titanic_analysed.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
# Render the profiling report inline in the notebook output.
prof.to_notebook_iframe()

Output hidden; open in https://colab.research.google.com to view.

**Dropping columns which are not required**

In [59]:
# Remove identifier / free-text columns that are not used as model features.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [60]:
# Spot-check five random rows to confirm only the feature columns and the target remain.
df.sample(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
561,0,3,male,40.0,0,0,7.8958,S
751,1,3,male,6.0,0,1,12.475,S
305,1,1,male,0.92,1,2,151.55,S
209,1,1,male,40.0,0,0,31.0,C
526,1,2,female,50.0,0,0,10.5,S


**Train-Test Split**

In [61]:
# Separate the target ('Survived') from the features and hold out 20% of the rows
# for testing. The fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.2,
    random_state=42,
)

In [62]:
# Display the training feature frame (column order here defines the positional
# indices used by the column transformers below).
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [63]:
# Display the training target series.
y_train

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0
...,...
106,1
270,0
860,0
435,1


**Imputing Missing values using SimpleImputer and ColumnTransformer**

In [64]:
# Count missing values per column. This inspects the full frame `df`,
# not only the training split.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [65]:
# Step 1 - impute missing values. Columns are addressed by position in X_train:
# [Pclass, Sex, Age, SibSp, Parch, Fare, Embarked].
trf1 = ColumnTransformer(
    transformers=[
        # position 2 -> Age: filled with the column mean (SimpleImputer's default strategy)
        ('impute_age', SimpleImputer(strategy='mean'), [2]),
        # position 6 -> Embarked: filled with the most frequent category
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
    ],
    # all other columns flow through unchanged, appended after the imputed ones
    remainder='passthrough',
)

In [66]:
# Display the imputation transformer.
trf1

**One-hot encoding the nominal features Embarked and Sex**

In [67]:
# Step 2 - one-hot encode the nominal columns. Indices refer to the OUTPUT of trf1,
# which puts the imputed columns first: [Age, Embarked, Pclass, Sex, SibSp, Parch, Fare].
# So position 1 is Embarked and position 3 is Sex (the step is named 'ohe_sex'
# but encodes both columns).
trf2 = ColumnTransformer(
    transformers=[
        (
            'ohe_sex',
            # handle_unknown='ignore': categories unseen during fit encode as all zeros
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            [1, 3],
        ),
    ],
    remainder='passthrough',
)

In [68]:
# Display the one-hot encoding transformer.
trf2

**Scaling of values**

In [69]:
# Step 3 - rescale every column to the [0, 1] range seen during fit.
# slice(0, 10) selects the first ten columns of trf2's output.
# NOTE(review): presumably that is all of them (3 Embarked + 2 Sex one-hot columns
# plus Age, Pclass, SibSp, Parch, Fare) - confirm if the feature set changes.
trf3 = ColumnTransformer(
    transformers=[
        ('scale', MinMaxScaler(), slice(0, 10)),
    ],
    remainder='passthrough',
)

**Feature Selection**

In [70]:
# Step 4 - keep the 8 features with the highest chi-squared score against the target.
# chi2 needs non-negative inputs, which the preceding min-max scaling provides.
trf4 = SelectKBest(chi2, k=8)

**Decision Tree**

In [71]:
# Step 5 - the final estimator (a classifier, despite the `trf` prefix).
# A fixed random_state makes tree construction deterministic, so the accuracy and
# cross-validation figures below are reproducible across runs; 42 matches the seed
# used for the train/test split.
trf5 = DecisionTreeClassifier(random_state=42)

**Creating Pipeline**

In [72]:
# Chain preprocessing and model into one estimator so that the same steps are
# applied, in this order, at both fit and predict time.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),  # impute missing values
        ('trf2', trf2),  # one-hot encode nominal columns
        ('trf3', trf3),  # min-max scale
        ('trf4', trf4),  # chi-squared feature selection
        ('trf5', trf5),  # decision tree classifier
    ]
)

**Training the pipeline**

In [73]:
# Fit every preprocessing step and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)

**Predicting**

In [74]:
# Run the held-out rows through the fitted pipeline to get predicted labels.
y_pred = pipe.predict(X=X_test)

In [75]:
# Display the predicted labels for the test split.
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1])

In [76]:
# Fraction of held-out rows whose label the pipeline predicted correctly.
# (`accuracy_score` is imported with the other dependencies in the top import cell.)
accuracy_score(y_test, y_pred)

0.7932960893854749

**Cross-validating the pipeline**

In [77]:
# Mean accuracy over 5 cross-validation folds of the training split. Because the
# whole pipeline is passed in, it is refitted for each fold, so imputation, encoding,
# scaling and feature selection are learned only from that fold's training part.
# (`cross_val_score` is imported with the other dependencies in the top import cell.)
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.7922584457795725

**Exporting Pipeline**

In [78]:
import pickle

In [79]:
# Serialize the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaking the
# handle returned by a bare open().
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)