In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the Titanic passenger table and preview ten randomly chosen rows.
df = pd.read_csv("dataset/titanic.csv")
df.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
252,1144,0,1,"Clark, Mr. Walter Miller",male,27.0,1,0,13508,136.7792,C89,C
157,1049,1,3,"Lundin, Miss. Olga Elida",female,23.0,0,0,347469,7.8542,,S
277,1169,0,2,"Faunthorpe, Mr. Harry",male,40.0,1,0,2926,26.0,,S
271,1163,0,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q
87,979,1,3,"Badman, Miss. Emily Louisa",female,18.0,0,0,A/4 31416,8.05,,S
394,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S
265,1157,0,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S
112,1004,1,1,"Evans, Miss. Edith Corse",female,36.0,0,0,PC 17531,31.6792,A29,C
210,1102,0,3,"Andersen, Mr. Albert Karvin",male,32.0,0,0,C 4001,22.525,,S
114,1006,1,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63.0,1,0,PC 17483,221.7792,C55 C57,S


In [3]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# Discard the columns that are not used as model inputs.
# errors='ignore' keeps this cell idempotent: it rebinds `df`, so a second run
# would otherwise raise KeyError because the columns are already gone.
df = df.drop(
    columns=['Fare', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

In [5]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),  # features: every column except the target
    df['Survived'],               # target
    test_size=0.3,
    random_state=0,
)

In [6]:
# Inspect the training features produced by the split
# (the rich display truncates the middle rows of a long frame).
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
96,1,female,76.0,1,0,S
381,3,male,26.0,0,0,Q
89,2,male,2.0,1,1,S
233,3,male,,0,0,Q
191,1,male,,0,0,S
...,...,...,...,...,...,...
323,1,male,33.0,0,0,S
192,3,male,11.5,1,1,S
117,3,female,1.0,1,1,S
47,3,male,,0,0,Q


In [7]:
trf1= ColumnTransformer([
    ('Impute_Age',SimpleImputer(),[2])
],remainder='passthrough')

In [8]:
trf3= ColumnTransformer([
    ('Scale_Age',MinMaxScaler(),[2])
])

In [9]:
trf2= ColumnTransformer([
    ('OneHotEncode_Sex',OneHotEncoder(sparse_output=True,handle_unknown='ignore'),[1]),
    ('OneHotEncoder_Embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[5])
],remainder='passthrough')

In [10]:
# Feature selection: keep the 8 features with the highest chi-squared score.
trf4 = SelectKBest(chi2, k=8)

In [11]:
# Final estimator. The seed is fixed so that repeated fits build the same tree,
# matching the random_state=0 already used for the train/test split.
trf5 = DecisionTreeClassifier(random_state=0)

# Pipeline: chain the preprocessing steps, feature selection and the classifier

In [12]:
from sklearn.pipeline import Pipeline,make_pipeline

In [13]:
# Assemble the steps in execution order:
# trf1 -> trf2 -> trf3 -> trf4 (feature selection) -> trf5 (classifier).
# The step names are the keys used later for lookup on the fitted pipeline.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('trf4', trf4),
        ('trf5', trf5),
    ]
)

In [14]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y=y_train)



0,1,2
,steps,"[('trf1', ...), ('trf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Impute_Age', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('OneHotEncode_Sex', ...), ('OneHotEncoder_Embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('Scale_Age', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...002A1FDD7E480>
,k,8

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
# Predict labels for the rows the pipeline was fitted on (in-sample predictions).
# NOTE(review): X_test / y_test from the split are not used in the visible cells;
# confirm whether a held-out evaluation was intended.
pred = pipe.predict(X_train)

In [20]:
# Show the predicted labels for the training rows.
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [17]:
# Class balance of the training labels, for comparison with the predictions.
y_train.value_counts()

Survived
0    195
1     97
Name: count, dtype: int64

In [None]:
# Fill value(s) learned by the fitted Age imputer inside step 'trf1'.
pipe['trf1'].named_transformers_['Impute_Age'].statistics_