Pipelines

In [1]:
import pandas as ps
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler

In [2]:
# Fetch seaborn's example 'titanic' dataset; every later cell reads from `data`.
data = sns.load_dataset('titanic')

# Preview the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Experiment 1: MaxAbsScaler + 42-tree random forest, evaluated on a 20% hold-out.

# Select the model inputs and the prediction target.
x = data[['sex', 'pclass', 'age', 'fare', 'embarked']]
y = data['survived']

# Train/test split; the fixed seed keeps the partition identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Columns routed to the numeric and the categorical preprocessing branches.
numeric_feature = ['age', 'fare', 'pclass']
categoric_feature = ['embarked', 'sex']

# Numeric branch: impute missing values with the median, then scale each
# column by its maximum absolute value.
scalar = MaxAbsScaler()
numeric_tranform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', scalar),
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' prevents an error if the test split contains a
# category that was not present in the training split.
categoric_tranform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_tranform, numeric_feature),
        ('cat', categoric_tranform, categoric_feature),
    ])

# Full pipeline: preprocessing followed by the classifier.
# random_state seeds the forest's bootstrap/feature sampling so the reported
# accuracy is reproducible after a kernel restart (it was unseeded before).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=42, random_state=42)),
    ])

# Fit on the training split only, then predict the held-out rows.
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

# Report hold-out accuracy.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score: ', accuracy)

Accuracy score:  0.8100558659217877


In [4]:
# Re-display the first rows of `data` before the next experiment.
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Experiment 2: StandardScaler + 10-tree random forest, evaluated on a 30% hold-out.

# Select the model inputs and the prediction target.
x = data[['sex', 'pclass', 'age', 'embarked', 'fare']]
y = data['survived']

# Train/test split; the fixed seed keeps the partition identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Columns routed to the numeric and the categorical preprocessing branches.
numeric_feature = ['pclass', 'age', 'fare']
categoric_feature = ['sex', 'embarked']

# Numeric branch: impute missing values with the median, then standardize.
# The step is named 'scalar' (previously misspelled 'sclar') so parameter
# paths such as 'preprocessor__num__scalar' match the first experiment.
scalar = StandardScaler()
numeric_tranform = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scalar', scalar),
    ])

# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' prevents an error if the test split contains a
# category that was not present in the training split.
categoric_tranform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_tranform, numeric_feature),
        ('cat', categoric_tranform, categoric_feature),
    ])

# Full pipeline: preprocessing followed by the classifier.
# random_state seeds the forest's bootstrap/feature sampling so the reported
# accuracy is reproducible after a kernel restart (it was unseeded before).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=10, random_state=42)),
    ])

# Fit on the training split only, then predict the held-out rows.
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

# Report hold-out accuracy.
accuracy = accuracy_score(y_test, y_pred)
print('accuracy_score: ', accuracy)



accuracy_score:  0.7686567164179104
