In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression

In [2]:
# Location of the training CSV. NOTE: this is a machine-specific absolute
# path; change this single constant when running the notebook elsewhere.
TRAIN_CSV_PATH = r"E:\Jupyter Notebooks\train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Drop the identifier / free-text columns that are not used as model features.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [5]:
# Target: the 'Survived' label. Features: every other column of df.
y = df.loc[:, 'Survived']
X = df.drop(columns='Survived')

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
xtrain.shape, ytrain.shape, xtest.shape, ytest.shape # sanity-check the split; the target (ytrain / ytest) must be 1-D for sklearn fitting, the features stay 2-D

((712, 7), (712,), (179, 7), (179,))

### flow below is like this 

#### Impute the numerical features and then standardize them; impute the categorical features and then encode them; then fit the model. GridSearchCV is used to pick the best technique for each of these steps.

### Two points to understand about the use of ColumnTransformer and Pipeline

1)  use pipeline - when same transformations are to be applied to the columns and in a sequence

2) Use a ColumnTransformer when different transformations are to be applied to different columns; the order in which those transformers are listed doesn't matter.

what chatgpt has to say for above lines 

Use Pipeline: When the same transformations need to be applied to all columns in a sequence.

Use ColumnTransformer: When different transformations need to be applied to different columns, regardless of the sequence of operations.

### Conclusion

1. **Using `Pipeline` within `ColumnTransformer`**: Ensures transformations are applied in the specified sequence.
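2. **Using `ColumnTransformer` alone**: Does not chain anything. Each listed transformer is applied independently to its own subset of the raw columns and the results are concatenated side by side, so it cannot express "impute, then scale" for the same column. Listing an imputer and a scaler separately for one column would yield two separate outputs, each computed from the raw data (the scaler would never see the imputed values).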

So, always use a `Pipeline` when you need to enforce a specific sequence of transformations for a set of features.

In [16]:
# Count missing values per column to see which columns need imputation.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [15]:
# List the columns currently present in df.
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [13]:
# Names of the numeric-dtype columns in df (note: df still contains the target column).
list(df.select_dtypes('number').columns.values)

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [14]:
# Names of the object-dtype columns in df; these are treated as categorical below.
list(df.select_dtypes('object').columns.values)

['Sex', 'Embarked']

In [17]:
# Numeric preprocessing pipeline: fill missing values, then standardize.
# The target column is removed before collecting the numeric feature names so
# that 'Survived' can never be picked up as a feature through this list.
numeric_features = (
    df.drop(columns=['Survived'], errors='ignore')
    .select_dtypes('number')
    .columns.tolist()
)
# Only these columns are routed through the impute + scale pipeline.
numeric_scaling_req = ['Age', 'Fare']
numeric_pipe = Pipeline(steps=[
    # 'mean' is SimpleImputer's default, spelled out here because the grid
    # search later varies this strategy.
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

In [18]:
# Categorical preprocessing pipeline: fill gaps with the most frequent
# category, then one-hot encode.
cat_features = df.select_dtypes('object').columns.tolist()
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories not seen during fit do not raise at transform time.
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

In [19]:
# Route each group of columns through its own pipeline. The order in which the
# two pipelines are listed does not matter (each one only works on its own
# columns), whereas the order of the steps inside each pipeline is preserved.
ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_scaling_req),
        ('cat', cat_pipe, cat_features),
    ],
    remainder='passthrough',  # columns not listed above are kept unchanged
)

In [20]:
# Chain preprocessing and the classifier. A Pipeline is used here because the
# preprocessing steps must run before the model is fitted.
final_pipe = Pipeline(steps=[
    ('preprocessing', ct),
    ('modeling', LogisticRegression()),
])


In [21]:
# Display the assembled (not yet fitted) pipeline.
final_pipe

In [24]:
# Switch sklearn's estimator display to the diagram view, then show the pipeline again.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn import set_config
set_config(display = 'diagram')
final_pipe

#### The steps above only define the pipelines, i.e. what the workflow will be. Below we initialize GridSearchCV to fit the data through the pipeline and find the optimal parameters.

In [40]:
# Search space. Each key is the path to a parameter inside the nested pipeline,
# written as <step>__<sub-step>__<parameter>.
param_dict = {
    # imputation strategy used by the numeric pipeline
    'preprocessing__numeric__impute__strategy': ['mean', 'median'],
    # imputation strategy used by the categorical pipeline
    'preprocessing__cat__impute__strategy': ['most_frequent', 'constant'],
    # C parameter of the LogisticRegression step
    'modeling__C': [0.1, 1.0, 10, 100],
}
# Exhaustive search over every combination above, scored with 15-fold cross-validation.
gridSearch = GridSearchCV(
    estimator=final_pipe,
    param_grid=param_dict,
    cv=15,
)

In [41]:
# Display the configured (not yet fitted) grid search.
gridSearch

In [42]:
# Run the grid search on the training split only, then show the winning parameter combination.
gridSearch.fit(xtrain, ytrain)
gridSearch.best_params_

{'modeling__C': 0.1,
 'preprocessing__cat__impute__strategy': 'most_frequent',
 'preprocessing__numeric__impute__strategy': 'mean'}

In [43]:
# Mean cross-validated score of the best parameter combination.
# NOTE(review): this is a CV score on the training split; xtest / ytest are not evaluated in the visible cells.
gridSearch.best_score_

0.7978723404255318

In [44]:
# Raw per-candidate results of the search (a dict of arrays).
# NOTE(review): this prints a very large output; the DataFrame view built in the next cell is easier to read.
gridSearch.cv_results_

{'mean_fit_time': array([0.0222136 , 0.02218401, 0.02172001, 0.02131874, 0.02705003,
        0.02658203, 0.02418675, 0.0248521 , 0.02158821, 0.02218827,
        0.02405217, 0.02385365, 0.02138851, 0.02138704, 0.02318673,
        0.02365244]),
 'std_fit_time': array([0.00320607, 0.00132597, 0.00173047, 0.00107586, 0.00655341,
        0.00620209, 0.00065176, 0.00125753, 0.00149526, 0.00175891,
        0.00152381, 0.00158471, 0.00101943, 0.00101847, 0.00090959,
        0.00094251]),
 'mean_score_time': array([0.00839621, 0.00859556, 0.00852782, 0.00852935, 0.00992715,
        0.00946155, 0.00846173, 0.0081296 , 0.00846047, 0.008193  ,
        0.00812836, 0.00892779, 0.00806108, 0.00812872, 0.00826125,
        0.0085955 ]),
 'std_score_time': array([0.00061145, 0.00048964, 0.00061837, 0.00071882, 0.00261794,
        0.00249763, 0.00049762, 0.00049916, 0.00061835, 0.00039965,
        0.00034142, 0.0008524 , 0.00044291, 0.00049739, 0.00077118,
        0.00120007]),
 'param_modeling__C': mask

In [45]:
# Tabulate the search results: one row per parameter combination.
cvResult = pd.DataFrame.from_dict(gridSearch.cv_results_)

In [46]:
# List the available result columns before selecting the ones of interest.
cvResult.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_modeling__C', 'param_preprocessing__cat__impute__strategy',
       'param_preprocessing__numeric__impute__strategy', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'split10_test_score', 'split11_test_score',
       'split12_test_score', 'split13_test_score', 'split14_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [49]:
# Keep only the searched parameters and the mean cross-validated test score.
cvResult = cvResult.loc[
    :,
    [
        'param_modeling__C',
        'param_preprocessing__cat__impute__strategy',
        'param_preprocessing__numeric__impute__strategy',
        'mean_test_score',
    ],
]

In [51]:
# Rank the parameter combinations from best to worst mean CV score.
cvResult.sort_values('mean_test_score', ascending = False)

Unnamed: 0,param_modeling__C,param_preprocessing__cat__impute__strategy,param_preprocessing__numeric__impute__strategy,mean_test_score
0,0.1,most_frequent,mean,0.797872
1,0.1,most_frequent,median,0.797872
2,0.1,constant,mean,0.797872
3,0.1,constant,median,0.797872
8,10.0,most_frequent,mean,0.795035
12,100.0,most_frequent,mean,0.795035
7,1.0,constant,median,0.793676
14,100.0,constant,mean,0.793617
6,1.0,constant,mean,0.793617
10,10.0,constant,mean,0.793617


<a href="https://chatgpt.com/share/6c08f2d2-3934-47a2-b072-45adf0203c7e">Talk with ChatGPT</a>