In [186]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn import set_config 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [187]:
set_config(display="diagram")

In [188]:
df=pd.read_csv("./datasets/train.csv")

In [189]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [190]:
df=df[["Pclass","Age","Fare","Survived"]]

In [191]:
df.shape

(891, 4)

In [192]:
df.head(5)

Unnamed: 0,Pclass,Age,Fare,Survived
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [193]:
df.duplicated().sum()

145

In [194]:
df.drop_duplicates(inplace=True)

In [195]:
df.shape

(746, 4)

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 746 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    746 non-null    int64  
 1   Age       658 non-null    float64
 2   Fare      746 non-null    float64
 3   Survived  746 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 29.1 KB


Age--> having the missing value

In [197]:
df.isnull().mean()*100

Pclass       0.000000
Age         11.796247
Fare         0.000000
Survived     0.000000
dtype: float64

In [198]:
df.describe()

Unnamed: 0,Pclass,Age,Fare,Survived
count,746.0,658.0,746.0,746.0
mean,2.227882,29.841064,34.787785,0.41689
std,0.860201,14.799527,50.177031,0.493375
min,1.0,0.42,0.0,0.0
25%,1.0,20.0,8.05,0.0
50%,3.0,28.25,16.1,0.0
75%,3.0,39.0,34.91355,1.0
max,3.0,80.0,512.3292,1.0


In [199]:
df.corr()

Unnamed: 0,Pclass,Age,Fare,Survived
Pclass,1.0,-0.368138,-0.565326,-0.331683
Age,-0.368138,1.0,0.094089,-0.084596
Fare,-0.565326,0.094089,1.0,0.24751
Survived,-0.331683,-0.084596,0.24751,1.0


# train_test_split

In [200]:
X_train,X_test,y_train,y_test=train_test_split(df.iloc[:,:3],df.iloc[:,3:4])

In [201]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((559, 3), (187, 3), (559, 1), (187, 1))

In [202]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 559 entries, 730 to 668
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  559 non-null    int64  
 1   Age     498 non-null    float64
 2   Fare    559 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 17.5 KB


# Pipeline Using KNNImputer

In [245]:
trf1=ColumnTransformer(transformers=[
    ('KNNImputer',KNNImputer(weights="distance",n_neighbors=5
                            ),[0,1,2])
],remainder="passthrough")

In [246]:
trf2=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,3))
])

In [247]:
trf3=SelectKBest(score_func=chi2,k=3)

In [248]:
trf4=LogisticRegression()

In [249]:
pipe=Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4)])

In [250]:
pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [251]:
y_pred=pipe.predict(X_test)

In [252]:
accuracy_score(y_pred,y_test)

0.6524064171122995

In [253]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.7011904761904761

# Pipeline using SimpleImputer

In [212]:
trf1=ColumnTransformer(transformers=[
    ('KNNImputer',SimpleImputer(),[1])
],remainder="passthrough")

In [213]:
trf2=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,3))
])

In [214]:
trf3=SelectKBest(score_func=chi2,k=3)

In [215]:
trf4=LogisticRegression()

In [216]:
pipe=Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4)])

In [217]:
pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [218]:
y_pred=pipe.predict(X_test)

In [219]:
accuracy_score(y_pred,y_test)

0.6524064171122995

In [220]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.6958333333333334