In [1]:
%config Completer.use_jedi = False
import os
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn import datasets
import seaborn as sns
import pickle as pickle

In [2]:
df = pd.read_csv('https://bit.ly/kaggletrain')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
x = df[['Pclass','Sex', 'Age', 'Fare', 'Embarked']]

In [5]:
y = df['Survived']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [7]:
df[['Age']].describe()

Unnamed: 0,Age
count,714.0
mean,29.699118
std,14.526497
min,0.42
25%,20.125
50%,28.0
75%,38.0
max,80.0


In [8]:
x_train.dtypes

Pclass        int64
Sex          object
Age         float64
Fare        float64
Embarked     object
dtype: object

In [9]:
num_feat = make_column_selector(dtype_include=np.number)
cat_feat = make_column_selector(dtype_exclude=np.number)

In [10]:
num_pipe = make_pipeline(
    KNNImputer(),
    StandardScaler()
)

In [11]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)

In [12]:
col_trans = make_column_transformer(
    (num_pipe, num_feat),
    (cat_pipe, cat_feat)
)

In [13]:
model_pipe = make_pipeline(
    col_trans, 
    LogisticRegression()
)

In [14]:
model_pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('knnimputer',
                                                                   KNNImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000241C38A94F0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                            

In [15]:
param_grid = {
    'columntransformer__pipeline-1__knnimputer__n_neighbors': [1,5],
    'logisticregression__C': [0.1, 0.5, 1]
}

In [16]:
grid = GridSearchCV(model_pipe, param_grid, cv=10, scoring='accuracy', verbose=10)

In [17]:
grid.fit(x_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV 1/10; 1/6] START columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1
[CV 1/10; 1/6] END columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1; total time=   0.1s
[CV 2/10; 1/6] START columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1
[CV 2/10; 1/6] END columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1; total time=   0.0s
[CV 3/10; 1/6] START columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1
[CV 3/10; 1/6] END columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1; total time=   0.0s
[CV 4/10; 1/6] START columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1
[CV 4/10; 1/6] END columntransformer__pipeline-1__knnimputer__n_neighbors=1, logisticregression__C=0.1; total time=   0.0s
[CV 5/10; 1/6] START column

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('knnimputer',
                                                                                          KNNImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x00000241C38A94F0>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                               

In [18]:
grid.best_params_

{'columntransformer__pipeline-1__knnimputer__n_neighbors': 5,
 'logisticregression__C': 0.1}

In [19]:
grid.best_score_

0.7892996870109548