Reference: scikit-learn `ColumnTransformer` documentation — https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Load the Social Network Ads CSV.
# NOTE(review): absolute local path - adjust it to wherever the file lives on your machine.
df = pd.read_csv(
    'D:\\mla-z\\P14-Naive-Bayes\\Naive_Bayes\\Social_Network_Ads.csv'
)
# Show five randomly drawn rows (no seed is set, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
278,15613014,Female,52,38000,1
205,15766609,Female,47,47000,0
37,15689425,Male,30,49000,0
306,15603942,Female,51,134000,0
189,15715160,Male,28,32000,0


In [0]:
# Feature matrix: every column except the first (User ID) and the last (Purchased),
# i.e. Gender, Age and EstimatedSalary in that order.
X = df[df.columns[1:-1]]
X.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000


In [0]:
# Target vector: the last column of the frame (Purchased).
y = df[df.columns[-1]]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

In [0]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

In [0]:

# Preprocessing is defined separately for numeric and categorical columns.
# Columns are addressed by their position in X (0 = Gender, 1 = Age,
# 2 = EstimatedSalary) rather than by name, so the fitted pipeline can also
# predict on plain lists / arrays that carry no column labels.
numeric_features = [1, 2]  # or ['Age', 'EstimatedSalary']
numeric_transformer = Pipeline(steps=[
    # Fill missing numbers with the column median, then standardise:
    # KNN is distance based, so both features must be on a comparable scale.
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = [0]  # or ['Gender']
categorical_transformer = Pipeline(steps=[
    # Impute with the most frequent category instead of a constant 'missing'
    # label: the encoder below uses handle_unknown='error', so a 'missing'
    # label that never occurred in the training data would make predict()
    # raise on the first row with a missing Gender. The mode is always a
    # category the encoder has seen during fit.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # drop='first' keeps one indicator column fewer than there are categories.
    ('onehot', OneHotEncoder(handle_unknown='error', drop='first'))])

# Route each group of columns through its own transformer; columns not listed
# here are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Full model: preprocessing followed by a 3-nearest-neighbours classifier.
# The step name 'classifier' is what the 'classifier__...' grid-search keys refer to.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier(n_neighbors=3))])


In [0]:
from sklearn.model_selection import train_test_split


In [0]:
# Hold out 25% of the rows for the final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=5,
)
# The splits stay pandas DataFrames here. Optional alternative (not executed):
# convert them to NumPy arrays with X_train.values / X_test.values.

In [0]:
# Fit the preprocessing steps and the KNN classifier on the training split only.
clf.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [0]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [0]:
# 5-fold cross-validation of the whole pipeline on the training split;
# n_jobs=-1 runs the folds on all available CPU cores.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=5,
    n_jobs=-1,
)

In [0]:
# Summarise the per-fold scores as mean +/- standard deviation.
cv_mean, cv_std = scores.mean(), scores.std()
print('CV score: %0.3f +/- %0.3f' % (cv_mean, cv_std))

CV score: 0.907 +/- 0.034


In [0]:
# Candidate neighbourhood sizes for the KNN step. The 'classifier__' prefix
# routes the parameter to the pipeline step named 'classifier'.
neighbor_grid = {'classifier__n_neighbors': [3, 5, 7, 9, 11]}
params = [neighbor_grid]

# Exhaustive search over the grid with 5-fold cross-validation on all CPU cores.
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)

In [0]:
# Run the grid search on the training split only, so the test split stays unseen.
gs.fit(X_train, y=y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [0]:
# Parameter combination from `params` that achieved the best cross-validated score.
gs.best_params_

{'classifier__n_neighbors': 3}

In [0]:
# Cross-validated score of that best parameter combination (measured on the training split).
gs.best_score_

0.9066666666666666

In [0]:
# Evaluate the model selected by the grid search on the held-out test split.
# The original scored `clf`, the hand-configured pipeline, which only matches
# the tuned model when the search happens to pick n_neighbors=3. `gs.score`
# delegates to the best estimator, which GridSearchCV refits on the whole
# training split by default (refit=True).
gs.score(X_test, y_test)

0.9

In [0]:
# Predict for one hand-written customer given as [Gender, Age, EstimatedSalary].
# A plain nested list works because the pipeline selects columns by position;
# had it been built with column names, the input would need
# columns=['Gender','Age','EstimatedSalary'].
new_customer = [['Female', 45, 100000]]
clf.predict(new_customer)

array([1], dtype=int64)

In [0]:
# Predict for several hand-written customers at once.
# The frame is labelled with the training columns: a DataFrame built without
# `columns=` gets integer labels 0..2, which do not match the names the
# pipeline saw during fit, and newer scikit-learn releases reject such a frame.
# The column order matches the positional indices used by the preprocessor.
new_customers = pd.DataFrame(
    [['Male', 45, 100000],
     ['Female', 45, 100000]],
    columns=X_train.columns,
)
clf.predict(new_customers)

array([1, 1], dtype=int64)