In [0]:
import pandas as pd

In [3]:
df = pd.read_csv("chronic_kidney_disease.csv",
                 header=None,
                 names=['age', 'bp', 'sg', 'al', 
                        'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 
                        'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 
                        'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 
                        'pe', 'ane', 'class'])
df.head()


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,36,1.2,?,?,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,18,0.8,?,?,11.3,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,53,1.8,?,?,9.6,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,56,3.8,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,26,1.4,?,?,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# First, we need to convert the dataframe into a dictionary.
# This can be achieved by Pandas to_dict method.
#
df_dict = df.to_dict(orient='records') # turn each row as key-value pairs.

print(f" Dictionary created: {df_dict}")

# The orient='records' is required to turn the data frame 
# into a {column:value} format.

 Dictionary created: [{'age': '48', 'bp': '80', 'sg': '1.020', 'al': '1', 'su': '0', 'rbc': '?', 'pc': 'normal', 'pcc': 'notpresent', 'ba': 'notpresent', 'bgr': '121', 'bu': '36', 'sc': '1.2', 'sod': '?', 'pot': '?', 'hemo': '15.4', 'pcv': '44', 'wc': '7800', 'rc': '5.2', 'htn': 'yes', 'dm': 'yes', 'cad': 'no', 'appet': 'good', 'pe': 'no', 'ane': 'no', 'class': 'ckd'}, {'age': '7', 'bp': '50', 'sg': '1.020', 'al': '4', 'su': '0', 'rbc': '?', 'pc': 'normal', 'pcc': 'notpresent', 'ba': 'notpresent', 'bgr': '?', 'bu': '18', 'sc': '0.8', 'sod': '?', 'pot': '?', 'hemo': '11.3', 'pcv': '38', 'wc': '6000', 'rc': '?', 'htn': 'no', 'dm': 'no', 'cad': 'no', 'appet': 'good', 'pe': 'no', 'ane': 'no', 'class': 'ckd'}, {'age': '62', 'bp': '80', 'sg': '1.010', 'al': '2', 'su': '3', 'rbc': 'normal', 'pc': 'normal', 'pcc': 'notpresent', 'ba': 'notpresent', 'bgr': '423', 'bu': '53', 'sc': '1.8', 'sod': '?', 'pot': '?', 'hemo': '9.6', 'pcv': '31', 'wc': '7500', 'rc': '?', 'htn': 'no', 'dm': 'yes', 'cad':

In [0]:
# Now we instantiate a DictVectorizer:
#
from sklearn.feature_extraction import DictVectorizer

# instantiate a dictvectorizer object for df

dv_df = DictVectorizer(sparse=False)
# sparse = False output an array not a sparse matrix

In [7]:
# Apply the dv_df on the df_dict
#
df_encoded = dv_df.fit_transform(df_dict)

# Show encoded df
df_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Each row represents a sample and each column represents a feature. If we want to know what feature for each column, we can check the vocabulary of this DictVectorizer:

In [8]:
# Vocabulary
vocab = dv_df.vocabulary_

# Show vocabulary
vocab

{'age=11': 0,
 'age=12': 1,
 'age=14': 2,
 'age=15': 3,
 'age=17': 4,
 'age=19': 5,
 'age=2': 6,
 'age=20': 7,
 'age=21': 8,
 'age=22': 9,
 'age=23': 10,
 'age=24': 11,
 'age=25': 12,
 'age=26': 13,
 'age=27': 14,
 'age=28': 15,
 'age=29': 16,
 'age=3': 17,
 'age=30': 18,
 'age=32': 19,
 'age=33': 20,
 'age=34': 21,
 'age=35': 22,
 'age=36': 23,
 'age=37': 24,
 'age=38': 25,
 'age=39': 26,
 'age=4': 27,
 'age=40': 28,
 'age=41': 29,
 'age=42': 30,
 'age=43': 31,
 'age=44': 32,
 'age=45': 33,
 'age=46': 34,
 'age=47': 35,
 'age=48': 36,
 'age=49': 37,
 'age=5': 38,
 'age=50': 39,
 'age=51': 40,
 'age=52': 41,
 'age=53': 42,
 'age=54': 43,
 'age=55': 44,
 'age=56': 45,
 'age=57': 46,
 'age=58': 47,
 'age=59': 48,
 'age=6': 49,
 'age=60': 50,
 'age=61': 51,
 'age=62': 52,
 'age=63': 53,
 'age=64': 54,
 'age=65': 55,
 'age=66': 56,
 'age=67': 57,
 'age=68': 58,
 'age=69': 59,
 'age=7': 60,
 'age=70': 61,
 'age=71': 62,
 'age=72': 63,
 'age=73': 64,
 'age=74': 65,
 'age=75': 66,
 'age=76': 

### Getting Dummies

In [9]:
# Get dummies
#
df_x = pd.get_dummies(df, prefix_sep='_', drop_first=True)

df_x.head()

Unnamed: 0,age_12,age_14,age_15,age_17,age_19,age_2,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,age_28,age_29,age_3,age_30,age_32,age_33,age_34,age_35,age_36,age_37,age_38,age_39,age_4,age_40,age_41,age_42,age_43,age_44,age_45,age_46,age_47,age_48,age_49,age_5,age_50,age_51,...,rc_4.3,rc_4.4,rc_4.5,rc_4.6,rc_4.7,rc_4.8,rc_4.9,rc_5,rc_5.0,rc_5.1,rc_5.2,rc_5.3,rc_5.4,rc_5.5,rc_5.6,rc_5.7,rc_5.8,rc_5.9,rc_6.0,rc_6.1,rc_6.2,rc_6.3,rc_6.4,rc_6.5,rc_8.0,rc_?,htn_no,htn_yes,dm_?,dm_no,dm_yes,cad_no,cad_yes,appet_good,appet_poor,pe_no,pe_yes,ane_no,ane_yes,class_notckd
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0,1,1,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0


 DictVectorizer is a one step method to encode and support sparse matrix output. Pandas get dummies method is so far the most straight forward and easiest way to encode categorical features. The output will remain dataframe type.

As my point of view, the first choice method will be pandas get dummies. But if the number of categorical features are huge, DictVectorizer will be a good choice as it supports sparse matrix output.