In [4]:
import pandas as pd
import numpy as np
from numpy import nan as NA

'''
Another type of transformation for statistical modeling or machine learning
applications is converting a categorical variable into a “dummy” or “indicator”
matrix. If a column in a DataFrame has k distinct values, you would derive a
matrix or DataFrame with k columns containing all 1s and 0s. pandas has a
get_dummies function for doing this, though devising one yourself is not difficult.
'''


# Small example frame: a categorical 'key' column next to a running counter.
key_labels = ['b', 'b', 'a', 'c', 'a', 'b']
df = pd.DataFrame({'key': key_labels, 'data1': range(len(key_labels))})

df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [5]:
# One indicator column per distinct value found in the 'key' column.
pd.get_dummies(df.key)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [6]:
# Same indicator matrix, with every column label prefixed as 'key_<value>'.
dummies = pd.get_dummies(df.key, prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [7]:
# Keep 'data1' as a one-column DataFrame (note the list selector) and attach
# the indicator columns by index alignment.
df_with_dummy = df.loc[:, ['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [8]:
# The file is read with no header row, so column names are supplied here.
mnames = ['movie_id', 'title', 'genres']
# NOTE(review): machine-specific absolute path -- point this at your local
# copy of the dataset before running.
movies = pd.read_table(
    '/home/armin/Data/Development/Python/pydata-book-2nd-edition/datasets/movielens/movies.dat',
    sep='::', header=None, names=mnames,
    # A multi-character separator is interpreted as a regex, which only the
    # Python parser supports. Requesting it explicitly avoids the
    # ParserWarning emitted when pandas silently falls back from the C engine.
    engine='python',
)
movies.head(10)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [9]:
# Adding indicator variables for each genre

# Flatten every movie's '|'-separated genre string into one list of genre
# names (duplicates included, in order of appearance).
genre_tokens = [name for genres in movies.genres for name in genres.split('|')]

# pd.unique preserves first-appearance order. Recent pandas deprecates passing
# a plain Python list to it, so hand it an object ndarray instead; the result
# is still an object ndarray of distinct genre names.
all_genres = pd.unique(np.array(genre_tokens, dtype=object))
all_genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [10]:
# Start from an all-zero (movies x genres) float matrix; one column per genre.
n_movies, n_genres = len(movies), len(all_genres)
zero_matrix = np.zeros(shape=(n_movies, n_genres))
dummies = pd.DataFrame(data=zero_matrix, columns=all_genres)

# First row, to confirm every indicator starts at 0.
dummies.iloc[0]

Animation      0.0
Children's     0.0
Comedy         0.0
Adventure      0.0
Fantasy        0.0
Romance        0.0
Drama          0.0
Action         0.0
Crime          0.0
Thriller       0.0
Horror         0.0
Sci-Fi         0.0
Documentary    0.0
War            0.0
Musical        0.0
Mystery        0.0
Film-Noir      0.0
Western        0.0
Name: 0, dtype: float64

In [11]:
# Now, iterate through each movie and set entries in each row of dummies to 1. To do
# this, we use the dummies.columns to compute the column indices for each genre
# Genre string of the movie at index label 1, used as a worked example.
gen = movies['genres'].loc[1]
gen

In [12]:
# Integer positions, within dummies' columns, of each genre named in `gen`.
dummies.columns.get_indexer(target=gen.split('|'))

array([3, 1, 4])

In [16]:
# Mark each movie's genres: look up the column positions of its genre names
# and set those cells in row i to 1.
for i, genre in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(genre.split('|'))
    # Single positional setitem on the frame itself. The chained form
    # dummies.iloc[i][indices] = 1 assigns into an intermediate row object and
    # is not guaranteed to write back to dummies.
    dummies.iloc[i, indices] = 1
dummies.iloc[0]

Animation      1.0
Children's     1.0
Comedy         1.0
Adventure      0.0
Fantasy        0.0
Romance        0.0
Drama          0.0
Action         0.0
Crime          0.0
Thriller       0.0
Horror         0.0
Sci-Fi         0.0
Documentary    0.0
War            0.0
Musical        0.0
Mystery        0.0
Film-Noir      0.0
Western        0.0
Name: 0, dtype: float64

In [18]:
# Preview the first movie's indicator row with 'Genre_'-prefixed labels.
# add_prefix returns a new frame, so dummies itself keeps its column names.
dummies.add_prefix(prefix='Genre_').iloc[0]

Genre_Animation      1.0
Genre_Children's     1.0
Genre_Comedy         1.0
Genre_Adventure      0.0
Genre_Fantasy        0.0
Genre_Romance        0.0
Genre_Drama          0.0
Genre_Action         0.0
Genre_Crime          0.0
Genre_Thriller       0.0
Genre_Horror         0.0
Genre_Sci-Fi         0.0
Genre_Documentary    0.0
Genre_War            0.0
Genre_Musical        0.0
Genre_Mystery        0.0
Genre_Film-Noir      0.0
Genre_Western        0.0
Name: 0, dtype: float64

In [19]:
# you can combine dummies with movies
genre_indicators = dummies.add_prefix('Genre_')
# Drop any indicator columns left over from a previous run of this cell before
# joining. Without this, re-running the cell fails on overlapping column names
# because `movies` is rebound to the joined result.
movies = (
    movies
    .drop(columns=genre_indicators.columns, errors='ignore')
    .join(genre_indicators)
)
movies.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       