# Import necessary dependencies and settings

In [1]:
import pandas as pd
import numpy as np

# Transforming Nominal Features

Nominal attributes consist of discrete categorical values with no notion or sense of order amongst them. The idea here is to transform these attributes into a more representative numerical format which can be easily understood by downstream code and pipelines. Let’s look at a new dataset pertaining to video game sales.

In [2]:
vg_df = pd.read_csv('vgsales.csv', encoding='utf-8')
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[0:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Wii Sports,Wii,2006.0,Sports,Nintendo
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


### Get the list of unique video game genres 

In [3]:
generos = vg_df.Genre.unique()
generos

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

This tells us that we have 12 distinct video game genres. 

### We can now generate a label encoding scheme for mapping each category to a numeric value by leveraging scikit-learn LabelEncoder

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(generos)

le.classes_


array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

In [5]:
le.transform(le.classes_)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

### Show the transformed labels values and the dataframe

In [6]:
vg_df.Genre

0              Sports
1            Platform
2              Racing
3              Sports
4        Role-Playing
             ...     
16593        Platform
16594         Shooter
16595          Racing
16596          Puzzle
16597        Platform
Name: Genre, Length: 16598, dtype: object

In [7]:
vg_df['Género_OneHot_Encoder'] = le.transform(vg_df.Genre)
vg_df.loc[:,['Genre', 'Género_OneHot_Encoder']].head(10)


Unnamed: 0,Genre,Género_OneHot_Encoder
0,Sports,10
1,Platform,4
2,Racing,6
3,Sports,10
4,Role-Playing,7
5,Puzzle,5
6,Platform,4
7,Misc,3
8,Platform,4
9,Shooter,8



# Transforming Ordinal Features

Ordinal attributes are categorical attributes with a sense of order amongst the values. Let’s consider the Pokémon dataset. Let’s focus more specifically on the Type 1 attribute. We will think that each Type 1 has a different power that we can order.



In [8]:
poke_df = pd.read_csv('Pokemon.csv', encoding='latin1', index_col=0)
poke_df.head()

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False


In [9]:
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True) # toma todo el DataFrame (frac=1) 
poke_df.head()
# sample( random_state= semilla, frac=parte del DataFrame)
# reordena las filas del DataFrame y vuelve a llamar a los índices de 0 a len-1

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
0,Beedrill,Bug,Poison,395,65,90,40,45,80,75,3,False
1,Kingler,Water,,475,55,130,115,50,50,75,2,False
2,Golem,Rock,Ground,495,80,120,130,55,65,45,3,False
3,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,2,False
4,Ditto,Normal,,288,48,48,48,48,48,48,1,False


In [10]:
poke_df.columns

Index(['Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Stage', 'Legendary'],
      dtype='object')

### Show the different type 1 present in the dataset

In general, there is no generic module or function to map and transform these features into numeric representations based on order automatically. Hence we can use a custom encoding\mapping scheme based on a dictionary.

In [11]:
#poke_df['Type 1'].unique()

type_1_map = {'Bug': 1, 'Water': 2, 'Rock': 3, 'Normal': 4, 'Fighting': 5, 'Grass': 6, 'Poison': 7,
       'Fire': 8, 'Ghost': 9, 'Fairy': 10, 'Electric': 11, 'Dragon':12, 'Ground':13,
       'Psychic':14, 'Ice':15}

# map the values to the dataframe

poke_df['type_1_num'] = poke_df['Type 1'].map(type_1_map)
poke_df.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary,type_1_num
0,Beedrill,Bug,Poison,395,65,90,40,45,80,75,3,False,1
1,Kingler,Water,,475,55,130,115,50,50,75,2,False,2
2,Golem,Rock,Ground,495,80,120,130,55,65,45,3,False,3
3,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,2,False,4
4,Ditto,Normal,,288,48,48,48,48,48,48,1,False,4


# Encoding Categorical Features

## One-hot Encoding Scheme

In [12]:
poke_df[['Name', 'Stage', 'Legendary']].iloc[4:10]

Unnamed: 0,Name,Stage,Legendary
4,Ditto,1,False
5,Primeape,2,False
6,Aerodactyl,1,False
7,Vileplume,3,False
8,Nidorina,2,False
9,Starmie,2,False


In [13]:
from sklearn.preprocessing import LabelEncoder

# transform and map pokemon Type 1 with LabelEncoder

le_pokemon = preprocessing.LabelEncoder()
le_pokemon.fit(poke_df['Type 1'])

le_pokemon.classes_

le_pokemon.transform(le_pokemon.classes_)

poke_df['Type 1 zip'] = poke_df['Type 1'].map(dict(zip(le_pokemon.classes_,le_pokemon.transform(le_pokemon.classes_))))

# type_1_num y Type 1 zip NO COINCIDEN porque primero he codificado yo el diccionario como he querido
# y luego he utilizado los métodos que codifican b

# transform and map pokemon legendary status with Label Encoder

le_pokemon.fit(poke_df['Legendary'])
poke_df['Legendary zip'] = poke_df['Legendary'].map(dict(zip(le_pokemon.classes_,le_pokemon.transform(le_pokemon.classes_))))



In [14]:
# Otra forma más sencilla utilizando transform
# ¡Para esto vale fit y transform!
# Muchas transformaciones se dividen en fit (ajusta los parámetros de la transformación)
# y en transform (aplica los cambios)

le_pokemon.fit(poke_df['Type 1'])
poke_df['Type 1 transformed'] = le_pokemon.transform(poke_df['Type 1'])

In [15]:
poke_df.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary,type_1_num,Type 1 zip,Legendary zip,Type 1 transformed
0,Beedrill,Bug,Poison,395,65,90,40,45,80,75,3,False,1,0,0,0
1,Kingler,Water,,475,55,130,115,50,50,75,2,False,2,14,0,14
2,Golem,Rock,Ground,495,80,120,130,55,65,45,3,False,3,13,0,13
3,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,2,False,4,10,0,10
4,Ditto,Normal,,288,48,48,48,48,48,48,1,False,4,10,0,10


In [16]:
# comprobamos que la codificación del método es alfabética
poke_df.sort_values('Type 1 zip')['Type 1'].unique()


array(['Bug', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire', 'Ghost',
       'Grass', 'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock',
       'Water'], dtype=object)

In [17]:
poke_df.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary,type_1_num,Type 1 zip,Legendary zip,Type 1 transformed
0,Beedrill,Bug,Poison,395,65,90,40,45,80,75,3,False,1,0,0,0
1,Kingler,Water,,475,55,130,115,50,50,75,2,False,2,14,0,14
2,Golem,Rock,Ground,495,80,120,130,55,65,45,3,False,3,13,0,13
3,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,2,False,4,10,0,10
4,Ditto,Normal,,288,48,48,48,48,48,48,1,False,4,10,0,10


The features Type 1 zip and Legendary_zip now depict the numeric representations of our categorical features. Let’s now apply the one-hot encoding scheme on these features. Apply the get_dummies() method.

In [18]:
# encode Type 1 labels using one-hot encoding scheme

one_hot_df_type_1 = pd.get_dummies(poke_df['Type 1'], prefix='Type_1')

# encode legendary status labels using one-hot encoding scheme
one_hot_df_legendary = pd.get_dummies(poke_df['Legendary'], prefix='Legendary')

In [19]:
sum(one_hot_df_legendary['Legendary_True'] == 1)
# compruebo que solo hay 4 pokemon legendarios

4

In [20]:
df_one_hot = pd.concat([poke_df, one_hot_df_type_1, one_hot_df_legendary], axis=1)
df_one_hot

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,...,Type_1_Grass,Type_1_Ground,Type_1_Ice,Type_1_Normal,Type_1_Poison,Type_1_Psychic,Type_1_Rock,Type_1_Water,Legendary_False,Legendary_True
0,Beedrill,Bug,Poison,395,65,90,40,45,80,75,...,0,0,0,0,0,0,0,0,1,0
1,Kingler,Water,,475,55,130,115,50,50,75,...,0,0,0,0,0,0,0,1,1,0
2,Golem,Rock,Ground,495,80,120,130,55,65,45,...,0,0,0,0,0,0,1,0,1,0
3,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,...,0,0,0,1,0,0,0,0,1,0
4,Ditto,Normal,,288,48,48,48,48,48,48,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,Vaporeon,Water,,525,130,65,60,110,95,65,...,0,0,0,0,0,0,0,1,1,0
147,Omanyte,Rock,Water,355,35,40,100,90,55,35,...,0,0,0,0,0,0,1,0,1,0
148,Tentacruel,Water,Poison,515,80,70,65,80,120,100,...,0,0,0,0,0,0,0,1,1,0
149,Kabutops,Rock,Water,495,60,115,105,65,70,80,...,0,0,0,0,0,0,1,0,1,0


Consider you built this encoding scheme on your training data and built some model and now you have some new data which has to be engineered for features before predictions as follows.

In [21]:
new_poke_df = pd.DataFrame([['PikaZoom', 'Bug', True], 
                           ['CharMyToast', 'Water', False]],
                           columns=['Name', 'Type 1', 'Legendary'])
new_poke_df

Unnamed: 0,Name,Type 1,Legendary
0,PikaZoom,Bug,True
1,CharMyToast,Water,False


In [22]:
le_pokemon.fit(poke_df['Type 1'])
new_type1_labels = le_pokemon.transform(new_poke_df['Type 1'])
new_poke_df['Type1_Label'] = new_type1_labels

le_pokemon.fit(poke_df['Legendary'])
new_leg_labels = le_pokemon.transform(new_poke_df['Legendary'])
new_poke_df['Lgnd_Label'] = new_leg_labels

new_poke_df[['Name', 'Type 1', 'Type1_Label', 'Legendary', 'Lgnd_Label']]

Unnamed: 0,Name,Type 1,Type1_Label,Legendary,Lgnd_Label
0,PikaZoom,Bug,0,True,1
1,CharMyToast,Water,14,False,0


You can leverage scikit-learn’s excellent API here by calling the transform(…) function of the previously build LabeLEncoder objects on the new data.

## Dummy Coding Scheme

Let’s try applying dummy coding scheme on Pokémon Type 1 by dropping the first level binary encoded feature (Type 1 = Bug).


In [23]:
dummy_features = pd.get_dummies(poke_df, columns = ['Type 1'], drop_first=True)
dummy_features.iloc[4:10] # iloc entiende que tomamos desde la cuarta fila hasta la novena


Unnamed: 0,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,...,Type 1_Fire,Type 1_Ghost,Type 1_Grass,Type 1_Ground,Type 1_Ice,Type 1_Normal,Type 1_Poison,Type 1_Psychic,Type 1_Rock,Type 1_Water
4,Ditto,,288,48,48,48,48,48,48,1,...,0,0,0,0,0,1,0,0,0,0
5,Primeape,,455,65,105,60,60,70,95,2,...,0,0,0,0,0,0,0,0,0,0
6,Aerodactyl,Flying,515,80,105,65,60,75,130,1,...,0,0,0,0,0,0,0,0,1,0
7,Vileplume,Poison,490,75,80,85,110,90,50,3,...,0,0,1,0,0,0,0,0,0,0
8,Nidorina,,365,70,62,67,55,55,56,2,...,0,0,0,0,0,0,1,0,0,0
9,Starmie,Psychic,520,60,75,85,100,85,115,2,...,0,0,0,0,0,0,0,0,0,1


If you want, you can also choose to drop the last level binary encoded feature

In [24]:
le_pokemon.fit(poke_df['Type 1'])
le_pokemon.classes_

array(['Bug', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire', 'Ghost',
       'Grass', 'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock',
       'Water'], dtype=object)

In [25]:
dummy_features_todas = pd.get_dummies(poke_df, columns = ['Type 1'], prefix = ['Tipo1'])
dummy_features_todas.head()

Unnamed: 0,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,...,Tipo1_Fire,Tipo1_Ghost,Tipo1_Grass,Tipo1_Ground,Tipo1_Ice,Tipo1_Normal,Tipo1_Poison,Tipo1_Psychic,Tipo1_Rock,Tipo1_Water
0,Beedrill,Poison,395,65,90,40,45,80,75,3,...,0,0,0,0,0,0,0,0,0,0
1,Kingler,,475,55,130,115,50,50,75,2,...,0,0,0,0,0,0,0,0,0,1
2,Golem,Ground,495,80,120,130,55,65,45,3,...,0,0,0,0,0,0,0,0,1,0
3,Pidgeotto,Flying,349,63,60,55,50,50,71,2,...,0,0,0,0,0,1,0,0,0,0
4,Ditto,,288,48,48,48,48,48,48,1,...,0,0,0,0,0,1,0,0,0,0


In [26]:
dummy_features_todas.columns.isin(['Tipo1_Bug'])

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [27]:
~dummy_features_todas.columns.isin(['Tipo1_Bug'])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [28]:
dummy_features_todas.columns[~dummy_features_todas.columns.isin(['Tipo1_Bug'])]

Index(['Name', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Stage', 'Legendary', 'type_1_num', 'Type 1 zip',
       'Legendary zip', 'Type 1 transformed', 'Tipo1_Dragon', 'Tipo1_Electric',
       'Tipo1_Fairy', 'Tipo1_Fighting', 'Tipo1_Fire', 'Tipo1_Ghost',
       'Tipo1_Grass', 'Tipo1_Ground', 'Tipo1_Ice', 'Tipo1_Normal',
       'Tipo1_Poison', 'Tipo1_Psychic', 'Tipo1_Rock', 'Tipo1_Water'],
      dtype='object')

In [29]:

dummy_features_sin_primera = dummy_features_todas[dummy_features_todas.columns[~dummy_features_todas.columns.isin(['Tipo1_Bug'])]]
dummy_features_sin_primera.head()


Unnamed: 0,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,...,Tipo1_Fire,Tipo1_Ghost,Tipo1_Grass,Tipo1_Ground,Tipo1_Ice,Tipo1_Normal,Tipo1_Poison,Tipo1_Psychic,Tipo1_Rock,Tipo1_Water
0,Beedrill,Poison,395,65,90,40,45,80,75,3,...,0,0,0,0,0,0,0,0,0,0
1,Kingler,,475,55,130,115,50,50,75,2,...,0,0,0,0,0,0,0,0,0,1
2,Golem,Ground,495,80,120,130,55,65,45,3,...,0,0,0,0,0,0,0,0,1,0
3,Pidgeotto,Flying,349,63,60,55,50,50,71,2,...,0,0,0,0,0,1,0,0,0,0
4,Ditto,,288,48,48,48,48,48,48,1,...,0,0,0,0,0,1,0,0,0,0


## Feature Hashing scheme

Find the number of different 'Genre' in the dataset.

In [30]:
# Usa vgsales.csv

df_videojuegos =  pd.read_csv('vgsales.csv')
df_videojuegos.head()


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [31]:
print('Total game genres: ' + str(len(df_videojuegos.Genre.unique())))
print(df_videojuegos.Genre.sort_values().unique())

Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']


### We can see that there are a total of 12 genres of video games. If we used a one-hot encoding scheme on the Genre feature, we would end up having 12 binary features. Instead, we will now use a feature hashing scheme by leveraging scikit-learn’s FeatureHasher class, which uses a signed 32-bit version of the Murmurhash3 hash function. We will pre-define the final feature vector size to be 6 in this case.

In [32]:
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=6, input_type='string')
f = h.transform(df_videojuegos['Genre'])

f.toarray()

array([[-2.,  2.,  0., -2.,  0.,  0.],
       [ 0.,  2.,  2., -1.,  1.,  0.],
       [-1.,  0.,  0.,  0.,  0., -1.],
       ...,
       [-1.,  0.,  0.,  0.,  0., -1.],
       [ 0.,  1.,  1., -2.,  1., -1.],
       [ 0.,  2.,  2., -1.,  1.,  0.]])

In [33]:
f.toarray().shape

(16598, 6)

In [34]:
df_videojuegos.Genre.size

16598

In [35]:
df_videojuegos['Genre'].unique().size

12