In [1]:
import pandas as pd
import numpy as np

#### Label Encoding 

In [5]:
data = pd.read_csv('vgsales.csv',encoding='utf-8')
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [7]:
#Taking only the categorical features
data[['Name','Platform','Year','Genre','Publisher']][1:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


In [9]:
##Taking out the number of unique genres from the dataset
genres = np.unique(data['Genre'])
print(genres ,"\nNo. of genres in the dataset::",len(genres))


['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy'] 
No. of genres in the dataset:: 12


In [10]:
## We will now use the sci-kit learn's LabelEncoder which converts the labels into the integer values
from sklearn.preprocessing import LabelEncoder
gle = LabelEncoder()
gle_labels = gle.fit_transform(data['Genre'])

In [17]:
#create a dictionary to store the values of the labels along with the integer mappings 
genre_mappings = {index:label for index,label in enumerate(gle.classes_)}
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [19]:
#We will add this as a new column or feature variable in the dataset
data['Genre Label'] = gle_labels
data[['Name','Platform','Year','Genre','Publisher','Genre Label']][1:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Genre Label
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,4
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,6
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,10
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,7
5,Tetris,GB,1989.0,Puzzle,Nintendo,5
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,4


###### Thus a mapping scheme has been generated where each genre value is mapped to a number with the help of the LabelEncoder object gle. The transformed labels are stored in the genre_labels value

#### One Hot Encoding 

##### the one-hot encoding scheme, encodes or transforms the attribute into m binary features which can only contain a value of 1 or 0. 
##### Each observation in the categorical feature is thus converted into a vector of size m with only one of the values as 1 (indicating it as active

In [27]:
#lets try to convert the above converted feature into one hot encoded vectors
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
genre_ohe = ohe.fit_transform(data[['Genre Label']]).toarray() #converting the output to array
genre_ohe 

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
#lets create a data frame of the labels and its encoded features
genre_labels=gle.classes_
feature_data = pd.DataFrame(genre_ohe,columns=genre_labels)
feature_data.head()

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [33]:
#lets concat/merge these features into the dataset
data_sub = data[['Name','Platform','Year','Publisher']]
encoded_data = pd.concat([data_sub,feature_data],axis=1)
encoded_data.head()

Unnamed: 0,Name,Platform,Year,Publisher,Action,Adventure,Fighting,Misc,Platform.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,Wii Sports,Wii,2006.0,Nintendo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Super Mario Bros.,NES,1985.0,Nintendo,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mario Kart Wii,Wii,2008.0,Nintendo,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Wii Sports Resort,Wii,2009.0,Nintendo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Nintendo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [35]:
## We can also achieve the one hot encoding using the "dummies" method in pandas
one_hot_encode = pd.get_dummies(data['Genre'])
one_hot_encode

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16593,0,0,0,0,1,0,0,0,0,0,0,0
16594,0,0,0,0,0,0,0,0,1,0,0,0
16595,0,0,0,0,0,0,1,0,0,0,0,0
16596,0,0,0,0,0,1,0,0,0,0,0,0


### Feature Hashing Scheme

#### even if we have over 1000 distinct categories in a feature and we set b=10 as the final feature vector size, the output feature set will still have only 10 features as compared to 1000 binary features if we used a one-hot encoding scheme.

In [36]:
#lets take example of the 12 unique labels of genre from the data
genres = np.unique(data['Genre'])
print(genres ,"\nNo. of genres in the dataset::",len(genres))

['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy'] 
No. of genres in the dataset:: 12


In [39]:
from sklearn.feature_extraction import FeatureHasher

fhe = FeatureHasher(n_features=5,input_type='string')
hashedFeatures = fhe.fit_transform(data['Genre'])
hashedFeatures = hashedFeatures.toarray()

In [42]:
dataset = pd.concat([data[['Name','Genre']],pd.DataFrame(hashedFeatures)],axis=1)
dataset.head()

Unnamed: 0,Name,Genre,0,1,2,3,4
0,Wii Sports,Sports,1.0,-1.0,0.0,0.0,-2.0
1,Super Mario Bros.,Platform,2.0,0.0,1.0,2.0,-1.0
2,Mario Kart Wii,Racing,1.0,-1.0,-2.0,0.0,0.0
3,Wii Sports Resort,Sports,1.0,-1.0,0.0,0.0,-2.0
4,Pokemon Red/Pokemon Blue,Role-Playing,1.0,-2.0,1.0,1.0,1.0


#### Even if it has 12 labels , we can reduce it to number of features we want to have 

#### Reference :https://towardsdatascience.com/understanding-feature-engineering-part-2-categorical-data-f54324193e63