In [1]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy categorical sample: ten temperature readings using three distinct labels.
data = [
    "cold", "cold", "warm", "cold", "hot",
    "hot", "warm", "cold", "warm", "hot",
]
# NumPy array view of the labels; this is what the label encoder is fitted on.
values = array(data)
print(values)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


#### LabelEncoder creates an integer encoding of the labels, and <br>OneHotEncoder creates a one-hot encoding of the integer-encoded values.

In [3]:
# integer encode: LabelEncoder assigns each distinct string label an integer
# code; fit_transform learns the label set and converts `values` in one call.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

[0 0 2 0 1 1 2 0 2 1]


In [4]:
# Round-trip check: map the integer codes back to their original string labels.
inverse = label_encoder.inverse_transform(integer_encoded)
inverse

array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold',
       'warm', 'hot'], dtype='<U4')

In [5]:
# binary (one-hot) encode
# `sparse=False` was renamed to `sparse_output=False` in scikit-learn 1.2 and
# the old keyword was removed in 1.4. Prefer the new name and fall back to the
# old one only on releases that do not accept it yet.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# OneHotEncoder expects a 2-D (n_samples, n_features) array, so turn the 1-D
# vector of codes into a single column; -1 lets NumPy infer the row count.
integer_encoded = integer_encoded.reshape(-1, 1)
print(integer_encoded)

[[0]
 [0]
 [2]
 [0]
 [1]
 [1]
 [2]
 [0]
 [2]
 [1]]


In [6]:
# Fit the one-hot encoder on the integer column and expand every code into a
# row with a single 1 in the position of its category.
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [7]:
# Invert a single example: take row 2 (the third sample, not the first), find
# the column holding the 1, and map that class index back to its string label.
sample_row = 2
class_index = argmax(onehot_encoded[sample_row, :])
inverted = label_encoder.inverse_transform([class_index])
print(inverted)

['warm']


In [9]:
import pandas as pd

# Small demo frame: a numeric id column and a categorical country column.
ids = [10, 20, 30, 40, 50, 60, 70]
countries = ['Nepal', 'India', 'China', 'India', 'Germany', 'China', 'Nepal']
# Building from a dict keeps each column name next to its data.
df = pd.DataFrame({'Ids': ids, 'Country': countries})
df

Unnamed: 0,Ids,Country
0,10,Nepal
1,20,India
2,30,China
3,40,India
4,50,Germany
5,60,China
6,70,Nepal


In [8]:
# Swap the country names for integer class codes.
# Two side effects worth knowing about:
#   * `label_encoder` is rebound to a new encoder, replacing the one that was
#     fitted on the temperature labels earlier in the notebook;
#   * the assignment overwrites df['Country'], so the original strings are no
#     longer present in df afterwards.
label_encoder = LabelEncoder()
country_codes = label_encoder.fit_transform(df['Country'])
df['Country'] = country_codes
print(df.head(10))
df.shape

   Ids  Country
0   10        3
1   20        2
2   30        0
3   40        2
4   50        1
5   60        0
6   70        3


(7, 2)

In [10]:
# One-hot encode the 'Country' column. The encoder needs 2-D input, hence the
# reshape of the column into shape (n_rows, 1). This refits `onehot_encoder`.
country_column = df.Country.values.reshape(-1, 1)
X = onehot_encoder.fit_transform(country_column)
X

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [11]:
# Frequency of each distinct value in 'Country'.
# (The former `df = pd.DataFrame(df)` re-wrap and the bare `df['Country']`
# expression had no effect on the result and were removed.)
# NOTE: when the cells are run top to bottom, 'Country' already holds the
# integer codes written by the label-encoding cell, so the counts are keyed
# by code rather than by country name.
rr = df["Country"].value_counts()
rr

Nepal      2
India      2
China      2
Germany    1
Name: Country, dtype: int64

In [12]:
# Add the one-hot columns back into the original dataframe.
# The number of columns is taken from X rather than hard-coded, and df's index
# is reused so that concat lines the rows up even when df is not indexed 0..n-1.
dfOneHot = pd.DataFrame(
    X,
    columns=["Country_" + str(i) for i in range(X.shape[1])],
    index=df.index,
)

df = pd.concat([df, dfOneHot], axis=1)
# Drop the 'Country' column now that its one-hot expansion is present.
df = df.drop(['Country'], axis=1)
# Print to verify.
print(df.head(8))

   Ids  Country_0  Country_1  Country_2  Country_3
0   10        0.0        0.0        0.0        1.0
1   20        0.0        0.0        1.0        0.0
2   30        1.0        0.0        0.0        0.0
3   40        0.0        0.0        1.0        0.0
4   50        0.0        1.0        0.0        0.0
5   60        1.0        0.0        0.0        0.0
6   70        0.0        0.0        0.0        1.0


In [13]:
# pad_sequences pads at the front of each sequence by default (padding="pre").
# Here three sequences of different lengths are pre-padded with 0 values so
# that they all end up as long as the longest one.

from keras.preprocessing.sequence import pad_sequences

# Input sequences of length 4, 3 and 1.
sequences = [[1, 2, 3, 4], [1, 2, 3], [1]]
# The default is spelled out so the padding side is visible in the call.
padded = pad_sequences(sequences, padding="pre")
print(padded)

[[1 2 3 4]
 [0 1 2 3]
 [0 0 0 1]]


In [14]:
# Post-padding: the zeros are appended after each sequence instead of before it.
padded = pad_sequences(sequences, padding="post")
print(padded)

[[1 2 3 4]
 [1 2 3 0]
 [1 0 0 0]]


In [15]:
# With maxlen set, sequences longer than maxlen are cut down. By default the
# time steps are removed from the beginning of the sequence (pre-sequence
# truncation), so the last `maxlen` values are the ones that remain.

# pre-truncate every sequence to length 2 (default spelled out explicitly)
truncated = pad_sequences(sequences, maxlen=2, truncating="pre")
print(truncated)

[[3 4]
 [2 3]
 [0 1]]


In [16]:
# Post-truncation: keep the first `maxlen` values and drop the rest.
# Sequences shorter than maxlen are still padded at the front, because the
# padding side is left at its default.
truncated = pad_sequences(sequences, truncating="post", maxlen=2)
print(truncated)

[[1 2]
 [1 2]
 [0 1]]


# Pandas shift() Function

In [15]:
from pandas import DataFrame

# Build a single-column frame holding the sequence 0..9 in column 't'.
# Note that this rebinds `df`, replacing the country dataframe used earlier.
df = DataFrame({'t': list(range(10))})
print(df)

   t
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9


In [16]:
# Shift forward: shift(1) moves every value down one row, so row i receives
# the value of 't' from row i-1 and the first row becomes NaN.
# Note that, despite its 't+1' label, this column therefore holds the value
# from the previous time step.
df['t+1'] = df['t'].shift(periods=1)
print(df)

   t  t+1
0  0  NaN
1  1  0.0
2  2  1.0
3  3  2.0
4  4  3.0
5  5  4.0
6  6  5.0
7  7  6.0
8  8  7.0
9  9  8.0


In [17]:
# Shift backward: shift(-1) moves every value up one row, so row i receives
# the value of 't' from row i+1 and the last row becomes NaN.
# Note that, despite its 't-1' label, this column therefore holds the value
# from the next time step.
df['t-1'] = df['t'].shift(periods=-1)
print(df)

   t  t+1  t-1
0  0  NaN  1.0
1  1  0.0  2.0
2  2  1.0  3.0
3  3  2.0  4.0
4  4  3.0  5.0
5  5  4.0  6.0
6  6  5.0  7.0
7  7  6.0  8.0
8  8  7.0  9.0
9  9  8.0  NaN
