In [1]:
# Feature/target selection
# Reads the CSV once, then slices it positionally into a feature matrix `x`
# and a target vector `y`.

import numpy as np
import pandas as pd

# Positional indices 2..11 (inclusive) of the CSV are used as features.
featureColumns = [*range(2, 12)]

dataset = pd.read_csv("dataset.csv")

# The last CSV column is taken as the target; both results are plain arrays.
y = dataset.iloc[:, -1].values
x = dataset.iloc[:, featureColumns].values

# Peek at the first feature row.
x[0]

array([504, 'Spain', 'Male', 34, 0, 54980.81, 1, 1, 1, 136909.88],
      dtype=object)

In [2]:
# Label encoding
# Replaces the values in each categorical feature column with integer codes,
# writing the codes back into `x` in place.

from sklearn.preprocessing import LabelEncoder

# Positions (within `x`) of the columns to encode.
encodingColumns = [1, 2]

for ec in encodingColumns:
    # A fresh encoder per column, so every column gets its own code table.
    labelEncoderX = LabelEncoder()
    labelEncoderX.fit(x[:, ec])
    x[:, ec] = labelEncoderX.transform(x[:, ec])

# Show the first five rows after encoding.
x[:5]

array([[504, 2, 1, 34, 0, 54980.81, 1, 1, 1, 136909.88],
       [659, 0, 0, 33, 7, 89939.62, 1, 1, 0, 136540.09],
       [473, 1, 0, 32, 5, 146602.25, 2, 1, 1, 72946.95],
       [637, 2, 0, 40, 6, 0.0, 2, 1, 1, 181610.6],
       [657, 2, 1, 75, 7, 126273.95, 1, 0, 1, 91673.6]], dtype=object)

In [3]:
# One hot encoding
# Expands the label-encoded columns listed in `encodingColumns` into 0/1
# indicator columns and leaves every other column untouched.
#
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22, so the column selection is done with ColumnTransformer.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# remainder="passthrough" appends the non-encoded columns AFTER the indicator
# columns, which is the same column order the old `categorical_features` API
# produced. sparse_threshold=0 forces a dense array, replacing the former
# `.toarray()` call.
oneHotEncoderX = ColumnTransformer(
    transformers=[("oneHot", OneHotEncoder(), encodingColumns)],
    remainder="passthrough",
    sparse_threshold=0,
)

# `x` is an object array at this point, so the stacked result is cast to float
# to get a purely numeric matrix.
x = oneHotEncoderX.fit_transform(x).astype(float)

x[0:5]

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 5.0400000e+02, 3.4000000e+01, 0.0000000e+00,
        5.4980810e+04, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.3690988e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 6.5900000e+02, 3.3000000e+01, 7.0000000e+00,
        8.9939620e+04, 1.0000000e+00, 1.0000000e+00, 0.0000000e+00,
        1.3654009e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 4.7300000e+02, 3.2000000e+01, 5.0000000e+00,
        1.4660225e+05, 2.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        7.2946950e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 6.3700000e+02, 4.0000000e+01, 6.0000000e+00,
        0.0000000e+00, 2.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        1.8161060e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
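        1.0000000e+00, 6.5700000e+02, 7.5000000e+01, 7.0000000e+00,
        1.2627395e+05, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        9.1673600e+04]])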

In [4]:
# Dummy variable trap recovery
# Drops columns 0 and 3 of the one-hot encoded matrix, i.e. the first
# indicator column of each encoded group (the 3-level group starts at
# column 0, the 2-level group at column 3).
# NOTE: this rebinds `x` from itself, so running the cell a second time
# removes two further columns.
x = np.delete(x, [0, 3], axis=1)

# Show the first five rows after the removal.
x[:5]

array([[0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 5.0400000e+02,
        3.4000000e+01, 0.0000000e+00, 5.4980810e+04, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.3690988e+05],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.5900000e+02,
        3.3000000e+01, 7.0000000e+00, 8.9939620e+04, 1.0000000e+00,
        1.0000000e+00, 0.0000000e+00, 1.3654009e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.7300000e+02,
        3.2000000e+01, 5.0000000e+00, 1.4660225e+05, 2.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 7.2946950e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 6.3700000e+02,
        4.0000000e+01, 6.0000000e+00, 0.0000000e+00, 2.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.8161060e+05],
       [0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 6.5700000e+02,
        7.5000000e+01, 7.0000000e+00, 1.2627395e+05, 1.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 9.1673600e+04]])

In [5]:
# Dataset split
# Holds out 20% of the rows as a test set; the fixed random_state makes the
# shuffle repeatable across runs.

from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=12345,
)

# Show the first five training rows.
xTrain[:5]

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.9200000e+02,
        6.6000000e+01, 5.0000000e+00, 1.4995019e+05, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 7.6267590e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 5.5400000e+02,
        4.0000000e+01, 4.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 1.6878004e+05],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.8000000e+02,
        2.5000000e+01, 4.0000000e+00, 1.2381650e+05, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 9.0162350e+04],
       [0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 4.7900000e+02,
        3.5000000e+01, 4.0000000e+00, 1.2592098e+05, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 2.0393440e+04],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.8500000e+02,
        3.5000000e+01, 2.0000000e+00, 0.0000000e+00, 2.0000000e+00,
        1.0000000e+00, 0.0000000e+00, 9.8621040e+04]])

In [8]:
# Feature scaling (standardization)
# Standardizes columns 3..10 of both splits; columns 0..2 are left as they are.

from sklearn.preprocessing import StandardScaler

scalingColumns = [*range(3, 11)]

# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not contribute to the scaling statistics.
scalerX = StandardScaler()
scalerX.fit(xTrain[:, scalingColumns].astype(float))

xTrain[:, scalingColumns] = scalerX.transform(xTrain[:, scalingColumns].astype(float))
xTest[:, scalingColumns] = scalerX.transform(xTest[:, scalingColumns].astype(float))

# Show the first five scaled training rows.
xTrain[:5]

array([[ 0.        ,  0.        ,  0.        , -0.60456728,  2.55988554,
        -0.00462039,  1.17267897, -0.90802098,  0.6479493 ,  0.96881921,
        -0.4184835 ],
       [ 0.        ,  0.        ,  1.        , -0.99761731,  0.09332607,
        -0.35114978, -1.23869855, -0.90802098, -1.54333063,  0.96881921,
         1.18923506],
       [ 0.        ,  0.        ,  0.        ,  0.30565387, -1.329689  ,
        -0.35114978,  0.75241813, -0.90802098,  0.6479493 ,  0.96881921,
        -0.17701477],
       [ 0.        ,  1.        ,  1.        , -1.77337397, -0.38101229,
        -0.35114978,  0.78626067, -0.90802098,  0.6479493 ,  0.96881921,
        -1.38948695],
       [ 0.        ,  0.        ,  0.        , -0.67697123, -0.38101229,
        -1.04420856, -1.23869855,  0.81243982,  0.6479493 , -1.03218432,
        -0.03001626]])