# Data preprocessing in Python

This notebook walks through the following preprocessing steps:

- Importing the libraries
- Importing the Dataset
- Handling of Missing Data
- Handling of Categorical Data
- Splitting the dataset into training and testing datasets
- Feature Scaling

In [1]:
# Importing the libraries
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.impute import SimpleImputer

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Importing the Dataset
# The path is relative, so the CSV is resolved against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="DataPreprocessing.csv")

In [8]:
df

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


In [9]:
# Feature matrix: every column except the last one, as a NumPy array.
X = df.iloc[:, :-1].to_numpy()

In [10]:
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, nan],
       ['India', 40.0, 69600.0],
       ['Brazil', nan, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [11]:
# Target vector: the last column only, as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [12]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [13]:
# Handling of Missing Data
# Build an imputer that treats NaN as "missing" and fills it with the column mean.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

In [14]:
# Fit the imputer on every column after the first (column 0 is skipped),
# so it learns one fill value per fitted column.
imputer = imputer.fit(X[:, 1:])

In [15]:
# Fill the missing entries in the fitted columns and write the result back
# into X in place; column 0 is left untouched.
X[:, 1:] = imputer.transform(X[:, 1:])

In [16]:
X[:,1:]

array([[49.0, 86400.0],
       [32.0, 57600.0],
       [35.0, 64800.0],
       [43.0, 73200.0],
       [45.0, 76533.33333333333],
       [40.0, 69600.0],
       [43.77777777777778, 62400.0],
       [53.0, 94800.0],
       [55.0, 99600.0],
       [42.0, 80400.0]], dtype=object)

In [18]:
# Handling of Categorical Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [19]:
# Encoder used to turn the values in column 0 of X into integer codes.
labelencoder_X = LabelEncoder()

In [20]:
labelencoder_X

LabelEncoder()

In [21]:
# Learn the distinct labels in column 0, then overwrite that column in place
# with their integer codes.
labelencoder_X.fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [22]:
X[:,0]

array([1, 0, 2, 0, 2, 1, 0, 1, 2, 1], dtype=object)

In [24]:
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass every other column through unchanged.
# The encoded indicator columns come first in the result, followed by the
# passthrough columns.
ct = ColumnTransformer(
    transformers=[("Country", OneHotEncoder(), [0])],
    remainder="passthrough",
)
X = ct.fit_transform(X)

In [25]:
ct

ColumnTransformer(remainder='passthrough',
                  transformers=[('Country', OneHotEncoder(), [0])])

In [26]:
X

array([[0.0, 1.0, 0.0, 49.0, 86400.0],
       [1.0, 0.0, 0.0, 32.0, 57600.0],
       [0.0, 0.0, 1.0, 35.0, 64800.0],
       [1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 40.0, 69600.0],
       [1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 53.0, 94800.0],
       [0.0, 0.0, 1.0, 55.0, 99600.0],
       [0.0, 1.0, 0.0, 42.0, 80400.0]], dtype=object)

In [27]:
# A second, independent LabelEncoder instance; judging by its name it is
# presumably intended for the target column rather than the features.
labelencoder_Y = LabelEncoder()

In [28]:
labelencoder_Y

LabelEncoder()

In [32]:
Y = ct.fit_transform(Y)

In [33]:
Y

array([[0.0, 1.0, 0.0, 1.0, 0.0, 49.0, 86400.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 32.0, 57600.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 35.0, 64800.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 69600.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 53.0, 94800.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 55.0, 99600.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 42.0, 80400.0]], dtype=object)

In [34]:
# Splitting the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [37]:
X_train

array([[0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 42.0, 80400.0],
       [1.0, 0.0, 0.0, 32.0, 57600.0],
       [1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 53.0, 94800.0],
       [1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 1.0, 0.0, 49.0, 86400.0],
       [0.0, 1.0, 0.0, 40.0, 69600.0]], dtype=object)

In [36]:
X_test

array([[0.0, 0.0, 1.0, 35.0, 64800.0],
       [0.0, 0.0, 1.0, 55.0, 99600.0]], dtype=object)

In [38]:
Y_train

array([[0.0, 1.0, 0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 1.0, 0.0, 42.0, 80400.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 32.0, 57600.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 53.0, 94800.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 49.0, 86400.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 69600.0]], dtype=object)

In [39]:
Y_test

array([[0.0, 1.0, 0.0, 0.0, 1.0, 35.0, 64800.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 55.0, 99600.0]], dtype=object)

In [40]:
# Feature Scaling
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both the training and the test split.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [41]:
sc_X

StandardScaler()

In [42]:
X_train

array([[-0.77459667, -1.        ,  2.64575131,  0.26306757,  0.12381479],
       [-0.77459667,  1.        , -0.37796447, -0.25350148,  0.46175632],
       [ 1.29099445, -1.        , -0.37796447, -1.97539832, -1.53093341],
       [ 1.29099445, -1.        , -0.37796447,  0.05261351, -1.11141978],
       [-0.77459667,  1.        , -0.37796447,  1.64058505,  1.7202972 ],
       [ 1.29099445, -1.        , -0.37796447, -0.0813118 , -0.16751412],
       [-0.77459667,  1.        , -0.37796447,  0.95182631,  0.98614835],
       [-0.77459667,  1.        , -0.37796447, -0.59788085, -0.48214934]])

In [43]:
X_test

array([[-0.77459667, -1.        ,  2.64575131, -1.45882927, -0.90166297],
       [-0.77459667, -1.        ,  2.64575131,  1.98496442,  2.13981082]])