# Data Preprocessing Tools

## Importing the libraries

In [42]:
import pandas as pd

## Importing the dataset

In [43]:
df = pd.read_csv('Data.csv')

In [44]:
# Get the dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(10, 4)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [46]:
# By default, df.describe() only generates descriptive statistics for numeric columns
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [47]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [48]:
# Split the frame by position: the last column is the target to predict,
# and every column before it is a feature.
last_col = df.shape[1] - 1
# Feature matrix (all columns except the last), as a NumPy array
X = df.iloc[:, :last_col].values
# Target vector (the last column only), as a NumPy array
y = df.iloc[:, last_col].values

In [49]:
X.shape

(10, 3)

In [50]:
y.shape

(10,)

In [51]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [52]:
# see if any columns are missing data
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [53]:
from sklearn.impute import SimpleImputer
import numpy as np
# Replace every NaN in columns 1 and 2 of X (Age and Salary) with the mean
# of that column. Column 0 (Country) is text and is left untouched.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and fill the gaps in a single step
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [54]:
# Display X to confirm the missing Age and Salary entries were filled in
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding categorical data

### Encoding the Independent Variable

In [55]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [56]:
# One-hot encode column 0 of X (Country); remainder='passthrough' keeps the
# other columns in the result instead of dropping them
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# Fit the transformer and replace X with the transformed array
X = ct.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Dependent Variable

In [57]:
from sklearn.preprocessing import LabelEncoder
# Convert the string class labels in y into integer codes
le = LabelEncoder()
# Fit the encoder on y and replace y with its encoded version
y = le.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [58]:
from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.2, random_state=1):
    """Split features and labels into training and test partitions.

    Thin wrapper around ``train_test_split`` that fixes this notebook's
    default split size and seed.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values, aligned row-for-row with ``X``.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    random_state : int, default 1
        Forwarded unchanged to ``train_test_split`` as ``random_state`` so
        that repeated runs produce the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(
        train_test_split(X, y, test_size=test_size, random_state=random_state)
    )

X_train, X_test, y_train, y_test = split_data(X, y)

In [59]:
# Show each partition produced by the train/test split
print('X_train:\n', X_train)
print('X_test:\n', X_test)
print('y_train:\n', y_train)
print('y_test:\n', y_test)

X_train:
 [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
X_test:
 [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
y_train:
 [0 1 0 0 1 1 0 1]
y_test:
 [0 1]


## Feature Scaling

In [63]:
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test, numeric_start=3):
    """Standardize the numeric feature columns of the training and test sets.

    A single ``StandardScaler`` is fitted on the training rows only and then
    applied to the test rows, so no statistics from the test set influence
    the scaling.

    Both arrays are modified in place: columns from ``numeric_start`` onward
    are overwritten with their scaled values, while the columns before it are
    left untouched.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training feature matrix; modified in place.
    X_test : numpy.ndarray
        Test feature matrix; modified in place.
    numeric_start : int, default 3
        Index of the first column to scale. The default of 3 skips the three
        leading one-hot columns produced earlier in this notebook; pass a
        different value when the number of leading columns differs.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` -- the same array objects that were passed in,
        returned for convenience so callers need not rely on the side effect.
    """
    scaler = StandardScaler()
    # Fit on the training data only, then reuse those statistics for the test data
    X_train[:, numeric_start:] = scaler.fit_transform(X_train[:, numeric_start:])
    X_test[:, numeric_start:] = scaler.transform(X_test[:, numeric_start:])
    return X_train, X_test

scale_features(X_train, X_test)
print('Scaled X_train:\n', X_train, '\n\n Scaled X_test:\n', X_test)

Scaled X_train:
 [[0.0 0.0 1.0 -0.1915918438457855 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057819 -0.07013167641635407]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.30453019390224867 -0.307866172742979]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]] 

 Scaled X_test:
 [[0.0 1.0 0.0 -1.4661817944830127 -0.9069571034860731]
 [1.0 0.0 0.0 -0.44973664397484414 0.20564033932253023]]
