## Install packages

In [142]:
# install.packages('caTools', repos = 'https://cran.r-project.org')

## Importing libraries

In [143]:
library(caTools)  # for splitting data

## Import the dataset

In [144]:
dataset <- read.csv("data/Data.csv")
# dataset <- dataset[.2:3]

In [145]:
dataset

Country,Age,Salary,Purchased
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


## Missing data

In [146]:
sum(is.na(dataset))

In [147]:
dataset$Age <- ifelse(is.na(dataset$Age), ave(dataset$Age, FUN = function(x) mean(x, 
    na.rm = TRUE)), dataset$Age)

In [148]:
dataset$Salary <- ifelse(is.na(dataset$Salary), ave(dataset$Salary, FUN = function(x) mean(x, 
    na.rm = TRUE)), dataset$Salary)

In [149]:
dataset

Country,Age,Salary,Purchased
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,63777.78,Yes
France,35.0,58000.0,Yes
Spain,38.77778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


## Encoding Categorical data

In [150]:
dataset$Country <- factor(dataset$Country, levels = c("France", "Spain", "Germany"), 
    labels = c(1, 2, 3))

In [151]:
dataset

Country,Age,Salary,Purchased
1,44.0,72000.0,No
2,27.0,48000.0,Yes
3,30.0,54000.0,No
2,38.0,61000.0,No
3,40.0,63777.78,Yes
1,35.0,58000.0,Yes
2,38.77778,52000.0,No
1,48.0,79000.0,Yes
3,50.0,83000.0,No
1,37.0,67000.0,Yes


In [152]:
dataset$Purchased <- factor(dataset$Purchased, levels = c("No", "Yes"), labels = c(0, 
    1))

In [153]:
dataset

Country,Age,Salary,Purchased
1,44.0,72000.0,0
2,27.0,48000.0,1
3,30.0,54000.0,0
2,38.0,61000.0,0
3,40.0,63777.78,1
1,35.0,58000.0,1
2,38.77778,52000.0,0
1,48.0,79000.0,1
3,50.0,83000.0,0
1,37.0,67000.0,1


## Split dataset into train and test

In [154]:
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)

In [155]:
split

In [156]:
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

In [157]:
training_set
test_set

Unnamed: 0,Country,Age,Salary,Purchased
1,1,44.0,72000.0,0
2,2,27.0,48000.0,1
3,3,30.0,54000.0,0
4,2,38.0,61000.0,0
5,3,40.0,63777.78,1
7,2,38.77778,52000.0,0
8,1,48.0,79000.0,1
10,1,37.0,67000.0,1


Unnamed: 0,Country,Age,Salary,Purchased
6,1,35,58000,1
9,3,50,83000,0


## Feature scaling

**euclidean distance**  
![Imgur](https://i.imgur.com/ro1YKt6.png?1)

![Imgur](https://i.imgur.com/ERnKcCz.png?1)

In [158]:
training_set[, 2:3] <- scale(training_set[, 2:3])
test_set[, 2:3] <- scale(test_set[, 2:3])

In [159]:
training_set
test_set

Unnamed: 0,Country,Age,Salary,Purchased
1,1,0.90101716,0.9392746,0
2,2,-1.58847494,-1.337116,1
3,3,-1.14915281,-0.7680183,0
4,2,0.02237289,-0.1040711,0
5,3,0.31525431,0.1594,1
7,2,0.13627122,-0.9577176,0
8,1,1.48678,1.6032218,1
10,1,-0.12406783,0.4650265,1


Unnamed: 0,Country,Age,Salary,Purchased
6,1,-0.7071068,-0.7071068,1
9,3,0.7071068,0.7071068,0
