In [None]:
# Dataset: Data.csv

**Step 1: Importing the libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Step 2: Importing the dataset**

In [None]:
# Read the raw CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='/content/Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [None]:
# Per-column count of missing values; Age and Salary each have one.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
# Column means, used in the next cell to fill the missing Age and Salary values.
age_mean, sal_mean = (df[column].mean() for column in ('Age', 'Salary'))

In [None]:
# Impute the missing Age and Salary entries with their rounded column means.
# The filled Series is assigned back to its column rather than calling
# `df['col'].fillna(..., inplace=True)`: that form mutates an intermediate
# Series (chained assignment), emits a warning in recent pandas 2.x releases,
# and leaves `df` unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(round(age_mean))
df['Salary'] = df['Salary'].fillna(round(sal_mean))
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [None]:
# One-hot encode Country. drop_first=True omits the first category's column,
# which then acts as the implicit baseline.
# dtype=int keeps the indicator columns as 0/1 integers; pandas >= 2.0 would
# otherwise return boolean (True/False) columns.
countries = pd.get_dummies(df['Country'], drop_first=True, dtype=int)
df1 = pd.concat([df, countries], axis=1)
df1

Unnamed: 0,Country,Age,Salary,Purchased,Germany,Spain
0,France,44.0,72000.0,No,0,0
1,Spain,27.0,48000.0,Yes,0,1
2,Germany,30.0,54000.0,No,1,0
3,Spain,38.0,61000.0,No,0,1
4,Germany,40.0,63778.0,Yes,1,0
5,France,35.0,58000.0,Yes,0,0
6,Spain,39.0,52000.0,No,0,1
7,France,48.0,79000.0,Yes,0,0
8,Germany,50.0,83000.0,No,1,0
9,France,37.0,67000.0,Yes,0,0


In [None]:
# Remove the original text column now that its dummy columns are in place.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
df1 = df1.drop(columns='Country', errors='ignore')

**Step 5: Encoding the target variable (Purchased)**

In [None]:
# Binary-encode the target: 'Yes' -> 1, 'No' -> 0.
# The labels are read from `df`, not `df1`.
purchased_codes = {'Yes': 1, 'No': 0}
df1['Purchased'] = df['Purchased'].map(purchased_codes)
df1

Unnamed: 0,Age,Salary,Purchased,Germany,Spain
0,44.0,72000.0,0,0,0
1,27.0,48000.0,1,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63778.0,1,1,0
5,35.0,58000.0,1,0,0
6,39.0,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing; everything except 'Purchased' is a
# feature. A fixed random_state makes the shuffle, and therefore every
# downstream result, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop('Purchased', axis=1),
    df1['Purchased'],
    test_size=0.1,
    random_state=42,
)


In [None]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,Age,Salary,Germany,Spain
0,44.0,72000.0,0,0
8,50.0,83000.0,1,0
7,48.0,79000.0,0,0
9,37.0,67000.0,0,0
4,40.0,63778.0,1,0
2,30.0,54000.0,1,0
1,27.0,48000.0,0,1
6,39.0,52000.0,0,1
5,35.0,58000.0,0,0


In [None]:
# Inspect the held-out test features produced by the split.
X_test

Unnamed: 0,Age,Salary,Germany,Spain
3,38.0,61000.0,0,1


**Step 7: Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance, fitted on the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then reuse the fitted scaler
# to transform the held-out features.
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [None]:
# Inspect the scaled training features.
X_scaled_train

array([[ 0.70511024,  0.68677143, -0.70710678, -0.53452248],
       [ 1.53284835,  1.64139742,  1.41421356, -0.53452248],
       [ 1.25693565,  1.2942607 , -0.70710678, -0.53452248],
       [-0.26058422,  0.25285053, -0.70710678, -0.53452248],
       [ 0.15328483, -0.0267681 ,  1.41421356, -0.53452248],
       [-1.22627868, -0.87534382,  1.41421356, -0.53452248],
       [-1.64014773, -1.3960489 , -0.70710678,  1.87082869],
       [ 0.01532848, -1.04891218, -0.70710678,  1.87082869],
       [-0.53649692, -0.52820709, -0.70710678, -0.53452248]])

In [None]:
# Inspect the scaled test features (transformed with the scaler fitted on X_train).
X_scaled_test

array([[-0.12262787, -0.26785455, -0.70710678,  1.87082869]])