In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [86]:
# Load the raw dataset from the CSV file and display the whole frame.
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [87]:
# Count rows that are exact repeats of an earlier row
# (duplicated() leaves the first occurrence unmarked by default).
df.duplicated().sum()

1

In [88]:
# Keep only the first occurrence of each repeated row, then show the frame.
df = df.drop_duplicates()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [89]:
# Number of missing values in each column.
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [90]:
# Option 1: replace every NaN with the constant 1.
# The result is only displayed, not assigned, so df itself still contains its NaNs.
df.fillna(1)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,1.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,1.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [91]:
# Option 2: drop every row that contains at least one NaN.
# As above, the result is only displayed; df is left unchanged.
df.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [92]:
# Column means used to fill the gaps; Series.mean() skips NaN by default.
avg_age, avg_salary = df['Age'].mean(), df['Salary'].mean()
print(avg_age, avg_salary, sep='\n')

38.77777777777778
63777.77777777778


In [93]:
# Option 3: mean-impute the missing Age and Salary values.
# The filled column is assigned back to df rather than calling
# replace(..., inplace=True) on df['col']: that form mutates an intermediate
# Series, triggers a chained-assignment warning in pandas 2.x and leaves df
# untouched once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(avg_age)
df['Salary'] = df['Salary'].fillna(avg_salary)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Using scikit-learn

In [94]:
# Start again from the raw file for the scikit-learn workflow.
df2 = pd.read_csv('Data.csv').drop_duplicates()
# Features: every column except the last. Target: the last column.
X = df2.iloc[:, :-1].to_numpy()
Y = df2.iloc[:, -1].to_numpy()

In [95]:
from sklearn.impute import SimpleImputer
# Mean-impute the NaNs in columns 1 and 2 of X (column 0 is left untouched).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform() learns the column means and applies them in one step, so a
# separate fit() call on the same slice beforehand is redundant.
X[:, 1:3] = imp.fit_transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [96]:
from sklearn.preprocessing import StandardScaler
# Standardise every column after column 0 to zero mean and unit variance.
sc = StandardScaler()
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# further down, so statistics from the held-out rows leak into the training
# features. Consider fitting on the training rows only and transforming the test rows.
X[:, 1:] = sc.fit_transform(X[:, 1:])
X

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [97]:
# Sanity check: variance of scaled column 1 (numpy's var defaults to ddof=0); expected to be ~1.
X[:,1].var()

1.0

In [98]:
# Sanity check: variance of scaled column 2 (numpy's var defaults to ddof=0); expected to be ~1.
X[:,2].var()

1.0000000000000002

In [99]:
# One-hot encode Country with pandas: one indicator column per distinct value.
dummy1 = pd.get_dummies(df['Country'])
dummy1

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [100]:
# Prepend the one-hot Country columns to the frame.
# Dummy columns left in df by an earlier run of this cell are dropped first, so
# re-running the cell does not stack duplicate France/Germany/Spain columns.
# On a first run the drop is a no-op (errors='ignore').
df = pd.concat([dummy1, df.drop(columns=dummy1.columns, errors='ignore')], axis=1)
df

Unnamed: 0,France,Germany,Spain,Country,Age,Salary,Purchased
0,1,0,0,France,44.0,72000.0,No
1,0,0,1,Spain,27.0,48000.0,Yes
2,0,1,0,Germany,30.0,54000.0,No
3,0,0,1,Spain,38.0,61000.0,No
4,0,1,0,Germany,40.0,63777.777778,Yes
5,1,0,0,France,35.0,58000.0,Yes
6,0,0,1,Spain,38.777778,52000.0,No
7,1,0,0,France,48.0,79000.0,Yes
8,0,1,0,Germany,50.0,83000.0,No
9,1,0,0,France,37.0,67000.0,Yes


In [101]:
# One-hot encode the Purchased column: one indicator column per distinct label.
dummy2 = pd.get_dummies(df['Purchased'])
dummy2

Unnamed: 0,No,Yes
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1
5,0,1
6,1,0
7,0,1
8,1,0
9,0,1


In [102]:
# Show the current feature matrix (column 0 still holds the country strings).
X

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [103]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 and pass the remaining columns through unchanged;
# the encoded indicator columns come first in the transformed output.
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder = 'passthrough')
# NOTE: X is overwritten with its encoded form, so running this cell a second time
# would one-hot encode the already-encoded matrix. Rebuild X before re-running.
X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 0.0, 0.758874361590019, 0.7494732544921677],
       [0.0, 0.0, 1.0, -1.7115038793306814, -1.4381784072687531],
       [0.0, 1.0, 0.0, -1.2755547779917342, -0.8912654918285229],
       [0.0, 0.0, 1.0, -0.1130238410878753, -0.253200423814921],
       [0.0, 1.0, 0.0, 0.17760889313808945, 6.632191985654332e-16],
       [1.0, 0.0, 0.0, -0.5489729424268225, -0.5266568815350361],
       [0.0, 0.0, 1.0, 0.0, -1.0735697969752662],
       [1.0, 0.0, 0.0, 1.3401398300419485, 1.3875383225057696],
       [0.0, 1.0, 0.0, 1.6307725642679132, 1.7521469327992565],
       [1.0, 0.0, 0.0, -0.2583402082008577, 0.29371249162530916]],
      dtype=object)

In [105]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers; LabelEncoder sorts its classes,
# so 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
Y = le.fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [109]:
# Integer-encode Country for comparison with the one-hot encoding.
# A dedicated encoder is used here: refitting `le` would overwrite the classes it
# learned from the target labels, so it could no longer decode Purchased values.
country_le = LabelEncoder()
z = country_le.fit_transform(df['Country'].values)
z

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [110]:
# Plain-text dump of the final feature matrix followed by the encoded target.
print(X, Y, sep='\n')

[[1.0 0.0 0.0 0.758874361590019 0.7494732544921677]
 [0.0 0.0 1.0 -1.7115038793306814 -1.4381784072687531]
 [0.0 1.0 0.0 -1.2755547779917342 -0.8912654918285229]
 [0.0 0.0 1.0 -0.1130238410878753 -0.253200423814921]
 [0.0 1.0 0.0 0.17760889313808945 6.632191985654332e-16]
 [1.0 0.0 0.0 -0.5489729424268225 -0.5266568815350361]
 [0.0 0.0 1.0 0.0 -1.0735697969752662]
 [1.0 0.0 0.0 1.3401398300419485 1.3875383225057696]
 [0.0 1.0 0.0 1.6307725642679132 1.7521469327992565]
 [1.0 0.0 0.0 -0.2583402082008577 0.29371249162530916]]
[0 1 0 0 1 1 0 1 0 1]


In [112]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [113]:
# Inspect the training feature rows.
X_train

array([[0.0, 1.0, 0.0, 0.17760889313808945, 6.632191985654332e-16],
       [1.0, 0.0, 0.0, -0.2583402082008577, 0.29371249162530916],
       [0.0, 0.0, 1.0, -1.7115038793306814, -1.4381784072687531],
       [0.0, 0.0, 1.0, 0.0, -1.0735697969752662],
       [1.0, 0.0, 0.0, 1.3401398300419485, 1.3875383225057696],
       [0.0, 0.0, 1.0, -0.1130238410878753, -0.253200423814921],
       [1.0, 0.0, 0.0, 0.758874361590019, 0.7494732544921677],
       [1.0, 0.0, 0.0, -0.5489729424268225, -0.5266568815350361]],
      dtype=object)

In [114]:
# Inspect the held-out feature rows.
X_test

array([[0.0, 1.0, 0.0, -1.2755547779917342, -0.8912654918285229],
       [0.0, 1.0, 0.0, 1.6307725642679132, 1.7521469327992565]],
      dtype=object)

In [115]:
# Encoded target values for the training rows.
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [116]:
# Encoded target values for the held-out rows.
y_test

array([0, 0])