Install modules

In [25]:
!pip install tensorflow
!pip install tensorflow-gpu
!pip install keras



Import Modules

In [26]:
import tensorflow as tf
import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

Check Versions

In [27]:
# Print the version of each core library, one per line, in import order.
for module in (tf, keras, np, pd):
    print(module.__version__)

2.2.0
2.3.1
1.18.5
1.0.5


Download dataset

In [28]:
!wget -cq https://raw.githubusercontent.com/D-Bhatta/Data-Cleaning-Beginner/master/anne-bonner-tutorial/my_data.csv

Check for GPU

In [29]:
# Ask TensorFlow for the name of the first GPU it can see; the result is an
# empty string when no GPU is available.
device_name = tf.test.gpu_device_name()
if device_name:
    print(f'Found GPU at: {device_name}')
else:
    # Nothing in the visible cells requires a GPU, so report its absence
    # instead of raising.
    print('GPU device not found; continuing on CPU')

Load data

In [30]:
# Read the downloaded CSV into a DataFrame and display it.
dataset = pd.read_csv(filepath_or_buffer='my_data.csv')
dataset

Unnamed: 0,Animal,Age,Worth,Friendly
0,Cat,4.0,72000.0,No
1,Dog,17.0,48000.0,Yes
2,Moose,6.0,54000.0,No
3,Dog,8.0,61000.0,No
4,Moose,4.0,,Yes
5,Cat,15.0,58000.0,Yes
6,Dog,,52000.0,No
7,Cat,12.0,79000.0,Yes
8,Moose,5.0,83000.0,No
9,Cat,7.0,67000.0,Yes


Create independent dataset partition

In [31]:
# Independent variables: every column except the last, as a NumPy array.
x = dataset.iloc[:, :-1].to_numpy()
x

array([['Cat', 4.0, 72000.0],
       ['Dog', 17.0, 48000.0],
       ['Moose', 6.0, 54000.0],
       ['Dog', 8.0, 61000.0],
       ['Moose', 4.0, nan],
       ['Cat', 15.0, 58000.0],
       ['Dog', nan, 52000.0],
       ['Cat', 12.0, 79000.0],
       ['Moose', 5.0, 83000.0],
       ['Cat', 7.0, 67000.0]], dtype=object)

Create dependent dataset partition

In [32]:
# Dependent variable: the last column only, as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Replace missing data with SimpleImputer

In [33]:
# Build the imputer and fit it on columns 1-2 of x (Age and Worth) in one step.
# NOTE(review): every Worth value in this dataset is unique, so 'most_frequent'
# just picks one of the tied values (48000.0 in the recorded output) - confirm
# this is intended rather than 'mean' or 'median'.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(x[:, 1:3])

Apply imputer transform to column of data

In [34]:
# Fill the gaps in the two numeric columns, then write the result back into x in place.
filled_numeric = imputer.transform(x[:, 1:3])
x[:, 1:3] = filled_numeric
x

array([['Cat', 4.0, 72000.0],
       ['Dog', 17.0, 48000.0],
       ['Moose', 6.0, 54000.0],
       ['Dog', 8.0, 61000.0],
       ['Moose', 4.0, 48000.0],
       ['Cat', 15.0, 58000.0],
       ['Dog', 4.0, 52000.0],
       ['Cat', 12.0, 79000.0],
       ['Moose', 5.0, 83000.0],
       ['Cat', 7.0, 67000.0]], dtype=object)

Encode categorical data as integer labels

In [35]:
# Map each distinct value in column 0 to an integer code and overwrite that column in place.
labelencoder_x = LabelEncoder()
labelencoder_x.fit(x[:, 0])
x[:, 0] = labelencoder_x.transform(x[:, 0])
x

array([[0, 4.0, 72000.0],
       [1, 17.0, 48000.0],
       [2, 6.0, 54000.0],
       [1, 8.0, 61000.0],
       [2, 4.0, 48000.0],
       [0, 15.0, 58000.0],
       [1, 4.0, 52000.0],
       [0, 12.0, 79000.0],
       [2, 5.0, 83000.0],
       [0, 7.0, 67000.0]], dtype=object)

Collect column of labelled categorical data

In [36]:
# Take column 0 (the integer animal codes) on its own so it can be encoded separately.
x_labeled = x[:, 0]
x_labeled

array([0, 1, 2, 1, 2, 0, 1, 0, 2, 0], dtype=object)

Encode the column of integer labels into multiple columns of binary (one-hot) data

In [37]:
# One-hot encoder; handle_unknown='ignore' so categories unseen during fit do not raise later.
onehotencoder_x = OneHotEncoder(handle_unknown='ignore')
# Turn the 1-D label column into a single-column 2-D array of shape (n, 1).
x_labeled = x_labeled.reshape(-1, 1)
# Fit and encode in one step, then convert the sparse result to a dense array.
x_encoded = onehotencoder_x.fit_transform(x_labeled).toarray()
x_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

Concatenate the encoded columns onto x

In [38]:
# Replace the integer label column (column 0) with the one-hot columns by
# stacking them in front of the remaining columns.
# NOTE: x is rebuilt from itself, so re-running this cell on its own keeps
# growing x; re-run from the cell that creates x instead.
x = np.hstack((x_encoded, x[:, 1:]))
x

array([[1.0, 0.0, 0.0, 4.0, 72000.0],
       [0.0, 1.0, 0.0, 17.0, 48000.0],
       [0.0, 0.0, 1.0, 6.0, 54000.0],
       [0.0, 1.0, 0.0, 8.0, 61000.0],
       [0.0, 0.0, 1.0, 4.0, 48000.0],
       [1.0, 0.0, 0.0, 15.0, 58000.0],
       [0.0, 1.0, 0.0, 4.0, 52000.0],
       [1.0, 0.0, 0.0, 12.0, 79000.0],
       [0.0, 0.0, 1.0, 5.0, 83000.0],
       [1.0, 0.0, 0.0, 7.0, 67000.0]], dtype=object)

Create scaler object

In [39]:
# Standardising scaler, with scikit-learn's default settings spelled out.
sc_x = StandardScaler(copy=True, with_mean=True, with_std=True)

Select the columns to scale into a temporary variable

In [40]:
# Columns 3 onward (the two numeric features) are the ones to scale; this is a slice of x, not a copy.
x_sc = x[:, 3:]
x_sc

array([[4.0, 72000.0],
       [17.0, 48000.0],
       [6.0, 54000.0],
       [8.0, 61000.0],
       [4.0, 48000.0],
       [15.0, 58000.0],
       [4.0, 52000.0],
       [12.0, 79000.0],
       [5.0, 83000.0],
       [7.0, 67000.0]], dtype=object)

Apply the scaler

In [41]:
# Fit the scaler on the numeric columns, then transform those same columns.
x_sc = sc_x.fit(x_sc).transform(x_sc)
x_sc

array([[-0.92179769,  0.82020574],
       [ 1.93138564, -1.18846138],
       [-0.48284641, -0.6862946 ],
       [-0.04389513, -0.10043336],
       [-0.92179769, -1.18846138],
       [ 1.49243436, -0.35151675],
       [-0.92179769, -0.85368353],
       [ 0.83400743,  1.40606699],
       [-0.70232205,  1.74084484],
       [-0.26337077,  0.40173343]])

Concatenate scaled data to rest of the dataset

In [42]:
# Reassemble the feature matrix: the three one-hot columns followed by the scaled numeric columns.
x = np.hstack((x[:, :3], x_sc))
x

array([[1.0, 0.0, 0.0, -0.9217976907429086, 0.8202057433801607],
       [0.0, 1.0, 0.0, 1.931385637747047, -1.188461383265131],
       [0.0, 0.0, 1.0, -0.4828464094367616, -0.686294601603808],
       [0.0, 1.0, 0.0, -0.043895128130614545, -0.10043335633226458],
       [0.0, 0.0, 1.0, -0.9217976907429086, -1.188461383265131],
       [1.0, 0.0, 0.0, 1.4924343564409002, -0.351516747162926],
       [0.0, 1.0, 0.0, -0.9217976907429086, -0.8536835288242489],
       [1.0, 0.0, 0.0, 0.8340074344816796, 1.406066988651704],
       [0.0, 0.0, 1.0, -0.7023220500898352, 1.740844843092586],
       [1.0, 0.0, 0.0, -0.2633707687836881, 0.4017334253290583]],
      dtype=object)