In [1]:
# Importing datasets
import pandas as pd  # to import datasets and manage them
import numpy as np   # for mathematical operations
import matplotlib.pyplot as plt # for plotting

In [28]:
# Importing the datasets
data = pd.read_csv('Data.csv')
X = data.iloc[:,:-1].values # create array of features
Y = data.iloc[:,-1].values  # prediction 

In [3]:
# Dealing with missing data
# remove the observations can be dangerous if it contains important information.
# Replace the missing data with mean of the data
# Impute means to infer them from the known parts of data.

# from sklearn.preprocessing import Imputer # import Imputer class from the sklearn.preprocessing library 
# imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
# imputer = imputer.fit(X[:, 1:3])
# X[:, 1:3] = imputer.transform(X[:, 1:3])
# print(X)

In [29]:
from sklearn.impute import SimpleImputer
# skl_mean = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 1)
skl_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean')
skl_mean.fit(X[:,1:3])
X[:,1:3] = skl_mean.transform(X[:,1:3])
print(X)
print(Y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [5]:
# Encoding categorical variable
# Deprecated
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# labelencoder = LabelEncoder()
# X[:,0] = labelencoder.fit_transform(X[:,0])
# print(X)
# onehotencoder = OneHotEncoder(categorical_features = [0])
# X = onehotencoder.fit_transform(X).toarray()
# print(X)

In [30]:
# Encoding categorical variable
# https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
# columntransformer = ColumnTransformer(["encode",OneHotEncoder(categories = ['France', 'Spain', 'Germany']),[0,1,2]])
# columntransformer.fit_transform(X[:,0])
# print(X)
# onehotencoder = OneHotEncoder()
# inputs should be transformer, columns 
columntransformer = make_column_transformer((OneHotEncoder(),[0]), (Normalizer(norm='l1'), [1,2]))
X = columntransformer.fit_transform(X)
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 6.10737882e-04
  9.99389262e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 5.62183772e-04
  9.99437816e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.55247085e-04
  9.99444753e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 6.22562994e-04
  9.99377437e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.26784595e-04
  9.99373215e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 6.03084346e-04
  9.99396916e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 7.45170802e-04
  9.99254829e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 6.07225989e-04
  9.99392774e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.02046960e-04
  9.99397953e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 5.51934007e-04
  9.99448066e-01]]


In [31]:
labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(Y)
print(Y)

[0 1 0 0 1 1 0 1 0 1]


In [32]:
# Splitting the dataset into the training set and test set
print(X)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state = 42) # if you don't specify random_state, selection of test set is done at random.
print(X_train)
print(Y_train)
print(X_test)
print(Y_test)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 6.10737882e-04
  9.99389262e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 5.62183772e-04
  9.99437816e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.55247085e-04
  9.99444753e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 6.22562994e-04
  9.99377437e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.26784595e-04
  9.99373215e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 6.03084346e-04
  9.99396916e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 7.45170802e-04
  9.99254829e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 6.07225989e-04
  9.99392774e-01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 6.02046960e-04
  9.99397953e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 5.51934007e-04
  9.99448066e-01]]
[[1.00000000e+00 0.00000000e+00 0.00000000e+00 6.10737882e-04
  9.99389262e-01]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 6.07225989e-04
  9.99392774e-01]
 [0.00000000e+00 1.00000000e+00 0.00000

In [46]:
# "normalization" typically means that the range of values are "normalized to be from 0.0 to 1.0". "Standardization" typically means that the range of values are "standardized" to measure how many standard deviations the value is from its mean.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# print(X_train)
# print(X_test)

from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler()
X_train_1 = transformer.fit_transform(X_train)
print('transformed x_train: ',X_train_1)
X_test_1 = transformer.transform(X_test)
print(X_test_1)


transformed x_train:  [[1.         0.         0.         0.30430993 0.69569007]
 [1.         0.         0.         0.28613589 0.71386411]
 [0.         1.         0.         0.01714517 0.98285483]
 [1.         0.         0.         0.         1.        ]
 [0.         1.         0.         0.38735163 0.61264837]
 [0.         0.         1.         0.36550486 0.63449514]
 [0.         0.         1.         1.         0.        ]]
[[0.         1.         0.         0.25933442 0.74066558]
 [0.         0.         1.         0.05304251 0.94695749]
 [1.         0.         0.         0.26470289 0.73529711]]
