### 6. Machine Learning data review

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
raw_boston=datasets.load_boston()
X_boston=pd.DataFrame(raw_boston.data)
y_boston=pd.DataFrame(raw_boston.target)
df_boston=pd.concat([X_boston, y_boston], axis=1)
len(df_boston)
df_boston.head()
feature_boston=raw_boston.feature_names
print(feature_boston)
col_boston=np.append(feature_boston, ['target'])
df_boston.columns=col_boston
df_boston.head()

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 6.2.1 Missing Value process

In [3]:
import numpy as np
import pandas as pd

df=pd.DataFrame([[42,'male', 12, 'reading', 'class2'], [35, 'unknown', 3, 'cooking', 'class1'], [1000, 'female', 7, 'cycling', 'class3'], [1000, 'unknown', 21, 'unkown', 'unknown']])
df.columns=['age', 'gender', 'month_birth', 'hobby', 'target']
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,class3
3,1000,unknown,21,unkown,unknown


In [4]:
df['age'].unique()

array([  42,   35, 1000])

In [5]:
df.loc[df['age']>150, ['age']]=np.nan
df.loc[df['gender']=='unknown', ['gender']]=np.nan
df.loc[df['month_birth']>12, ['month_birth']]=np.nan
df.loc[df['hobby']=='unkown', ['hobby']]=np.nan
df.loc[df['target']=='unknown', ['target']]=np.nan

In [6]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3
3,,,,,


In [7]:
df.isnull().sum()

age            2
gender         2
month_birth    1
hobby          1
target         1
dtype: int64

In [8]:
df2=df.dropna(axis=0)
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [9]:
df3=df.dropna(axis=1)
df3

0
1
2
3


In [10]:
df4=df.dropna(how='all')
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [11]:
df5=df.dropna(thresh=2)
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [12]:
df6=df.dropna(subset=['gender'])
df6

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
2,,female,7.0,cycling,class3


In [13]:
alter_values= {'age':0, 'gender':'U', 'month_birth':0,'hobby':'U', 'target':'class4'}
df7=df.fillna(value=alter_values)
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [14]:
from sklearn.preprocessing import LabelEncoder
df8=df7
class_label=LabelEncoder()
data_value=df8['target'].values
y_new=class_label.fit_transform(data_value)
y_new

array([1, 0, 2, 3])

In [15]:
df8['target']=y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,1
1,35.0,U,3.0,cooking,0
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [16]:
y_ori=class_label.inverse_transform(y_new)
y_ori
df8['target']=y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [17]:
#Class labeling
y_arr=df8['target'].values
y_arr.sort()
y_arr

array(['class1', 'class2', 'class3', 'class4'], dtype=object)

In [18]:
num_y=0
dic_y={}
for ith_y in y_arr:
    dic_y[ith_y]=num_y
    num_y+=1

In [19]:
dic_y
df8['target']=df8['target'].replace(dic_y)
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,0
1,35.0,U,3.0,cooking,1
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [20]:
df9=df8
df9['target']=df9['target'].astype(str)
df10=pd.get_dummies(df9['target'])
print(df10)

   0  1  2  3
0  1  0  0  0
1  0  1  0  0
2  0  0  1  0
3  0  0  0  1


In [21]:
df9['target']=df9['target'].astype(str)
df11=pd.get_dummies(df9['target'], drop_first=True)
print(df11)

   1  2  3
0  0  0  0
1  1  0  0
2  0  1  0
3  0  0  1


In [22]:
df12=df8
df13=pd.get_dummies(df12)
df13

Unnamed: 0,age,month_birth,gender_U,gender_female,gender_male,hobby_U,hobby_cooking,hobby_cycling,hobby_reading,target_0,target_1,target_2,target_3
0,42.0,12.0,0,0,1,0,0,0,1,1,0,0,0
1,35.0,3.0,1,0,0,0,1,0,0,0,1,0,0
2,0.0,7.0,0,1,0,0,0,1,0,0,0,1,0
3,0.0,0.0,1,0,0,1,0,0,0,0,0,0,1


In [23]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder =OneHotEncoder()
y=df7[['target']]
y_hot=hot_encoder.fit_transform(y)
print(y_hot.toarray())

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [24]:
from tensorflow.keras.utils import to_categorical
y_hotec=to_categorical(y)
print(y_hotec)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [26]:
# Standard Scaling
from sklearn.preprocessing import StandardScaler

std=StandardScaler()
std.fit(df8[['month_birth']])
x_std=std.transform(df8[['month_birth']])

x_std2=std.fit_transform(df8[['month_birth']])
x_std2

array([[ 1.44444444],
       [-0.55555556],
       [ 0.33333333],
       [-1.22222222]])

In [27]:
np.mean(x_std)
np.std(x_std)

1.0

In [28]:
#Robust Scaling
from sklearn.preprocessing import RobustScaler

robust=RobustScaler()
robust.fit(df8[['month_birth']])
x_robust=robust.transform(df8[['month_birth']])
x_robust

array([[ 1.16666667],
       [-0.33333333],
       [ 0.33333333],
       [-0.83333333]])

In [29]:
# min-max scaling
from sklearn.preprocessing import MinMaxScaler

minmax=MinMaxScaler()
minmax.fit(df8[['month_birth']])
x_minmax=minmax.transform(df8[['month_birth']])
x_minmax

array([[1.        ],
       [0.25      ],
       [0.58333333],
       [0.        ]])

In [30]:
#nomal scaling
from sklearn.preprocessing import Normalizer

normal=Normalizer()
normal.fit(df8[['age', 'month_birth']])
x_normal=normal.transform(df8[['age', 'month_birth']])
x_normal


array([[0.96152395, 0.27472113],
       [0.99634665, 0.08540114],
       [0.        , 1.        ],
       [0.        , 0.        ]])

In [31]:
#Standard scaling
from sklearn.preprocessing import StandardScaler

stand_scale=StandardScaler()
x_train_std=stand_scale.fit_transform(x_train)
x_test_std=stand_scale.transform(x_test)

NameError: name 'x_train' is not defined