In [60]:
import pandas as pd
import numpy as np

df = pd.read_csv('./dataset/dataset/faults.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   X_Minimum              1941 non-null   int64  
 1   X_Maximum              1941 non-null   int64  
 2   Y_Minimum              1941 non-null   int64  
 3   Y_Maximum              1941 non-null   int64  
 4   Pixels_Areas           1941 non-null   int64  
 5   X_Perimeter            1941 non-null   int64  
 6   Y_Perimeter            1941 non-null   int64  
 7   Sum_of_Luminosity      1941 non-null   int64  
 8   Minimum_of_Luminosity  1941 non-null   int64  
 9   Maximum_of_Luminosity  1941 non-null   int64  
 10  Length_of_Conveyer     1941 non-null   int64  
 11  TypeOfSteel_A300       1941 non-null   int64  
 12  TypeOfSteel_A400       1941 non-null   int64  
 13  Steel_Plate_Thickness  1941 non-null   int64  
 14  Edges_Index            1941 non-null   float64
 15  Empt

In [61]:
df_dataset = df[['X_Minimum','X_Maximum','Steel_Plate_Thickness','LogOfAreas',
                 'Pastry','Z_Scratch','Bumps']]
print("df_dataset.shape : ", df_dataset.shape)
print(df_dataset.info())

df_dataset.shape :  (1941, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   X_Minimum              1941 non-null   int64  
 1   X_Maximum              1941 non-null   int64  
 2   Steel_Plate_Thickness  1941 non-null   int64  
 3   LogOfAreas             1941 non-null   float64
 4   Pastry                 1941 non-null   int64  
 5   Z_Scratch              1941 non-null   int64  
 6   Bumps                  1941 non-null   int64  
dtypes: float64(1), int64(6)
memory usage: 106.3 KB
None


In [62]:
df_dataset = np.asarray(df_dataset, dtype='float32')

In [63]:
df_dataset_x = df_dataset[0:1, :-3]
df_dataset_y = df_dataset[0:1, -3:]

print("df_dataset_x : \n", df_dataset_x)
print("df_dataset_x.shape : ", df_dataset_x.shape)
print("df_dataset_y : ", df_dataset_y)
print("df_dataset_y.shape : ", df_dataset_y.shape)

df_dataset_x : 
 [[42.     50.     80.      2.4265]]
df_dataset_x.shape :  (1, 4)
df_dataset_y :  [[1. 0. 0.]]
df_dataset_y.shape :  (1, 3)


용어 재정의
- $\theta_{0}$  : bias
- $\theta_{1}$ : weight



In [64]:
RND_STD = 1
RND_MEAN = 0 

# 독립변수와 종속변수의 개수를 값으로 명시
input_cnt = df_dataset_x.shape[-1]      # 4
output_cnt = df_dataset_y.shape[-1]     # 3
# df_dataset_x.shape  # (1, 4)
# df_dataset_y.shape  # (1, 3)

weight = np.random.normal(RND_MEAN, RND_STD, size = [input_cnt, output_cnt])
bias = np.random.normal(RND_MEAN, RND_STD, size = [output_cnt])

print(f"weight.shape : {weight.shape}")     # (4, 3)
print(f"bias.shape : {bias.shape}")     # (4, 3)

weight.shape : (4, 3)
bias.shape : (3,)


In [65]:
print("df_dataset_x : \n", df_dataset_x)
print("weight : \n", weight)    # 독립변수 네개니까 weight도 네개가 필요하다.
print("bias : \n", bias)

df_dataset_x : 
 [[42.     50.     80.      2.4265]]
weight : 
 [[-0.12642799 -1.00977785 -0.86826677]
 [ 0.39894234  0.68350081 -0.63732268]
 [-0.63891916  1.35129303  1.16548603]
 [ 1.47240778 -0.14597245 -1.32445284]]
bias : 
 [1.51772814 0.14019126 1.21974359]


df_dataset_x의 1행 1열과 weight의 1열을 곱한 후 bias를 더한다.

- 행렬곱을 위한 함수 : np.matmul

In [66]:
# weight[:, 0] == : 로 전체 행을 가져와서 가져온 행의 첫번째 열만 추출하겠다.
P_1 = np.matmul(df_dataset_x[0], weight[:, 0]) + bias[0]
P_2 = np.matmul(df_dataset_x[0], weight[:, 1]) + bias[1]    # 입력값은 하나뿐이므로 바꿀 수 없다. (다른 weight와 연산)
P_3 = np.matmul(df_dataset_x[0], weight[:, 2]) + bias[2]

P_total = np.matmul(df_dataset_x, weight) + bias 

print(P_1, P_2, P_3)
print("=" * 20)
print(P_total)

-31.38586566564822 99.65380226995862 22.9115029918579
[[-31.38586567  99.65380227  22.91150299]]


In [67]:
print(f"df_dataset_x : {df_dataset_x}")     # metrics
print(f"df_dataset_x.shape : {df_dataset_x.shape}\n")

print(f"df_dataset_x[0] : {df_dataset_x[0]}")   # vector
print(f"df_dataset_x[0].shape : {df_dataset_x[0].shape}")

df_dataset_x : [[42.     50.     80.      2.4265]]
df_dataset_x.shape : (1, 4)

df_dataset_x[0] : [42.     50.     80.      2.4265]
df_dataset_x[0].shape : (4,)


### (Model) Parameter

적절한 weight와 bias를 찾기 위해 경사하강법을 적용하는데, 이 때의 weight와 bias를 `Parameter`라고 한다.

### Hyper Parameter

Learning Rate(lr), Epoch, STD 등 고정되어있는 값을 `Hyper Parameter`라고 한다.

### 

<img alt= "process (back, forward propagation)" src=img/process.png>

In [68]:
df = pd.read_csv('dataset/dataset/faults_mini.csv')
df.info()
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   X_Minimum              10 non-null     int64
 1   Y_Minimum              10 non-null     int64
 2   Pixels_Areas           10 non-null     int64
 3   Steel_Plate_Thickness  10 non-null     int64
 4   Pastry                 10 non-null     int64
 5   Z_Scratch              10 non-null     int64
 6   K_Scatch               10 non-null     int64
dtypes: int64(7)
memory usage: 688.0 bytes


Unnamed: 0,X_Minimum,Y_Minimum,Pixels_Areas,Steel_Plate_Thickness,Pastry,Z_Scratch,K_Scatch
0,42,270900,267,80,1,0,0
1,1084,185575,108,40,0,0,1
2,1109,1170194,130,175,0,1,0
3,190,210936,132,150,1,0,0
4,330,429227,264,150,1,0,0
5,74,779144,1506,150,1,0,0
6,51,585861,139,70,0,1,0
7,397,604478,444,70,0,1,0
8,43,366881,4048,40,0,0,1
9,48,377537,3985,40,0,0,1


In [69]:
input_cnt, output_cnt = 4, 3
data = np.asarray(df, dtype='float32')
print(type(data))
print(data)

<class 'numpy.ndarray'>
[[4.200000e+01 2.709000e+05 2.670000e+02 8.000000e+01 1.000000e+00
  0.000000e+00 0.000000e+00]
 [1.084000e+03 1.855750e+05 1.080000e+02 4.000000e+01 0.000000e+00
  0.000000e+00 1.000000e+00]
 [1.109000e+03 1.170194e+06 1.300000e+02 1.750000e+02 0.000000e+00
  1.000000e+00 0.000000e+00]
 [1.900000e+02 2.109360e+05 1.320000e+02 1.500000e+02 1.000000e+00
  0.000000e+00 0.000000e+00]
 [3.300000e+02 4.292270e+05 2.640000e+02 1.500000e+02 1.000000e+00
  0.000000e+00 0.000000e+00]
 [7.400000e+01 7.791440e+05 1.506000e+03 1.500000e+02 1.000000e+00
  0.000000e+00 0.000000e+00]
 [5.100000e+01 5.858610e+05 1.390000e+02 7.000000e+01 0.000000e+00
  1.000000e+00 0.000000e+00]
 [3.970000e+02 6.044780e+05 4.440000e+02 7.000000e+01 0.000000e+00
  1.000000e+00 0.000000e+00]
 [4.300000e+01 3.668810e+05 4.048000e+03 4.000000e+01 0.000000e+00
  0.000000e+00 1.000000e+00]
 [4.800000e+01 3.775370e+05 3.985000e+03 4.000000e+01 0.000000e+00
  0.000000e+00 1.000000e+00]]


In [70]:
weight = np.random.normal(RND_MEAN, RND_MEAN, size = [input_cnt, output_cnt])
bias = np.random.normal(RND_MEAN, RND_MEAN, size = [output_cnt])

print("weight.shape : ", weight.shape)  # (4, 3)
print("bias.shape : ", bias.shape)  # (3,)

weight.shape :  (4, 3)
bias.shape :  (3,)


mini batch

<img alt="mini batch" src=img/minibatch.png> 

데이터의 개수가 많을 때 각 mini batch의 개수가 몇개인지도 알 수 있어야 한다.

In [71]:
# how?

data_count = 10
mb_size = 2
train_ratio = 0.8 

In [72]:
# 처음에 몇 개의 mini batch 덩어리가 생성되느냐?
MiniBatch_step_count = int(data_count * train_ratio) // mb_size
print("MiniBatch_step_count : ", MiniBatch_step_count)

MiniBatch_step_count :  4


In [73]:
# train data가 끝나는 위치 == test data가 시작하는 위치! (위 그림 참고)
test_begin_index = MiniBatch_step_count * mb_size
print("test_begin_index : ", test_begin_index)

test_begin_index :  8


In [74]:
# test data가 시작하는 위치를 알았으니 그 위치를 기준으로 데이터를 섞는다.

# shuffle_map : 데이터가 아니라 인덱스다

shuffle_map = np.arange(data.shape[0])
print("Before : ", shuffle_map)

np.random.shuffle(shuffle_map)
print("After : ", shuffle_map)

Before :  [0 1 2 3 4 5 6 7 8 9]
After :  [2 4 1 9 5 6 0 7 8 3]


In [76]:
# 섞은 인덱스를 데이터에 적용하는 방법

mb_data_1 = data[shuffle_map[mb_size*0 : mb_size*1]]
mb_data_2 = data[shuffle_map[mb_size*1 : mb_size*2]]
mb_data_3 = data[shuffle_map[mb_size*2 : mb_size*3]]
mb_data_4 = data[shuffle_map[mb_size*3 : mb_size*4]]


print(mb_data_1)
print(mb_data_2)
print(mb_data_3)
print(mb_data_4)

[[1.109000e+03 1.170194e+06 1.300000e+02 1.750000e+02 0.000000e+00
  1.000000e+00 0.000000e+00]
 [3.300000e+02 4.292270e+05 2.640000e+02 1.500000e+02 1.000000e+00
  0.000000e+00 0.000000e+00]]
[[1.08400e+03 1.85575e+05 1.08000e+02 4.00000e+01 0.00000e+00 0.00000e+00
  1.00000e+00]
 [4.80000e+01 3.77537e+05 3.98500e+03 4.00000e+01 0.00000e+00 0.00000e+00
  1.00000e+00]]
[[7.40000e+01 7.79144e+05 1.50600e+03 1.50000e+02 1.00000e+00 0.00000e+00
  0.00000e+00]
 [5.10000e+01 5.85861e+05 1.39000e+02 7.00000e+01 0.00000e+00 1.00000e+00
  0.00000e+00]]
[[4.20000e+01 2.70900e+05 2.67000e+02 8.00000e+01 1.00000e+00 0.00000e+00
  0.00000e+00]
 [3.97000e+02 6.04478e+05 4.44000e+02 7.00000e+01 0.00000e+00 1.00000e+00
  0.00000e+00]]


In [85]:
# 위 데이터는 독립과 종속 변수가 섞여있으므로 slicing으로 나눌 필요가 있다.test_begin_index

print("첫 번째 미니배치에 대한 독립변수와 종속변수")
print("="*45)
mb_1_train_x = mb_data_1[:, :-output_cnt]  # 독립변수
mb_1_train_y = mb_data_1[:, output_cnt:]   # 종속변수

print('mb_1_train_x :\n', mb_1_train_x)
print('mb_1_train_y :\n', mb_1_train_y)

첫 번째 미니배치에 대한 독립변수와 종속변수
mb_1_train_x :
 [[1.109000e+03 1.170194e+06 1.300000e+02 1.750000e+02]
 [3.300000e+02 4.292270e+05 2.640000e+02 1.500000e+02]]
mb_1_train_y :
 [[175.   0.   1.   0.]
 [150.   1.   0.   0.]]


In [86]:
print("두 번째 미니배치에 대한 독립변수와 종속변수")
print("="*45)
mb_2_train_x = mb_data_2[:, :-output_cnt]  # 독립변수
mb_2_train_y = mb_data_2[:, output_cnt:]   # 종속변수

print('mb_2_train_x :\n', mb_2_train_x)
print('mb_2_train_y :\n', mb_2_train_y)

두 번째 미니배치에 대한 독립변수와 종속변수
mb_2_train_x :
 [[1.08400e+03 1.85575e+05 1.08000e+02 4.00000e+01]
 [4.80000e+01 3.77537e+05 3.98500e+03 4.00000e+01]]
mb_2_train_y :
 [[40.  0.  0.  1.]
 [40.  0.  0.  1.]]


In [87]:
print("세 번째 미니배치에 대한 독립변수와 종속변수")
print("="*45)
mb_3_train_x = mb_data_3[:, :-output_cnt]  # 독립변수
mb_3_train_y = mb_data_3[:, output_cnt:]   # 종속변수

print('mb_3_train_x :\n', mb_3_train_x)
print('mb_3_train_y :\n', mb_3_train_y)

세 번째 미니배치에 대한 독립변수와 종속변수
mb_3_train_x :
 [[7.40000e+01 7.79144e+05 1.50600e+03 1.50000e+02]
 [5.10000e+01 5.85861e+05 1.39000e+02 7.00000e+01]]
mb_3_train_y :
 [[150.   1.   0.   0.]
 [ 70.   0.   1.   0.]]


In [88]:
print("네 번째 미니배치에 대한 독립변수와 종속변수")
print("="*45)
mb_4_train_x = mb_data_4[:, :-output_cnt]  # 독립변수
mb_4_train_y = mb_data_4[:, output_cnt:]   # 종속변수

print('mb_4_train_x :\n', mb_4_train_x)
print('mb_4_train_y :\n', mb_4_train_y)

네 번째 미니배치에 대한 독립변수와 종속변수
mb_4_train_x :
 [[4.20000e+01 2.70900e+05 2.67000e+02 8.00000e+01]
 [3.97000e+02 6.04478e+05 4.44000e+02 7.00000e+01]]
mb_4_train_y :
 [[80.  1.  0.  0.]
 [70.  0.  1.  0.]]


<img alt="when input, output layer number is over 1" src=img/input_output_layers.png> 

In [89]:
mb_1_y_hat_1 = np.matmul(mb_1_train_x, weight[:, 0]) + bias[0]
mb_1_y_hat_2 = np.matmul(mb_1_train_x, weight[:, 1]) + bias[1]
mb_1_y_hat_3 = np.matmul(mb_1_train_x, weight[:, 2]) + bias[2]