### Numpy의 활용

In [142]:
import numpy as np

numpy로 n dimension의 배열 객체 생성

In [143]:
# Create a 1-D array and a 2-D array (2 rows x 3 columns) from nested lists.
arr1 = np.array([1, 2, 3])
arr2 = np.array([[1, 2, 3], [2, 3, 4]])
# shape gives the size along each axis
arr2.shape

(2, 3)

ndarray의 데이터 타입

In [144]:
# dtype reports the element type shared by every item in the ndarray.
arr2.dtype

dtype('int64')

astype()으로 변경 가능

In [145]:
# astype() returns a converted array, so rebind arr2 to keep the float version.
arr2 = arr2.astype('float')
arr2.dtype

dtype('float64')

ndarray의 axis 축  
세로 방향(row)이 axis 0, 가로 방향(column)이 axis 1이 됨  
차원이 높아져 새로운 축이 나타날때마다 axis 값은 1씩 늘어남

ndarray의 초기화와 생성

In [146]:
# arange: consecutive integers starting at 0 and stopping before 10
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [147]:
# zeros: a 3x2 array filled with 0, stored as 32-bit integers
np.zeros(shape=(3, 2), dtype=np.int32)

array([[0, 0],
       [0, 0],
       [0, 0]], dtype=int32)

In [148]:
# ones: a 3x2 array filled with 1 (floating point when no dtype is given)
np.ones((3,2))

array([[1., 1.],
       [1., 1.],
       [1., 1.]])

ndarray의 차원과 크기를 변경하는 reshape()

In [149]:
# reshape the 10 elements into 2 rows x 5 columns
np.arange(10).reshape(2,5)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

axis 0 자리에 -1을 두면 axis 0의 크기는 가변적으로 두겠다는 뜻.  
나머지 축은 고정으로 둠.

In [150]:
np.arange(20).reshape(-1, 4)  # only the axis-1 size is fixed at 4; axis 0 is inferred

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

간혹 reshape(-1, 1)의 변환을 요구하는 경우가 있는데, 머신러닝 라이브러리에서 1d ndarray를 받지 않고 2d를 요구하기 때문이다. 

In [151]:
# 1-D -> 2-D conversion (a single column)
np.arange(5).reshape(-1, 1)

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [152]:
# 2-D -> 1-D conversion: flatten the single column back into a flat array
np.array([[0], [1], [2], [3], [4]]).reshape(-1)

array([0, 1, 2, 3, 4])

ndarray의 데이터 세트 선택 - Indexing

In [153]:
# 1-D indexing: pick a single element by position
arr = np.arange(10)
arr[3]

3

In [154]:
# 2-D indexing: select row 1, column 2 with a single multi-dimensional index.
# arr[1, 2] avoids the temporary row array that the chained form arr[1][2] creates.
arr = np.array([[1,2,3],[2,3,4]])
arr[1, 2]

4

In [155]:
# 1-D slicing: everything from position 3 to the end
arr = np.arange(10)
arr[3:]

array([3, 4, 5, 6, 7, 8, 9])

In [156]:
# 2-D slicing: rows from index 1 onward, first two columns
arr = np.array([[1,2,3],[2,3,4],[3,4,5]])
arr[1:, :2]

array([[2, 3],
       [3, 4]])

In [157]:
# 1-D fancy indexing: pick positions 1, 3 and 5 with a list of indices
arr = np.arange(10)
arr[[1,3,5]]

array([1, 3, 5])

In [158]:
# 2-D fancy indexing: column 2 of rows 0 and 1
arr = np.array([[1,2,3],[2,3,4],[3,4,5]])
arr[[0,1], 2]

array([3, 4])

In [159]:
# boolean indexing: keep only the elements where the mask arr<5 is True
arr = np.arange(10)
arr[arr<5]

array([0, 1, 2, 3, 4])

sort() 와 argsort()

In [160]:
# np.sort returns a sorted copy; arr itself keeps its original order
arr = np.array([3,2,5,1,6])
np.sort(arr)

array([1, 2, 3, 5, 6])

In [161]:
# argsort returns the original indices of the elements in sorted order
np.argsort(arr)

array([3, 1, 0, 2, 4])

벡터/행렬 내적과 전치 행렬

In [162]:
# Dot product of two 1-D vectors: 1*1 + 2*2 + 3*3 = 14
np.dot([1,2,3],[1,2,3])

14

In [163]:
# Transpose: the 2x3 input becomes a 3x2 array
np.transpose([[1,2,3], [2,3,4]])

array([[1, 2],
       [2, 3],
       [3, 4]])

### Pandas의 활용

예시 데이터 따릉이 사용

In [164]:
import pandas as pd
# Load the training CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
data = pd.read_csv("./ddareng/train.csv")
data.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0


In [165]:
# Show the last 3 rows
data.tail(3)

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
1456,2176,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,2178,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0
1458,2179,17,21.1,0.0,3.1,47.0,1973.0,0.046,38.0,17.0,170.0


In [166]:
# (number of rows, number of columns)
data.shape

(1459, 11)

In [167]:
# Column labels, the row index object, and the index labels as a NumPy array
print(data.columns)
print(data.index)
print(data.index.values)

Index(['id', 'hour', 'hour_bef_temperature', 'hour_bef_precipitation',
       'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
       'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2.5', 'count'],
      dtype='object')
RangeIndex(start=0, stop=1459, step=1)
[   0    1    2 ... 1456 1457 1458]


In [168]:
# Per-column non-null counts and dtypes, plus overall memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB


In [169]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
count,1459.0,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,1105.914325,11.493489,16.717433,0.031572,2.479034,52.231297,1405.216884,0.039149,57.168736,30.327124,108.5634
std,631.338681,6.92279,5.23915,0.174917,1.378265,20.370387,583.131708,0.019509,31.771019,14.713252,82.631733
min,3.0,0.0,3.1,0.0,0.0,7.0,78.0,0.003,9.0,8.0,1.0
25%,555.5,5.5,12.8,0.0,1.4,36.0,879.0,0.0255,36.0,20.0,37.0
50%,1115.0,11.0,16.6,0.0,2.3,51.0,1577.0,0.039,51.0,26.0,96.0
75%,1651.0,17.5,20.1,0.0,3.4,69.0,1994.0,0.052,69.0,37.0,150.0
max,2179.0,23.0,30.0,1.0,8.0,99.0,2000.0,0.125,269.0,90.0,431.0


In [170]:
# Number of rows for each distinct 'hour' value
data['hour'].value_counts()

20    61
4     61
5     61
11    61
22    61
19    61
15    61
7     61
8     61
10    61
1     61
14    61
9     61
3     61
2     61
18    61
23    61
6     61
12    61
13    60
17    60
16    60
0     60
21    60
Name: hour, dtype: int64

In [171]:
# Append a helper column that numbers the rows 0..n-1, then preview the result.
data['new column'] = np.arange(len(data))
data.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count,new column
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0,0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0,1
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0,2
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0,3
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0,4


In [172]:
# drop without inplace: returns a new DataFrame and leaves `data` unchanged
data.drop('new column', axis=1)

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.040,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0
...,...,...,...,...,...,...,...,...,...,...,...
1454,2174,4,16.8,0.0,1.6,53.0,2000.0,0.031,37.0,27.0,21.0
1455,2175,3,10.8,0.0,3.8,45.0,2000.0,0.039,34.0,19.0,20.0
1456,2176,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,2178,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0


In [173]:
# drop with inplace=True: removes the column from `data` itself
# NOTE(review): not re-runnable as written - a second run presumably raises
# KeyError because 'new column' is already gone; confirm before relying on reruns.
data.drop('new column', axis=1, inplace=True)
data.head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57.0
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0


In [174]:
# reset_index() adds the previous index as a new 'index' column
data.reset_index()

Unnamed: 0,index,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,3,8,23,8.1,0.0,2.7,54.0,946.0,0.040,75.0,64.0,57.0
4,4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1454,2174,4,16.8,0.0,1.6,53.0,2000.0,0.031,37.0,27.0,21.0
1455,1455,2175,3,10.8,0.0,3.8,45.0,2000.0,0.039,34.0,19.0,20.0
1456,1456,2176,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,1457,2178,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0


In [175]:
# Reset the index without adding the old index as a column (drop=True),
# then drop the 'id' column. Both calls return new DataFrames, so they are
# chained into one expression; `data` itself is left unchanged.
# (Previously the reset_index result was discarded and had no effect.)
data.reset_index(drop=True).drop('id', axis=1)

Unnamed: 0,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
0,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49.0
1,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159.0
2,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
3,23,8.1,0.0,2.7,54.0,946.0,0.040,75.0,64.0,57.0
4,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431.0
...,...,...,...,...,...,...,...,...,...,...
1454,4,16.8,0.0,1.6,53.0,2000.0,0.031,37.0,27.0,21.0
1455,3,10.8,0.0,3.8,45.0,2000.0,0.039,34.0,19.0,20.0
1456,5,18.3,0.0,1.9,54.0,2000.0,0.009,30.0,21.0,22.0
1457,21,20.7,0.0,3.7,37.0,1395.0,0.082,71.0,36.0,216.0


df의 인덱싱에는 loc[] : 명칭기반, iloc[] : 위치 기반 인덱싱이 있다.  
loc[]는 column 명, index를 사용해 지정하고, iloc[]는 오직 정수 값으로만 위치를 지정한다.

In [176]:
# loc: label-based lookup
print(data.loc[0, 'hour']) # the 0 here is an index label, so in general it need not be an integer
# iloc: position-based lookup
print(data.iloc[0, 1]) # both coordinates must be integer positions (column 1 is 'hour')

20
20


In [177]:
# Boolean indexing makes row filtering concise: keep rows where hour < 12.
data[data['hour']<12].head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26.0
5,13,2,13.6,0.0,1.7,80.0,1073.0,0.027,34.0,15.0,39.0
6,14,3,10.6,0.0,1.5,58.0,1548.0,0.038,62.0,33.0,23.0
8,19,9,13.8,0.0,1.9,64.0,1344.0,0.039,93.0,19.0,39.0
10,21,4,5.7,0.0,0.6,77.0,1960.0,0.028,14.0,18.0,6.0


In [178]:
# Rows with hour < 12 and count > 30, showing only those two columns.
# A single .loc call selects rows and columns together.
data.loc[(data['hour'] < 12) & (data['count'] > 30.0), ['hour', 'count']].head()

Unnamed: 0,hour,count
5,2,39.0
8,9,39.0
11,10,42.0
12,9,59.0
13,10,60.0


In [179]:
# Sort rows by 'hour' in ascending order and preview the first five
data.sort_values(by='hour').head()

Unnamed: 0,id,hour,hour_bef_temperature,hour_bef_precipitation,hour_bef_windspeed,hour_bef_humidity,hour_bef_visibility,hour_bef_ozone,hour_bef_pm10,hour_bef_pm2.5,count
149,223,0,18.6,0.0,0.3,58.0,554.0,0.053,110.0,63.0,104.0
485,748,0,14.8,0.0,1.2,64.0,2000.0,0.042,28.0,16.0,78.0
1224,1847,0,16.8,0.0,0.6,62.0,2000.0,0.029,35.0,19.0,94.0
829,1260,0,15.6,0.0,2.3,62.0,1433.0,0.023,56.0,36.0,93.0
473,729,0,13.8,0.0,3.2,69.0,518.0,0.042,79.0,46.0,46.0


In [180]:
# Mean, max and min of 'count' within each hour
data.groupby('hour')['count'].agg(['mean', 'max', 'min'])

Unnamed: 0_level_0,mean,max,min
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,71.766667,137.0,17.0
1,47.606557,98.0,6.0
2,31.409836,70.0,3.0
3,21.377049,46.0,3.0
4,13.52459,31.0,2.0
5,13.114754,28.0,1.0
6,24.557377,53.0,3.0
7,62.360656,127.0,6.0
8,136.688525,267.0,21.0
9,93.540984,149.0,30.0


In [181]:
# Number of missing values in each column
data.isna().sum()

id                          0
hour                        0
hour_bef_temperature        2
hour_bef_precipitation      2
hour_bef_windspeed          9
hour_bef_humidity           2
hour_bef_visibility         2
hour_bef_ozone             76
hour_bef_pm10              90
hour_bef_pm2.5            117
count                       0
dtype: int64

In [182]:
# Replace every missing value with 0, then confirm that no NaNs remain.
# NOTE(review): 0 is a real reading for columns such as temperature or ozone,
# so zero-filling may distort their statistics - confirm this is acceptable.
data = data.fillna(0)
data.isna().sum()

id                        0
hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64

In [183]:
# Number of distinct values in each column
data.nunique()

id                        1459
hour                        24
hour_bef_temperature       246
hour_bef_precipitation       2
hour_bef_windspeed          73
hour_bef_humidity           92
hour_bef_visibility        782
hour_bef_ozone              96
hour_bef_pm10              149
hour_bef_pm2.5              80
count                      314
dtype: int64

In [184]:
# Replace hour value 0 with 100 (overwrites the 'hour' column), then recount each value
data['hour'] = data['hour'].replace({ 0 : 100 })
data['hour'].value_counts()

20     61
4      61
5      61
11     61
22     61
19     61
15     61
7      61
8      61
10     61
1      61
14     61
9      61
3      61
2      61
18     61
23     61
6      61
12     61
13     60
17     60
16     60
100    60
21     60
Name: hour, dtype: int64

In [185]:
# apply a lambda element-wise: square every hour, then count each result
data['hour'].apply(lambda x: x**2).value_counts()

400      61
16       61
25       61
121      61
484      61
361      61
225      61
49       61
64       61
100      61
1        61
196      61
81       61
9        61
4        61
324      61
529      61
36       61
144      61
169      60
289      60
256      60
10000    60
441      60
Name: hour, dtype: int64

In [186]:
# apply with a conditional lambda: map each hour to 10 or 100.
# NOTE(review): after the earlier replace the largest hour is 100, so x<=100 is
# true for every row and everything maps to 10 - possibly x<100 was intended; confirm.
data['hour'].apply(lambda x: 10 if x<=100 else 100).value_counts()

10    1459
Name: hour, dtype: int64