In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

<font color = "#CC3D3D"><p>
# Topics
* [Apply](#Apply)
* [Manipulating Dates and Times](#Manipulating-Dates-and-Times)
* [Handling Missing Data](#Handling-Missing-Data)

## Apply

<font color = 'blue'>Apply a function to each value in a Series

In [2]:
Series(range(1,5)).apply(np.log) # take the natural log of each of the values 1, 2, 3, 4

0    0.000000
1    0.693147
2    1.098612
3    1.386294
dtype: float64

In [3]:
# The lambda's parameter x receives one element of the Series at a time
Series(range(1,5)).apply(lambda x: 1/x)

0    1.000000
1    0.500000
2    0.333333
3    0.250000
dtype: float64

In [4]:
# Extra arguments for the applied function are supplied as a tuple through apply()'s args parameter
Series(range(1,5)).apply(lambda x,y: x+y, args=(3,))

0    4
1    5
2    6
3    7
dtype: int64

In [5]:
# Example frame that the later cells in this notebook keep extending.
df = DataFrame(
    {
        'id': [1, 1, 1, 1, 2, 2, 2],
        'site': ['a', 'b', 'c', 'a', 'a', 'b', 'b'],
        'pageview': np.arange(1, 8),          # 1 .. 7
        'dwelltime': np.arange(7.0, 0, -1),   # 7.0 down to 1.0
    },
    columns=['id', 'site', 'pageview', 'dwelltime'],
)
df

Unnamed: 0,id,site,pageview,dwelltime
0,1,a,1,7.0
1,1,b,2,6.0
2,1,c,3,5.0
3,1,a,4,4.0
4,2,a,5,3.0
5,2,b,6,2.0
6,2,b,7,1.0


In [6]:
def normalize(x, min, max):
    """Min-max scale ``x`` relative to the bounds ``min`` and ``max``.

    Returns ``(x - min) / (max - min)``: 0 when ``x == min`` and 1 when
    ``x == max``.
    """
    offset = x - min
    span = max - min
    return offset / span

# Smallest and largest pageview, used as the scaling bounds for normalize().
# NOTE(review): these names shadow the built-in min()/max() for the rest of the
# session; they are kept because the following cell passes them as args=(min, max).
min, max = df['pageview'].min(), df['pageview'].max()
min, max

(1, 7)

In [7]:
df['pageview'].apply(normalize, args=(min, max))
# normalize takes min and max as extra parameters, so args carries those two values.

0    0.000000
1    0.166667
2    0.333333
3    0.500000
4    0.666667
5    0.833333
6    1.000000
Name: pageview, dtype: float64

## Manipulating Dates and Times

In [8]:
# pd.date_range generates the dates within a given range, so they do not have to be typed one by one
# See https://datascienceschool.net/view-notebook/8959673a97214e8fafdb159f254185e9/
t = Series(pd.date_range('2020-05-20', periods=7))
t
# Shows seven consecutive days starting from May 20.

0   2020-05-20
1   2020-05-21
2   2020-05-22
3   2020-05-23
4   2020-05-24
5   2020-05-25
6   2020-05-26
dtype: datetime64[ns]

In [43]:
Series(['2020-05-20','2020-05-21','2020-05-22','2020-05-23','2020-05-24','2020-05-25','2020-05-26'])

0    2020-05-20
1    2020-05-21
2    2020-05-22
3    2020-05-23
4    2020-05-24
5    2020-05-25
6    2020-05-26
dtype: object

In [9]:
t = pd.to_datetime(Series(['2020-05-20','2020-05-21','2020-05-22','2020-05-23','2020-05-24','2020-05-25','2020-05-26']))
t
# The date strings are parsed into datetime values and kept as a Series rather than a plain list.

0   2020-05-20
1   2020-05-21
2   2020-05-22
3   2020-05-23
4   2020-05-24
5   2020-05-25
6   2020-05-26
dtype: datetime64[ns]

In [10]:
# astype can stand in for pd.to_datetime, but the target dtype must carry an
# explicit unit: pandas 2.x rejects the unit-less 'datetime64'.
Series(['2020-05-20','2020-05-21','2020-05-22']).astype('datetime64[ns]')

0   2020-05-20
1   2020-05-21
2   2020-05-22
dtype: datetime64[ns]

In [11]:
# Extract only the month of each date; the extracted values are integers.
# The dtype carries an explicit unit because pandas 2.x rejects the unit-less 'datetime64'.
Series(['2020-05-20','2020-05-21','2020-05-22']).astype('datetime64[ns]').dt.month

0    5
1    5
2    5
dtype: int64

In [12]:
t

0   2020-05-20
1   2020-05-21
2   2020-05-22
3   2020-05-23
4   2020-05-24
5   2020-05-25
6   2020-05-26
dtype: datetime64[ns]

In [13]:
t.dt.weekday
# 0 --> Monday, 1 --> Tuesday, 2 --> Wednesday, ...

0    2
1    3
2    4
3    5
4    6
5    0
6    1
dtype: int64

In [14]:
# Getting the date components of a single element of the Series:
# the .dt accessor is for the Series as a whole; an individual element exposes the attributes directly.
t[0].year , t[0].month, t[0].day

(2020, 5, 20)

In [15]:
# The .dt accessor extracts a date component from every element of the Series at once.
# One print call, with each component Series on its own block of lines.
print(t.dt.year, t.dt.month, t.dt.day, sep='\n')

0    2020
1    2020
2    2020
3    2020
4    2020
5    2020
6    2020
dtype: int64
0    5
1    5
2    5
3    5
4    5
5    5
6    5
dtype: int64
0    20
1    21
2    22
3    23
4    24
5    25
6    26
dtype: int64


In [16]:
# Build one (year, month, day) tuple per row by walking the timestamps directly.
[(stamp.year, stamp.month, stamp.day) for stamp in t]

[(2020, 5, 20),
 (2020, 5, 21),
 (2020, 5, 22),
 (2020, 5, 23),
 (2020, 5, 24),
 (2020, 5, 25),
 (2020, 5, 26)]

<font color = 'blue'>Weekday

In [17]:
t.dt.weekday

0    2
1    3
2    4
3    5
4    6
5    0
6    1
dtype: int64

In [18]:
t.apply(lambda x: x.weekday())

0    2
1    3
2    4
3    5
4    6
5    0
6    1
dtype: int64

In [19]:
t.dt.day_name()

0    Wednesday
1     Thursday
2       Friday
3     Saturday
4       Sunday
5       Monday
6      Tuesday
dtype: object

In [20]:
# String operations in pandas go through the .str accessor.
# .dt is the accessor for datetime values; .str is the accessor for string values
# (day_name() returns strings, so .str applies to its result).
t.dt.day_name().str.upper()

0    WEDNESDAY
1     THURSDAY
2       FRIDAY
3     SATURDAY
4       SUNDAY
5       MONDAY
6      TUESDAY
dtype: object

In [21]:
t.dt.day_name().str.upper().str[:3] # slicing each string must go through the accessor: .str[...]

0    WED
1    THU
2    FRI
3    SAT
4    SUN
5    MON
6    TUE
dtype: object

In [22]:
t.dt.day_name().str.upper().str[:3].str.contains('SAT') # True where the abbreviation contains 'SAT' (i.e. Saturday)

0    False
1    False
2    False
3     True
4    False
5    False
6    False
dtype: bool

In [23]:
# Look up the Korean weekday character by weekday number (0 = Monday) and append the suffix.
t.dt.weekday.map(lambda idx: ('월','화','수','목','금','토','일')[idx] + '요일')

0    수요일
1    목요일
2    금요일
3    토요일
4    일요일
5    월요일
6    화요일
dtype: object

In [24]:
# Add the dates to df as plain strings (not yet parsed into datetime values).
df['date'] = Series(['2020-05-20','2020-05-21','2020-05-22','2020-05-23','2020-05-24','2020-05-25','2020-05-26'])
df

Unnamed: 0,id,site,pageview,dwelltime,date
0,1,a,1,7.0,2020-05-20
1,1,b,2,6.0,2020-05-21
2,1,c,3,5.0,2020-05-22
3,1,a,4,4.0,2020-05-23
4,2,a,5,3.0,2020-05-24
5,2,b,6,2.0,2020-05-25
6,2,b,7,1.0,2020-05-26


In [25]:
# Get the weekday name from a column that stores dates as strings.
# pd.to_datetime parses the strings; astype('datetime64') without a unit is
# rejected by pandas 2.x.
df['dayofweek'] = pd.to_datetime(df['date']).dt.day_name()
df

Unnamed: 0,id,site,pageview,dwelltime,date,dayofweek
0,1,a,1,7.0,2020-05-20,Wednesday
1,1,b,2,6.0,2020-05-21,Thursday
2,1,c,3,5.0,2020-05-22,Friday
3,1,a,4,4.0,2020-05-23,Saturday
4,2,a,5,3.0,2020-05-24,Sunday
5,2,b,6,2.0,2020-05-25,Monday
6,2,b,7,1.0,2020-05-26,Tuesday


<font color = 'blue'>Elapsed time

In [26]:
# Elapsed time since 2000-01-01
edays = (t - pd.to_datetime('2000-01-01'))
edays
# edays + 1 --> raises an error: a plain integer cannot be added to a time duration,
# so the durations have to be converted to integers first.

0   7445 days
1   7446 days
2   7447 days
3   7448 days
4   7449 days
5   7450 days
6   7451 days
dtype: timedelta64[ns]

In [27]:
# .dt.days returns the whole-day count of each duration as integers, so
# arithmetic such as + 1 works on the result.
# (astype('timedelta64[D]') is not supported by pandas 2.x, which only accepts
# the s / ms / us / ns resolutions.)
edays.dt.days

0    7445
1    7446
2    7447
3    7448
4    7449
5    7450
6    7451
dtype: int64

In [28]:
# Days elapsed since 2000-01-01 for each row, as integers.
# pd.to_datetime parses the string column and .dt.days extracts whole days;
# the unit-less 'datetime64' and 'timedelta64[D]' casts fail on pandas 2.x.
df['elapsed'] = (pd.to_datetime(df['date']) - pd.to_datetime('2000-01-01')).dt.days
df

Unnamed: 0,id,site,pageview,dwelltime,date,dayofweek,elapsed
0,1,a,1,7.0,2020-05-20,Wednesday,7445
1,1,b,2,6.0,2020-05-21,Thursday,7446
2,1,c,3,5.0,2020-05-22,Friday,7447
3,1,a,4,4.0,2020-05-23,Saturday,7448
4,2,a,5,3.0,2020-05-24,Sunday,7449
5,2,b,6,2.0,2020-05-25,Monday,7450
6,2,b,7,1.0,2020-05-26,Tuesday,7451


In [44]:
# Date three months after each row's date.
# DateOffset(months=3) ADDS three months; the singular keyword `month=3` would
# instead REPLACE the month with March, moving every date backwards here.
# pd.to_datetime is used because astype('datetime64') without a unit fails on pandas 2.x.
df['date_3m'] = pd.to_datetime(df['date']) + pd.DateOffset(months=3)



## Handling Missing Data

In [46]:
# Left table has keys a/b/c (with repeats); right table has keys a/b/d.
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

# An outer join keeps the unmatched keys from both sides ('c' and 'd'),
# which leaves missing values in the columns coming from the other table.
df5 = df3.merge(df4, how='outer', left_on='lkey', right_on='rkey')
df5

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


<font color = 'blue'>Find missing values

In [58]:
df5

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [59]:
pd.isnull(df5)  # df5.isnull()

Unnamed: 0,lkey,data1,rkey,data2
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,True,True
7,True,True,False,False


In [60]:
df5.isnull()

Unnamed: 0,lkey,data1,rkey,data2
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,True,True
7,True,True,False,False


In [61]:
pd.notnull(df5.data1)  # df5.data1.notnull()

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7    False
Name: data1, dtype: bool

In [65]:
pd.notnull(df5.data1).value_counts()

True     7
False    1
Name: data1, dtype: int64

In [66]:
df5

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


<font color = 'blue'>Remove rows with missing values

In [63]:
df5.dropna(how='any')
# Drops every row that contains at least one missing value.

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0


In [36]:
df5.dropna(how='all').reset_index(drop = True) # drop fully-missing rows, then renumber the index
# A row is dropped only when ALL of its values are missing.

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [37]:
# Overwrite every value of the row at position 6 with NaN so that a
# completely missing row exists for the dropna(how='all') example.
df5.iloc[6] = [np.nan] * 4
df5

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,,,,
7,,,d,2.0


In [38]:
df5.dropna(how='all').reset_index(drop=True) # a row is removed only when all of its values are missing

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,,,d,2.0


<font color = 'blue'>Replace Missing Values

보통 평균값, 최빈값 등 다른 값으로 결측값을 채움.

In [40]:
df5.fillna(-1)

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,-1,-1.0,-1,-1.0
7,-1,-1.0,d,2.0


In [41]:
df5.fillna({'data1': 1.5, 'data2': 0.5, 'lkey': 'Y', 'rkey': ''})
# Fills the missing values with a different replacement value for each column.

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,Y,1.5,,0.5
7,Y,1.5,d,2.0


<font color = "#CC3D3D"><p>
# End