### 데이터 분석용 함수 사용하기

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = [[1.4, np.nan],
           [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3]]
df = pd.DataFrame(data, columns=["one", "two"], index=["a", "b", "c", "d"])


### pandas의 DataFrame에 적용되는 통계 함수 정리

- count : 전체 성분의 (NaN이 아닌) 값의 갯수를 계산
- min, max : 전체 성분의 최솟, 최댓값을 계산
- argmin, argmax : 전체 성분의 최솟값, 최댓값이 위치한 (정수)인덱스를 반환
- idxmin, idxmax : 전체 인덱스 중 최솟값, 최댓값을 반환
- quantile : 전체 성분의 특정 사분위수에 해당하는 값을 반환 (0~1 사이)
- sum :	전체 성분의 합을 계산
- mean	: 전체 성분의 평균을 계산
- median  :	전체 성분의 중간값을 반환
- mad :	전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산
- std, var  : 전체 성분의 표준편차, 분산을 계산
- cumsum :	맨 첫 번째 성분부터 각 성분까지의 누적합을 계산 (0에서부터 계속 더해짐)
- cumprod	: 맨 첫번째 성분부터 각 성분까지의 누적곱을 계산 (1에서부터 계속 곱해짐)

In [3]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [5]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [6]:
# NaN 값 배제하고 계산
df.sum(axis=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [7]:
df["one"].sum()

9.25

In [8]:
df.loc["b"].sum()

2.5999999999999996

In [9]:
# 계산에 NaN이 하나라도 포함되면 결과 NaN
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [10]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [11]:
one_mean = df.mean(axis=0)["one"]

In [19]:
one_mean

3.0833333333333335

In [16]:
two_min = df.min(axis=0)["two"]

In [20]:
two_min

-4.5

In [17]:
# 결측값 채우기
df["one"] = df["one"].fillna(value=one_mean)

In [18]:
df["two"] = df["two"].fillna(value=two_min)

In [21]:
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


In [22]:
df2 = pd.DataFrame(np.random.randn(6, 4),
                   columns=["A", "B", "C", "D"],
                   index=pd.date_range("20160701", periods=6))

In [24]:
df2

Unnamed: 0,A,B,C,D
2016-07-01,-1.358444,1.061269,0.403506,-1.911504
2016-07-02,1.588405,-0.27325,0.521027,0.463116
2016-07-03,0.045428,1.968125,0.324094,0.006559
2016-07-04,0.126236,0.285163,-0.835457,0.540301
2016-07-05,-1.580125,-0.364743,1.288973,-0.073984
2016-07-06,-0.271599,-1.005723,-1.549332,0.609718


In [25]:
# A열과 B열의 상관계수 구하기
# A열 데이터 프레임을 전체선택 후 corr함수를 이용하여 B열 데이터와의 상관계수를 구함
df2["A"].corr(df2["B"])

-0.080046649772919864

In [28]:
#공분산 구하기
df2["B"].cov(df2["C"])

0.34619474129850825

In [29]:
# df2의 모든 열들 간의 상관계수 구하기
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,-0.080047,-0.235058,0.597563
B,-0.080047,1.0,0.311404,-0.481707
C,-0.235058,0.311404,1.0,-0.411062
D,0.597563,-0.481707,-0.411062,1.0


In [30]:
# df2의 모든 열들 간의 공분산 구하기
df2.cov()

Unnamed: 0,A,B,C,D
A,1.3213,-0.099399,-0.278058,0.652738
B,-0.099399,1.167004,0.346195,-0.494508
C,-0.278058,0.346195,1.05906,-0.401996
D,0.652738,-0.494508,-0.401996,0.903044


In [31]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [32]:
df2

Unnamed: 0,D,B,C,A
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-01,-1.911504,1.061269,0.403506,-1.358444
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-03,0.006559,1.968125,0.324094,0.045428


In [34]:
# 인덱스 오름차순 정렬
df2.sort_index(axis=0)

Unnamed: 0,D,B,C,A
2016-07-01,-1.911504,1.061269,0.403506,-1.358444
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-03,0.006559,1.968125,0.324094,0.045428
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599


In [35]:
# 컬럼 인덱스 오름차순 정렬
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2016-07-06,-0.271599,-1.005723,-1.549332,0.609718
2016-07-04,0.126236,0.285163,-0.835457,0.540301
2016-07-02,1.588405,-0.27325,0.521027,0.463116
2016-07-01,-1.358444,1.061269,0.403506,-1.911504
2016-07-05,-1.580125,-0.364743,1.288973,-0.073984
2016-07-03,0.045428,1.968125,0.324094,0.006559


In [36]:
# 내림차순으로 정렬
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-03,0.006559,1.968125,0.324094,0.045428
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-01,-1.911504,1.061269,0.403506,-1.358444


In [38]:
df2

Unnamed: 0,D,B,C,A
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-01,-1.911504,1.061269,0.403506,-1.358444
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-03,0.006559,1.968125,0.324094,0.045428


In [39]:
# D열 값을 기준으로 오름차순 정렬
df2.sort_values(by="D")

Unnamed: 0,D,B,C,A
2016-07-01,-1.911504,1.061269,0.403506,-1.358444
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-03,0.006559,1.968125,0.324094,0.045428
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599


In [40]:
df2.sort_values(by="B")

Unnamed: 0,D,B,C,A
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125
2016-07-02,0.463116,-0.27325,0.521027,1.588405
2016-07-04,0.540301,0.285163,-0.835457,0.126236
2016-07-01,-1.911504,1.061269,0.403506,-1.358444
2016-07-03,0.006559,1.968125,0.324094,0.045428


In [None]:
df2

In [41]:
df2["E"] = np.random.randint(0, 6, size=6)
df2["F"] = ["alpha", "beta", "gamma", "gamma", "alpha", "gamma"]

In [42]:
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599,3,alpha
2016-07-04,0.540301,0.285163,-0.835457,0.126236,4,beta
2016-07-02,0.463116,-0.27325,0.521027,1.588405,4,gamma
2016-07-01,-1.911504,1.061269,0.403506,-1.358444,3,gamma
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125,4,alpha
2016-07-03,0.006559,1.968125,0.324094,0.045428,5,gamma


In [43]:
# E먼저 정럴 후 F순으로 정렬
df2.sort_values(by=["E","F"])

Unnamed: 0,D,B,C,A,E,F
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599,3,alpha
2016-07-01,-1.911504,1.061269,0.403506,-1.358444,3,gamma
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125,4,alpha
2016-07-04,0.540301,0.285163,-0.835457,0.126236,4,beta
2016-07-02,0.463116,-0.27325,0.521027,1.588405,4,gamma
2016-07-03,0.006559,1.968125,0.324094,0.045428,5,gamma


In [44]:
# F의 열의 유니크값 
df2["F"].unique()

array(['alpha', 'beta', 'gamma'], dtype=object)

In [45]:
# F열에서 특정한 값의 갯수
df2["F"].value_counts()

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [46]:
# F열에서 alpha ,beta 를 포함하고 있는지 불리언 마스크
df2["F"].isin(["alph","beta"])

2016-07-06    False
2016-07-04     True
2016-07-02    False
2016-07-01    False
2016-07-05    False
2016-07-03    False
Name: F, dtype: bool

In [47]:
df2.loc[df2["F"].isin(["alpha","beta"]),:]

Unnamed: 0,D,B,C,A,E,F
2016-07-06,0.609718,-1.005723,-1.549332,-0.271599,3,alpha
2016-07-04,0.540301,0.285163,-0.835457,0.126236,4,beta
2016-07-05,-0.073984,-0.364743,1.288973,-1.580125,4,alpha


In [48]:
df3 = pd.DataFrame(np.random.randn(4, 3), columns=["b", "d", "e"],
                   index=["Seoul", "Incheon", "Busan", "Daegu"])

In [49]:
df3

Unnamed: 0,b,d,e
Seoul,0.620957,-0.262335,0.162728
Incheon,0.773077,-0.091086,-0.505358
Busan,0.082551,-0.165307,0.434979
Daegu,-1.42617,-0.987191,-1.089048


In [52]:
# 사용자 정의 함수 
# lambda이름의 함수를 정의한 뒤 func변수에 저장
func = lambda x : x.max() - x.min()

In [53]:
# apply함수를 이용하여 df3 데이터 프레임에 사용자 함수 적용
df3.apply(func, axis = 0)

b    2.199247
d    0.896104
e    1.524027
dtype: float64

In [54]:
df3.apply(func, axis = 1 )

Seoul      0.883292
Incheon    1.278435
Busan      0.600286
Daegu      0.438979
dtype: float64