In [1]:
import numpy as np
import pandas as pd

In [2]:
# Four labelled rows and two numeric columns; np.nan marks the missing cells
# at (b, first) and (c, second).
data = [
    [9.9, 8.8],
    [np.nan, 6.6],
    [7.7, np.nan],
    [0.99, 9.8],
]
df = pd.DataFrame(
    data,
    index=["a", "b", "c", "d"],
    columns=["first", "second"],
)

In [3]:
# Show the frame: NaN sits at (b, first) and (c, second).
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [4]:
# Total of each column (reduce along the index); NaN cells are skipped.
df.sum(axis="index")

first     18.59
second    25.20
dtype: float64

In [5]:
# Total of each row (reduce across the columns); NaN cells are skipped.
df.sum(axis="columns")

a    18.70
b     6.60
c     7.70
d    10.79
dtype: float64

In [6]:
# Re-display df: the sums above returned new objects and left it unchanged.
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [7]:
# Sum of one column, selected by label.
df.loc[:, "first"].sum()

18.59

In [8]:
# Sum of one row, selected by its index label.
df.loc["a", :].sum()

18.700000000000003

In [9]:
# Row means; with skipna=False any row containing a NaN yields NaN.
df.mean(axis="columns", skipna=False)

a    9.350
b      NaN
c      NaN
d    5.395
dtype: float64

In [10]:
# Re-display df before its missing values are filled in.
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [11]:
# Mean of the "first" column (NaN skipped); used below as its fill value.
first_mean = df["first"].mean()

In [12]:
# Minimum of the "second" column (NaN skipped); used below as its fill value.
second_min = df["second"].min()

In [13]:
# Replace the NaN in "first" with that column's mean; other columns are untouched.
df = df.fillna({"first": first_mean})

In [14]:
# Replace the NaN in "second" with that column's minimum; other columns are untouched.
df = df.fillna({"second": second_min})

In [15]:
# df after filling: (b, first) holds the column mean, (c, second) the column minimum.
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,6.6
d,0.99,9.8


In [16]:
# Seed NumPy's global RNG so this frame -- and the later np.random calls in this
# notebook -- give the same values on every Restart & Run All.
# NOTE: outputs saved from an earlier, unseeded run will differ until re-executed.
np.random.seed(42)

# Six consecutive days starting 2018-02-20, four columns of standard-normal draws.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    columns=["A", "B", "C", "D"],
    index=pd.date_range("20180220", periods=6),
)

In [17]:
# Show the 6x4 random frame indexed by date.
df2

Unnamed: 0,A,B,C,D
2018-02-20,1.40071,-0.335419,0.173555,-0.238657
2018-02-21,-0.62075,1.485617,1.79914,1.236582
2018-02-22,-0.634166,-0.214743,0.289798,-0.910459
2018-02-23,-0.752375,0.445504,0.69804,0.316263
2018-02-24,-0.287388,-1.426095,1.142831,0.08454
2018-02-25,0.237973,0.347219,0.252159,-0.951904


In [18]:
# Pearson correlation (the default method) between columns A and B.
df2["A"].corr(df2["B"], method="pearson")

-0.26712219971537038

In [19]:
# Pearson correlation (the default method) between columns A and C.
df2["A"].corr(df2["C"], method="pearson")

-0.52257802878123594

In [20]:
# Pearson correlation (the default method) between columns A and D.
df2["A"].corr(df2["D"], method="pearson")

-0.31406433675858253

In [21]:
# Pearson correlation (the default method) between columns B and D.
df2["B"].corr(df2["D"], method="pearson")

0.45708499084990972

In [22]:
# Full pairwise Pearson correlation matrix; the diagonal is 1.0 by definition.
df2.corr(method="pearson")

Unnamed: 0,A,B,C,D
A,1.0,-0.267122,-0.522578,-0.314064
B,-0.267122,1.0,0.355184,0.457085
C,-0.522578,0.355184,1.0,0.885657
D,-0.314064,0.457085,0.885657,1.0


In [23]:
# Pairwise covariance matrix of the columns (the diagonal holds each column's variance).
df2.cov()

Unnamed: 0,A,B,C,D
A,0.676611,-0.213105,-0.274982,-0.212796
B,-0.213105,0.940651,0.22037,0.365162
C,-0.274982,0.22037,0.40923,0.466685
D,-0.212796,0.365162,0.466685,0.678498


In [24]:
# Re-display df2 (rows in date order, columns A-D) before shuffling it.
df2

Unnamed: 0,A,B,C,D
2018-02-20,1.40071,-0.335419,0.173555,-0.238657
2018-02-21,-0.62075,1.485617,1.79914,1.236582
2018-02-22,-0.634166,-0.214743,0.289798,-0.910459
2018-02-23,-0.752375,0.445504,0.69804,0.316263
2018-02-24,-0.287388,-1.426095,1.142831,0.08454
2018-02-25,0.237973,0.347219,0.252159,-0.951904


In [25]:
# Keep the current row labels (the dates) so they can be shuffled.
dates = df2.index
# np.random.permutation returns the labels in a random order.
random_dates = np.random.permutation(dates)
# Reindex with the shuffled dates and a new column order so both axes end up
# out of order; df2 is rebound to the reordered frame.
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [26]:
# df2 now has shuffled rows and its columns ordered D, B, C, A.
df2

Unnamed: 0,D,B,C,A
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-25,-0.951904,0.347219,0.252159,0.237973


In [27]:
# Sort rows by their index labels (dates), oldest first; returns a new frame.
df2.sort_index(axis="index")

Unnamed: 0,D,B,C,A
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-25,-0.951904,0.347219,0.252159,0.237973


In [28]:
# Sort rows by their index labels (dates), newest first.
df2.sort_index(axis="index", ascending=False)

Unnamed: 0,D,B,C,A
2018-02-25,-0.951904,0.347219,0.252159,0.237973
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-20,-0.238657,-0.335419,0.173555,1.40071


In [29]:
# Sort the columns by name (A..D); the row order is left as it is.
df2.sort_index(axis="columns")

Unnamed: 0,A,B,C,D
2018-02-21,-0.62075,1.485617,1.79914,1.236582
2018-02-24,-0.287388,-1.426095,1.142831,0.08454
2018-02-22,-0.634166,-0.214743,0.289798,-0.910459
2018-02-20,1.40071,-0.335419,0.173555,-0.238657
2018-02-23,-0.752375,0.445504,0.69804,0.316263
2018-02-25,0.237973,0.347219,0.252159,-0.951904


In [30]:
# Sort the columns by name in reverse (D..A); the row order is left as it is.
df2.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2018-02-21,1.236582,1.79914,1.485617,-0.62075
2018-02-24,0.08454,1.142831,-1.426095,-0.287388
2018-02-22,-0.910459,0.289798,-0.214743,-0.634166
2018-02-20,-0.238657,0.173555,-0.335419,1.40071
2018-02-23,0.316263,0.69804,0.445504,-0.752375
2018-02-25,-0.951904,0.252159,0.347219,0.237973


In [31]:
# Re-display df2: sort_index returned sorted copies, so df2 itself is still shuffled.
df2

Unnamed: 0,D,B,C,A
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-25,-0.951904,0.347219,0.252159,0.237973


In [32]:
# Order the rows by the values in column D, smallest first.
df2.sort_values(by="D", ascending=True)

Unnamed: 0,D,B,C,A
2018-02-25,-0.951904,0.347219,0.252159,0.237973
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-21,1.236582,1.485617,1.79914,-0.62075


In [33]:
# Order the rows by the values in column D, largest first.
df2.sort_values("D", ascending=False)

Unnamed: 0,D,B,C,A
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-25,-0.951904,0.347219,0.252159,0.237973


In [34]:
# Re-display df2: sort_values also returned copies and left df2 unchanged.
df2

Unnamed: 0,D,B,C,A
2018-02-21,1.236582,1.485617,1.79914,-0.62075
2018-02-24,0.08454,-1.426095,1.142831,-0.287388
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166
2018-02-20,-0.238657,-0.335419,0.173555,1.40071
2018-02-23,0.316263,0.445504,0.69804,-0.752375
2018-02-25,-0.951904,0.347219,0.252159,0.237973


In [35]:
# Append two columns: E holds random integers drawn from 0..5, F holds a
# repeating text label, so the multi-key sort below has ties to break.
df2 = df2.assign(
    E=np.random.randint(0, 6, size=6),
    F=["first", "second", "first", "third", "first", "second"],
)

In [36]:
# df2 with the new integer column E and the text label column F.
df2

Unnamed: 0,D,B,C,A,E,F
2018-02-21,1.236582,1.485617,1.79914,-0.62075,3,first
2018-02-24,0.08454,-1.426095,1.142831,-0.287388,0,second
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166,3,first
2018-02-20,-0.238657,-0.335419,0.173555,1.40071,4,third
2018-02-23,0.316263,0.445504,0.69804,-0.752375,4,first
2018-02-25,-0.951904,0.347219,0.252159,0.237973,2,second


In [37]:
# Sort by E first; rows that tie on E are then ordered by F.
df2.sort_values(by=["E", "F"], ascending=[True, True])

Unnamed: 0,D,B,C,A,E,F
2018-02-24,0.08454,-1.426095,1.142831,-0.287388,0,second
2018-02-25,-0.951904,0.347219,0.252159,0.237973,2,second
2018-02-21,1.236582,1.485617,1.79914,-0.62075,3,first
2018-02-22,-0.910459,-0.214743,0.289798,-0.634166,3,first
2018-02-23,0.316263,0.445504,0.69804,-0.752375,4,first
2018-02-20,-0.238657,-0.335419,0.173555,1.40071,4,third


In [38]:
# Distinct labels found in column F.
df2['F'].unique() 

array(['first', 'second', 'third'], dtype=object)

In [39]:
# Number of rows carrying each label in F, most frequent first.
df2['F'].value_counts() 

first     3
second    2
third     1
Name: F, dtype: int64

In [40]:
# Four cities by three columns of standard-normal draws.
df3 = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=["Seoul", "Incheon", "Busan", "Daegu"],
    columns=["b", "d", "e"],
)

In [41]:
# Show the 4x3 random frame indexed by city name.
df3

Unnamed: 0,b,d,e
Seoul,-1.938474,-1.93222,-0.416785
Incheon,-0.958943,1.360802,0.994511
Busan,1.133526,-0.822016,-0.041062
Daegu,-1.859373,0.811166,0.81524


- 사용자 정의 함수를 DataFrame 에 적용하기

In [42]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` may be any object exposing ``.max()`` and ``.min()``. When passed to
    ``DataFrame.apply`` it receives one column or one row at a time.
    """
    return x.max() - x.min()

In [44]:
# Apply func to each column: one max-minus-min value per column.
# NOTE(review): df3 is built just above but never used -- confirm that df
# (not df3) is the frame this example is meant to run on.
df.apply(func, axis="index")

first     8.91
second    3.20
dtype: float64

In [45]:
# Apply func to each row: one max-minus-min value per row.
df.apply(func, axis="columns")

a    1.100000
b    0.403333
c    1.100000
d    8.810000
dtype: float64

- 누적합과 누적곱

In [46]:
# The integers 1 through 5 as a 1-D array, used for the cumulative examples.
arr = np.arange(1, 6)
arr

array([1, 2, 3, 4, 5])

In [47]:
# Running total of arr, using the NumPy function form.
np.cumsum(arr) 

array([ 1,  3,  6, 10, 15], dtype=int32)

In [48]:
# The same running total, using the ndarray method form.
arr.cumsum() 

array([ 1,  3,  6, 10, 15], dtype=int32)

In [49]:
# Running product of arr: 1, 1*2, 1*2*3, ...
np.cumprod(arr) 

array([  1,   2,   6,  24, 120], dtype=int32)

In [50]:
# The integers 1 through 5 as a Series with the default 0..4 index.
sr = pd.Series(range(1, 6))
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [51]:
# The NumPy function also works on a Series; the index is kept in the result.
np.cumsum(sr) 

0     1
1     3
2     6
3    10
4    15
dtype: int64

In [52]:
# Running total of the Series, using the pandas method form.
sr.cumsum() 

0     1
1     3
2     6
3    10
4    15
dtype: int64

In [53]:
# Running product of the Series: 1, 1*2, 1*2*3, ...
sr.cumprod() 

0      1
1      2
2      6
3     24
4    120
dtype: int64

In [54]:
# A 2x5 integer frame with default labels for the cumulative examples.
# This rebinds df, replacing the frame used in the earlier cells.
df = pd.DataFrame(
    [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
    ]
)
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [55]:
# Running total down the rows of each column (the default axis, spelled out).
df.cumsum(axis="index")

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,7,9,11,13,15


In [56]:
# Running product down the rows of each column (the default axis, spelled out).
df.cumprod(axis="index")

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,14,24,36,50
