In [2]:
import numpy as np
import pandas as pd

In [3]:
data = [[9.9, 8.8], [np.nan, 6.6],[7.7, np.nan],  [0.99, 9.8]]
print(data)
df = pd.DataFrame(data, columns=["first", "second"], index=["a", "b", "c", "d"])
df

[[9.9, 8.8], [nan, 6.6], [7.7, nan], [0.99, 9.8]]


Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [4]:
df.sum(axis=0)

first     18.59
second    25.20
dtype: float64

In [5]:
df.sum(axis=1)

a    18.70
b     6.60
c     7.70
d    10.79
dtype: float64

In [9]:
df['first'].sum()

18.59

In [13]:
df.loc['a'].sum()

18.700000000000003

In [18]:
df.mean(axis=1, skipna=False)

a    9.350
b      NaN
c      NaN
d    5.395
dtype: float64

In [19]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [21]:
first_mean = df.mean(axis=0)['first']
first_mean

6.196666666666666

In [22]:
second_min = df.min(axis=0)['second']
second_min

6.6

In [26]:
df['first'] = df['first'].fillna(value=first_mean)

In [27]:
df['second'] = df['second'].fillna(value=second_min)
df['second']

a    8.8
b    6.6
c    6.6
d    9.8
Name: second, dtype: float64

In [28]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,6.6
d,0.99,9.8


In [30]:
df2 = pd.DataFrame(np.random.randn(6, 4),
                   columns=["A", "B", "C", "D"],
                   index=pd.date_range("20180220", periods=6))
df2

Unnamed: 0,A,B,C,D
2018-02-20,0.25652,0.357066,-1.191746,1.361154
2018-02-21,-1.93045,0.487121,-0.570828,0.418806
2018-02-22,0.242921,-1.11553,-0.040527,1.073412
2018-02-23,-1.090743,0.814218,1.362925,0.150908
2018-02-24,0.027758,-1.558914,-1.174422,0.791645
2018-02-25,1.02103,0.807901,0.299328,-0.156414


In [31]:
# A, B의 상관계수
df2['A'].corr(df2['B'])

-0.23089263230345833

In [32]:
# A, C의 상관계수
df2['A'].corr(df2['C'])

-0.14332255029399735

In [33]:
# A, D의 상관계수
df2['A'].corr(df2['D'])

0.12788713181054917

In [34]:
# B, C의 상관계수
df2['B'].corr(df2['C'])

0.4898860465354469

In [35]:
# B, D의 상관계수
df2['B'].corr(df2['D'])

-0.5454965182458865

In [36]:
# C, D의 상관계수
df2['C'].corr(df2['D'])

-0.668905344286815

In [39]:
# df2 전체에 대한 상관계수 구하기
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,-0.230893,-0.143323,0.127887
B,-0.230893,1.0,0.489886,-0.545497
C,-0.143323,0.489886,1.0,-0.668905
D,0.127887,-0.545497,-0.668905,1.0


In [41]:
# 공분산(covariance)
df2.cov()

Unnamed: 0,A,B,C,D
A,1.144568,-0.255467,-0.150013,0.078516
B,-0.255467,1.069568,0.495671,-0.323749
C,-0.150013,0.495671,0.957171,-0.375553
D,0.078516,-0.323749,-0.375553,0.329324


In [42]:
dates = df2.index
# permutation 은 치환
random_dates = np.random.permutation(dates)
# 무작위로 섞어봄. index 순서와 컬럼의 순서가 불규칙하게 변함.
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [43]:
df2

Unnamed: 0,D,B,C,A
2018-02-21,0.418806,0.487121,-0.570828,-1.93045
2018-02-20,1.361154,0.357066,-1.191746,0.25652
2018-02-24,0.791645,-1.558914,-1.174422,0.027758
2018-02-23,0.150908,0.814218,1.362925,-1.090743
2018-02-25,-0.156414,0.807901,0.299328,1.02103
2018-02-22,1.073412,-1.11553,-0.040527,0.242921


In [45]:
df2.sort_index?

In [46]:
df2.sort_index(inplace=True)
df2

Unnamed: 0,D,B,C,A
2018-02-20,1.361154,0.357066,-1.191746,0.25652
2018-02-21,0.418806,0.487121,-0.570828,-1.93045
2018-02-22,1.073412,-1.11553,-0.040527,0.242921
2018-02-23,0.150908,0.814218,1.362925,-1.090743
2018-02-24,0.791645,-1.558914,-1.174422,0.027758
2018-02-25,-0.156414,0.807901,0.299328,1.02103


In [48]:
df2.sort_index(axis=1)
df2

Unnamed: 0,D,B,C,A
2018-02-20,1.361154,0.357066,-1.191746,0.25652
2018-02-21,0.418806,0.487121,-0.570828,-1.93045
2018-02-22,1.073412,-1.11553,-0.040527,0.242921
2018-02-23,0.150908,0.814218,1.362925,-1.090743
2018-02-24,0.791645,-1.558914,-1.174422,0.027758
2018-02-25,-0.156414,0.807901,0.299328,1.02103


In [49]:
df2.sort_values?

In [50]:
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2018-02-25,-0.156414,0.807901,0.299328,1.02103
2018-02-23,0.150908,0.814218,1.362925,-1.090743
2018-02-21,0.418806,0.487121,-0.570828,-1.93045
2018-02-24,0.791645,-1.558914,-1.174422,0.027758
2018-02-22,1.073412,-1.11553,-0.040527,0.242921
2018-02-20,1.361154,0.357066,-1.191746,0.25652


### 열 추가

In [51]:
df2["E"] = np.random.randint(0, 6, size=6)
df2

Unnamed: 0,D,B,C,A,E
2018-02-20,1.361154,0.357066,-1.191746,0.25652,2
2018-02-21,0.418806,0.487121,-0.570828,-1.93045,5
2018-02-22,1.073412,-1.11553,-0.040527,0.242921,4
2018-02-23,0.150908,0.814218,1.362925,-1.090743,1
2018-02-24,0.791645,-1.558914,-1.174422,0.027758,3
2018-02-25,-0.156414,0.807901,0.299328,1.02103,3


In [52]:
df2["F"] = ["first", "second", "first", "third", "first", "second"]
df2

Unnamed: 0,D,B,C,A,E,F
2018-02-20,1.361154,0.357066,-1.191746,0.25652,2,first
2018-02-21,0.418806,0.487121,-0.570828,-1.93045,5,second
2018-02-22,1.073412,-1.11553,-0.040527,0.242921,4,first
2018-02-23,0.150908,0.814218,1.362925,-1.090743,1,third
2018-02-24,0.791645,-1.558914,-1.174422,0.027758,3,first
2018-02-25,-0.156414,0.807901,0.299328,1.02103,3,second


In [53]:
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2018-02-23,0.150908,0.814218,1.362925,-1.090743,1,third
2018-02-20,1.361154,0.357066,-1.191746,0.25652,2,first
2018-02-24,0.791645,-1.558914,-1.174422,0.027758,3,first
2018-02-25,-0.156414,0.807901,0.299328,1.02103,3,second
2018-02-22,1.073412,-1.11553,-0.040527,0.242921,4,first
2018-02-21,0.418806,0.487121,-0.570828,-1.93045,5,second


In [55]:
df2['F'].unique()

array(['first', 'second', 'third'], dtype=object)

In [57]:
df2['F'].value_counts()

first     3
second    2
third     1
Name: F, dtype: int64

In [58]:
df3 = pd.DataFrame(np.random.randn(4, 3),
                   columns=["b", "d", "e"],
                   index=["Seoul", "Incheon", "Busan", "Daegu"])

In [59]:
df3

Unnamed: 0,b,d,e
Seoul,-0.31895,1.380816,-0.219208
Incheon,-0.081056,0.688359,0.85718
Busan,0.475828,-0.93234,-0.887313
Daegu,-0.435069,0.703066,-1.557316


## 누적합과 누적곱

In [60]:
arr = np.array([1,2,3,4,5])
arr

array([1, 2, 3, 4, 5])

In [61]:
np.cumsum(arr)

array([ 1,  3,  6, 10, 15])

In [62]:
np.cumproduct(arr)

array([  1,   2,   6,  24, 120])

In [63]:
series = pd.Series([1,2,3,4,5])
series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [64]:
np.cumsum(series)

0     1
1     3
2     6
3    10
4    15
dtype: int64

In [65]:
series.cumsum()

0     1
1     3
2     6
3    10
4    15
dtype: int64

In [66]:
series.cumprod()

0      1
1      2
2      6
3     24
4    120
dtype: int64

In [67]:
df = pd.DataFrame([[1,2,3,4,5], [6,7,8,9,10]])
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [68]:
df.cumsum()

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,7,9,11,13,15


In [69]:
df.cumprod()

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,14,24,36,50
