In [1]:
import pandas as pd
import numpy as np

In [4]:
# Create a Series from a plain list; the np.nan entry is kept as a missing value
# and the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Date-typed index built with pd.date_range: periods=6 produces six consecutive
# days starting at 2022-01-01.
dates = pd.date_range(start='20220101', periods=6)
dates

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal samples, one row per date and columns A-D.
# NOTE: no random seed is set in the visible cells, so the values change on
# every run and will not match the recorded outputs after a kernel restart.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-05,0.776659,0.415124,-0.009434,0.237204
2022-01-06,-0.925415,-0.033729,-0.576099,-0.177671


In [6]:
# head() shows the first 5 rows by default; passing n=3 limits it to the first three.
df.head(n=3)

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898


In [7]:
# Row labels of df: the DatetimeIndex that was passed as index=dates.
df.index

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [9]:
# Underlying data as a 2-D NumPy array (index and column labels are dropped).
# to_numpy() is the accessor the pandas documentation recommends over .values.
df.to_numpy()

array([[ 0.13035414,  1.50040573,  0.17329026, -1.78424483],
       [ 1.71319008,  0.49985097,  0.15617017,  0.06602839],
       [-0.44660293,  0.94096097,  1.29480883,  0.99389843],
       [ 0.85811534,  1.31184029,  1.17480779, -0.26109759],
       [ 0.77665872,  0.41512424, -0.00943443,  0.23720353],
       [-0.92541459, -0.03372924, -0.57609942, -0.17767136]])

In [10]:
# df.info(): prints an overview of the DataFrame - index range, per-column
# non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2022-01-01 to 2022-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [11]:
# df.describe(): statistical overview per column (count, mean, std, min,
# quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.35105,0.772409,0.368924,-0.154314
std,0.96,0.583297,0.724772,0.915298
min,-0.925415,-0.033729,-0.576099,-1.784245
25%,-0.302364,0.436306,0.031967,-0.240241
50%,0.453506,0.720406,0.16473,-0.055821
75%,0.837751,1.21912,0.924428,0.19441
max,1.71319,1.500406,1.294809,0.993898


In [12]:
# sort_values: sort the rows by the column named in `by`.
# ascending=True sorts smallest-first; ascending=False would sort largest-first.
# The sorted frame is returned as a new object; df itself keeps its order.
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2022-01-06,-0.925415,-0.033729,-0.576099,-0.177671
2022-01-05,0.776659,0.415124,-0.009434,0.237204
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-01,0.130354,1.500406,0.17329,-1.784245


In [13]:
# Display df again: it is still in date order, the sort above did not modify it.
df

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-05,0.776659,0.415124,-0.009434,0.237204
2022-01-06,-0.925415,-0.033729,-0.576099,-0.177671


In [15]:
# Select only column A; the result is a Series.
df['A']

2022-01-01    0.130354
2022-01-02    1.713190
2022-01-03   -0.446603
2022-01-04    0.858115
2022-01-05    0.776659
2022-01-06   -0.925415
Freq: D, Name: A, dtype: float64

In [16]:
# Select a range of rows by position: rows 0, 1 and 2 (the end position is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898


In [17]:
# Select a range of rows by index label; unlike the positional slice, the end
# label (2022-01-03) is included.
df['2022-01-01':'2022-01-03']

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898


In [18]:
# loc: label-based selection. A single row label returns that row as a Series
# indexed by the column names.
df.loc[dates[0]]

A    0.130354
B    1.500406
C    0.173290
D   -1.784245
Name: 2022-01-01 00:00:00, dtype: float64

In [19]:
# All rows, columns A and B only.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2022-01-01,0.130354,1.500406
2022-01-02,1.71319,0.499851
2022-01-03,-0.446603,0.940961
2022-01-04,0.858115,1.31184
2022-01-05,0.776659,0.415124
2022-01-06,-0.925415,-0.033729


In [20]:
# Rows from 2022-01-01 through 2022-01-03 (end label included), columns A and B.
df.loc['2022-01-01':'2022-01-03', ['A', 'B']]

Unnamed: 0,A,B
2022-01-01,0.130354,1.500406
2022-01-02,1.71319,0.499851
2022-01-03,-0.446603,0.940961


In [21]:
# A single row label with a column list: returns a Series holding A and B for that date.
df.loc['2022-01-02', ['A', 'B']]

A    1.713190
B    0.499851
Name: 2022-01-02 00:00:00, dtype: float64

In [23]:
df.loc[dates[0], 'A']   # scalar lookup: the value of column A at the first date

0.13035414459117528

In [24]:
# iloc: unlike loc, accesses data by integer row/column position.
# iloc[3] is the fourth row (2022-01-04), returned as a Series.
df.iloc[3]

A    0.858115
B    1.311840
C    1.174808
D   -0.261098
Name: 2022-01-04 00:00:00, dtype: float64

In [25]:
# Rows at positions 3-4 and columns at positions 0-1 (end positions are excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2022-01-04,0.858115,1.31184
2022-01-05,0.776659,0.415124


In [26]:
# Explicit position lists: rows 1, 2, 4 and columns 0, 2 (A and C).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2022-01-02,1.71319,0.15617
2022-01-03,-0.446603,1.294809
2022-01-05,0.776659,-0.009434


In [27]:
# Rows at positions 1-2, all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898


In [28]:
# All rows, columns at positions 1-2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2022-01-01,1.500406,0.17329
2022-01-02,0.499851,0.15617
2022-01-03,0.940961,1.294809
2022-01-04,1.31184,1.174808
2022-01-05,0.415124,-0.009434
2022-01-06,-0.033729,-0.576099


In [29]:
# Display the full frame again for reference before the filtering examples.
df

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-05,0.776659,0.415124,-0.009434,0.237204
2022-01-06,-0.925415,-0.033729,-0.576099,-0.177671


In [30]:
# Keep only the rows that satisfy a condition: here, rows where column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-05,0.776659,0.415124,-0.009434,0.237204


In [31]:
# Element-wise condition on the whole frame: the shape is kept and every entry
# that is not positive is shown as missing (NaN).
df[df > 0]

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,
2022-01-05,0.776659,0.415124,,0.237204
2022-01-06,,,,


In [32]:
# Assigning a DataFrame with plain `=` does not copy its contents; it only binds
# a second name to the same object. Use copy() to duplicate the data itself.

df2 = df.copy(deep=True)

In [33]:
# Add a string column E to the copy; one value per row, in row order.
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [34]:
# Boolean Series: True for the rows whose E value is 'two' or 'four'.
df2['E'].isin(['two', 'four'])

2022-01-01    False
2022-01-02    False
2022-01-03     True
2022-01-04    False
2022-01-05     True
2022-01-06    False
Freq: D, Name: E, dtype: bool

In [35]:
# Use the isin() result as a row mask: keeps only the rows where E is 'two' or 'four'.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2022-01-03,-0.446603,0.940961,1.294809,0.993898,two
2022-01-05,0.776659,0.415124,-0.009434,0.237204,four


In [36]:
# df has no column E: the column was added to the copy df2 only.
df

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.71319,0.499851,0.15617,0.066028
2022-01-03,-0.446603,0.940961,1.294809,0.993898
2022-01-04,0.858115,1.31184,1.174808,-0.261098
2022-01-05,0.776659,0.415124,-0.009434,0.237204
2022-01-06,-0.925415,-0.033729,-0.576099,-0.177671


In [37]:
# Cumulative sum down each column. The built-in DataFrame.cumsum() gives the
# same result as df.apply(np.cumsum) without routing through apply.
df.cumsum()

Unnamed: 0,A,B,C,D
2022-01-01,0.130354,1.500406,0.17329,-1.784245
2022-01-02,1.843544,2.000257,0.32946,-1.718216
2022-01-03,1.396941,2.941218,1.624269,-0.724318
2022-01-04,2.255057,4.253058,2.799077,-0.985416
2022-01-05,3.031715,4.668182,2.789643,-0.748212
2022-01-06,2.106301,4.634453,2.213543,-0.925883


In [38]:
# Per-column range (max minus min): apply calls the function once per column
# and collects the results into a Series indexed by column name.
df.apply(lambda col: col.max() - col.min(), axis=0)

A    2.638605
B    1.534135
C    1.870908
D    2.778143
dtype: float64