In [20]:
import pandas as pd
import numpy as np

In [2]:
# Toy data: two speed observations for each of two animals.
df = pd.DataFrame(
    [
        ("Falcon", 380.0),
        ("Falcon", 370.0),
        ("Parrot", 24.0),
        ("Parrot", 26.0),
    ],
    columns=["Animal", "Max Speed"],
)

In [3]:
# Display the current contents of df.
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [4]:
# Average the numeric columns within each Animal group; the group key becomes the index.
df.groupby(['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [5]:
# Same grouping, but select 'Max Speed' first so the result is a Series rather than a DataFrame.
df.groupby('Animal')['Max Speed'].mean()

Animal
Falcon    375.0
Parrot     25.0
Name: Max Speed, dtype: float64

In [6]:
# Two parallel label lists: the animal, and whether it is captive or wild.
arrays = [
    ['Falcon'] * 2 + ['Parrot'] * 2,
    ['Captive', 'Wild'] * 2,
]

In [8]:
# Build a two-level (Animal, Type) row index from the label lists and attach
# one speed value to each (animal, type) pair.
index = pd.MultiIndex.from_arrays(arrays, names=['Animal', 'Type'])
df = pd.DataFrame(
    data=[390.0, 350.0, 30.0, 20.0],
    index=index,
    columns=['Max Speed'],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,Type,Unnamed: 2_level_1
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0
Parrot,Wild,20.0


In [11]:
# Group on the first index level (Animal) and average within each group.
df.groupby(level =0).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,370.0
Parrot,25.0


In [12]:
# Small frame with one missing value in column "b", used to show how groupby
# treats a missing group key.
l = [
    [1, 2, 3],
    [1, None, 4],
    [2, 1, 3],
    [1, 2, 2],
]
df = pd.DataFrame(data=l, columns=list("abc"))

In [13]:
# Display the current contents of df.
df

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [14]:
# Sum the other columns per value of "b"; the row whose "b" is missing does not appear in the result.
df.groupby(by=['b']).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5


In [15]:
# dropna=False keeps the missing key as its own group, so the result gains a NaN row.
df.groupby(by=['b'], dropna=False).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5
,1,4


In [16]:
# Same idea with a string key column: the second row has no value for "a".
l = [
    ["a", 12, 12],
    [None, 12.3, 33.0],
    ["b", 12.3, 123],
    ["a", 1, 1],
]
df = pd.DataFrame(data=l, columns=list("abc"))

In [17]:
# Display the current contents of df.
df

Unnamed: 0,a,b,c
0,a,12.0,12.0
1,,12.3,33.0
2,b,12.3,123.0
3,a,1.0,1.0


In [18]:
# Sum "b" and "c" per value of "a"; the row with a missing key is left out of the result.
df.groupby(by='a').sum()

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
a,13.0,13.0
b,12.3,123.0


In [21]:
# One row per animal (the index), with its class, order and top speed.
# The monkey's speed is deliberately missing.
speeds = pd.DataFrame(
    {
        "class": ["bird", "bird", "mammal", "mammal", "mammal"],
        "order": [
            "Falconiformes",
            "Psittaciformes",
            "Carnivora",
            "Primates",
            "Carnivora",
        ],
        "max_speed": [389.0, 24.0, 80.2, np.nan, 58.0],
    },
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
)


In [22]:
# Display the speeds frame.
speeds

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [23]:
# Group the rows by the "class" column; nothing is aggregated yet.
grouped = speeds.groupby("class")

In [24]:
# Group by the (class, order) pair instead; this rebinds `grouped`.
grouped = speeds.groupby(['class','order'])

In [26]:
# Eight rows with two label columns (A, B) and two columns of standard-normal
# noise (C, D). The noise comes from a seeded generator so that a fresh
# Restart & Run All reproduces exactly the same numbers; the unseeded global
# np.random state produced a different frame on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": rng.standard_normal(8),  # drawn first, then D, from the same stream
        "D": rng.standard_normal(8),
    }
)


In [27]:
# Display the current contents of df.
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.366484,0.9239
1,bar,one,0.618689,0.686885
2,foo,two,-0.857125,0.724137
3,bar,three,-0.635196,-0.913241
4,foo,two,-0.227441,0.66126
5,bar,two,1.51136,-0.333994
6,foo,one,-0.771498,0.743207
7,foo,three,-0.884515,-2.351585


In [None]:
# Move columns A and B into a two-level row index; df itself is left unchanged.
df2 = df.set_index(['A','B'])

In [None]:
# Rebuild the example frame from a seeded generator so the grouped sums below
# are reproducible on every run (the unseeded global np.random state gave
# different numbers each time the cell was executed).
rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": rng.standard_normal(8),
        "D": rng.standard_normal(8),
    }
)

# Move A and B into a two-level row index.
df2 = df.set_index(['A', 'B'])

# Group on every index level except 'B' (here that leaves only 'A') and
# total C and D within each group.
grouped = df2.groupby(level=df2.index.names.difference(['B']))
grouped.sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.494853,-0.56035
foo,-3.107063,0.700918


In [32]:
# Code 13-1. Practice example for the to_datetime function
import pandas as pd

data = [
    '2023-01-01',
    '2023-02-02',
    '2023-03-02',
    '2023-04-10',
    '2023-05-31',
]
# Second series mixes a date string with a string that is not a date.
s1 = pd.Series(['2022-01-03', '김판다'])
s = pd.Series(data=data)
s

0    2023-01-01
1    2023-02-02
2    2023-03-02
3    2023-04-10
4    2023-05-31
dtype: object

In [33]:
# Parse the date strings into timestamps; the saved output shows dtype datetime64[ns].
pd.to_datetime(s)

0   2023-01-01
1   2023-02-02
2   2023-03-02
3   2023-04-10
4   2023-05-31
dtype: datetime64[ns]

In [35]:
# Keep the parsed timestamps so the .dt accessor can be used on them.
s2 = pd.to_datetime(s)
# Also worth trying: s2 + pd.Timedelta('2 day')
# Convert each timestamp to the quarter it falls in (the saved output shows period[Q-DEC]).
s2.dt.to_period(freq='Q')

0    2023Q1
1    2023Q1
2    2023Q1
3    2023Q2
4    2023Q2
dtype: period[Q-DEC]

In [36]:
# Code 13-6. Practice example for indexing and slicing a DatetimeIndex
import pandas as pd

date = [
    '2025-12-31 00:30:10',
    '2026-01-10 16:40:10',
    '2026-01-10 18:50:10',
    '2026-02-01 07:00:10',
    '2026-02-12 16:40:10',
    '2026-04-01 19:20:10',
]
# Integer values indexed by the parsed timestamps.
s = pd.Series(data=[10, 20, 30, 40, 50, 60], index=pd.to_datetime(date))
s

2025-12-31 00:30:10    10
2026-01-10 16:40:10    20
2026-01-10 18:50:10    30
2026-02-01 07:00:10    40
2026-02-12 16:40:10    50
2026-04-01 19:20:10    60
dtype: int64

In [37]:
# Print a summary of the Series: index type and range, non-null count, dtype and memory use.
s.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 6 entries, 2025-12-31 00:30:10 to 2026-04-01 19:20:10
Series name: None
Non-Null Count  Dtype
--------------  -----
6 non-null      int64
dtypes: int64(1)
memory usage: 96.0 bytes


In [42]:
# Partial-string slice: every row from January 2026 through the end of February 2026.
s.loc['2026/1':'2026/2']

2026-01-10 16:40:10    20
2026-01-10 18:50:10    30
2026-02-01 07:00:10    40
2026-02-12 16:40:10    50
dtype: int64

In [43]:
# Open-ended slice: everything from the 18:00 hour of 2026-01-10 onward.
s.loc['2026/1/10 18':]

2026-01-10 18:50:10    30
2026-02-01 07:00:10    40
2026-02-12 16:40:10    50
2026-04-01 19:20:10    60
dtype: int64

In [44]:
# Rows whose time of day is exactly 16:40:10, whatever the date.
s.at_time('16:40:10')

2026-01-10 16:40:10    20
2026-02-12 16:40:10    50
dtype: int64

In [45]:
# Code 13-14. Load a DataFrame from the online shopping mall log data file
import pandas as pd

pd.set_option('display.max_rows', 6)  # print at most 6 rows
url1 = 'https://github.com/panda-kim/book1/blob/main/22ecomerce.xlsx?raw=true'
# Parse the 'Date' column as datetimes and use it as the row index.
df_ec = pd.read_excel(io=url1, index_col='Date', parse_dates=['Date'])
df_ec

Unnamed: 0_level_0,Ucode,Category,Pcode,Price,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-06-01 00:00:01,u514028527,Fashion,p13071150,411.59,V
2022-06-01 00:00:13,u550193582,Car,p12705151,51.22,C
2022-06-01 00:00:19,u542985695,Food,p12705928,139.58,B
...,...,...,...,...,...
2022-06-06 23:59:29,u534987603,Food,p71018150,115.81,V
2022-06-06 23:59:43,u553673695,Food,p30400010,584.75,V
2022-06-06 23:59:50,u524946851,Food,p54900011,64.35,V


In [46]:
# Print a summary of the log frame: index range, per-column non-null counts and dtypes, memory use.
df_ec.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62455 entries, 2022-06-01 00:00:01 to 2022-06-06 23:59:50
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Ucode     62455 non-null  object 
 1   Category  62455 non-null  object 
 2   Pcode     62455 non-null  object 
 3   Price     62455 non-null  float64
 4   Action    62455 non-null  object 
dtypes: float64(1), object(4)
memory usage: 2.9+ MB


In [47]:
# Partial-string lookup on the DatetimeIndex: all rows logged on 2022-06-03.
df_ec.loc['2022-06-03']

Unnamed: 0_level_0,Ucode,Category,Pcode,Price,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-06-03 00:00:23,u518644680,Fashion,p23000487,1099.90,V
2022-06-03 00:00:27,u520745970,Food,p21401969,46.82,V
2022-06-03 00:00:39,u546887712,Fashion,p96000970,1020.28,V
...,...,...,...,...,...
2022-06-03 23:58:58,u552265414,Computer,p27017030,554.37,V
2022-06-03 23:59:21,u519190772,Fashion,p12708879,100.13,V
2022-06-03 23:59:33,u514568319,Food,p51005760,401.43,V


In [48]:
# Restrict to 3-4 June 2022, then keep only rows whose time of day lies between 00:00 and 03:00.
df_ec.loc['2022-06-03':'2022-06-04'].between_time('00','03')

Unnamed: 0_level_0,Ucode,Category,Pcode,Price,Action
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-06-03 00:00:23,u518644680,Fashion,p23000487,1099.90,V
2022-06-03 00:00:27,u520745970,Food,p21401969,46.82,V
2022-06-03 00:00:39,u546887712,Fashion,p96000970,1020.28,V
...,...,...,...,...,...
2022-06-04 02:59:20,u516219724,Computer,p34200029,21.85,V
2022-06-04 02:59:31,u514722600,Food,p11400443,118.40,V
2022-06-04 02:59:56,u513451722,Fashion,p10043200,256.71,V


In [52]:
# Other calls to try:
#   pd.date_range('2025-01-03', '2025-01-14')   # explicit start and end
#   pd.date_range('2025-01-03', periods=4)      # start plus a number of periods
# Daily range from 2024-03-30 up to the day the cell is run, so the result
# differs from run to run.
pd.date_range('2024-03-30', 'today')

DatetimeIndex(['2024-03-30', '2024-03-31', '2024-04-01', '2024-04-02',
               '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06',
               '2024-04-07', '2024-04-08',
               ...
               '2025-04-14', '2025-04-15', '2025-04-16', '2025-04-17',
               '2025-04-18', '2025-04-19', '2025-04-20', '2025-04-21',
               '2025-04-22', '2025-04-23'],
              dtype='datetime64[ns]', length=390, freq='D')

In [53]:
# freq='B' yields business days only between the two dates; weekend dates are skipped.
pd.date_range('2025-01-03','2025-02-03',freq='B')

DatetimeIndex(['2025-01-03', '2025-01-06', '2025-01-07', '2025-01-08',
               '2025-01-09', '2025-01-10', '2025-01-13', '2025-01-14',
               '2025-01-15', '2025-01-16', '2025-01-17', '2025-01-20',
               '2025-01-21', '2025-01-22', '2025-01-23', '2025-01-24',
               '2025-01-27', '2025-01-28', '2025-01-29', '2025-01-30',
               '2025-01-31', '2025-02-03'],
              dtype='datetime64[ns]', freq='B')

In [55]:
# Four timestamps spaced 10 minutes 30 seconds apart; the saved output reports the step as freq='630s'.
pd.date_range('2025-01-03', periods=4, freq='10min 30s')

DatetimeIndex(['2025-01-03 00:00:00', '2025-01-03 00:10:30',
               '2025-01-03 00:21:00', '2025-01-03 00:31:30'],
              dtype='datetime64[ns]', freq='630s')

In [56]:
# Code 13-27. Practice example for the resample function
import pandas as pd

data = {
    '날짜': [
        '2024-02-01',
        '2024-03-15',
        '2024-03-30',
        '2024-03-31',
        '2024-04-02',
        '2024-04-05',
    ],
    '금액': [1000, 2000, 3000, 4000, 5000, 6000],
}
# Dates are still plain strings at this point.
df = pd.DataFrame(data=data)
df

Unnamed: 0,날짜,금액
0,2024-02-01,1000
1,2024-03-15,2000
2,2024-03-30,3000
3,2024-03-31,4000
4,2024-04-02,5000
5,2024-04-05,6000


In [57]:
# Print column dtypes and non-null counts; the saved output shows '날짜' still stored as object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      6 non-null      object
 1   금액      6 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 228.0+ bytes


In [58]:
# Replace the string dates with parsed datetimes (modifies df in place),
# then print the summary again to confirm the new dtype.
df['날짜'] = pd.to_datetime(df['날짜'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      6 non-null      datetime64[ns]
 1   금액      6 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 228.0 bytes


In [59]:
# Display the current contents of df.
df

Unnamed: 0,날짜,금액
0,2024-02-01,1000
1,2024-03-15,2000
2,2024-03-30,3000
3,2024-03-31,4000
4,2024-04-02,5000
5,2024-04-05,6000


In [60]:
# Running total of 금액 within each calendar-month bin of 날짜; the result keeps
# the original row index.
# 'ME' (month end) replaces the 'M' alias, which pandas 2.2 deprecated; that
# deprecation appears to be the warning echoed in this cell's saved output.
df.resample('ME', on='날짜')['금액'].cumsum()

  df.resample('M', on='날짜')['금액'].cumsum()


0     1000
1     2000
2     5000
3     9000
4     5000
5    11000
Name: 금액, dtype: int64

In [61]:
# Broadcast each calendar month's total of 금액 back onto every row of that month.
# 'ME' (month end) replaces the 'M' alias, which pandas 2.2 deprecated; that
# deprecation appears to be the warning echoed in this cell's saved output.
df.resample('ME', on='날짜')['금액'].transform('sum')

  df.resample('M', on='날짜')['금액'].transform('sum')


0     1000
1     9000
2     9000
3     9000
4    11000
5    11000
Name: 금액, dtype: int64

In [62]:
# Code 13-31. Example comparing the groupby function with the resample function
data1 = {
    '날짜': [
        '2024-02-01',
        '2024-03-15',
        '2024-03-30',
        '2024-03-31',
        '2024-04-02',
        '2024-04-05',
    ],
    '월': ['2024-02', '2024-03', '2024-03', '2024-03', '2024-04', '2024-04'],
    '금액': [1000, 2000, 3000, 4000, 5000, 6000],
}
# Build the frame, then swap the string dates for parsed datetimes.
df1 = pd.DataFrame(data1).assign(
    **{'날짜': lambda frame: pd.to_datetime(frame['날짜'])}
)
df1

Unnamed: 0,날짜,월,금액
0,2024-02-01,2024-02,1000
1,2024-03-15,2024-03,2000
2,2024-03-30,2024-03,3000
3,2024-03-31,2024-03,4000
4,2024-04-02,2024-04,5000
5,2024-04-05,2024-04,6000


In [65]:
# Grouping on the precomputed month column gives the same per-month running
# total as the resample form:
#   df1.resample('ME', on='날짜')['금액'].cumsum()   # 'M' before pandas 2.2
df1.groupby('월')['금액'].cumsum()

0     1000
1     2000
2     5000
3     9000
4     5000
5    11000
Name: 금액, dtype: int64

In [76]:
# Mean of 금액 in 6-month bins that begin on a month start ('6MS'); in the
# saved output all six rows fall into a single bin labelled 2024-02-01.
# NOTE(review): origin appears to apply only to fixed (tick-like) frequencies,
# so it is presumably a no-op with 'MS' — confirm against the pandas docs.
df1.resample('6MS', on='날짜', origin='start_day')['금액'].mean()

날짜
2024-02-01    3500.0
Freq: 6MS, Name: 금액, dtype: float64