In [1]:
from datetime import datetime, timedelta

import pandas as pd

In [2]:
test = pd.read_csv("../data/test1003.csv", encoding="cp949")

# 승/하차시간 열을 정제
def clean_drop_off_time(time_str):
    if isinstance(time_str, str):
        # 'YYYYMMDDHHMMSS' 형식인 경우
        if len(time_str) == 14 and time_str.isdigit():
            return datetime.strptime(time_str, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
        # 이미 'YYYY-MM-DD HH:MM:SS' 형식인 경우
        elif len(time_str) == 19 and time_str[4] == '-' and time_str[13] == ':':
            return time_str
    # 다른 모든 경우 (NaT 포함)
    return pd.NaT

test['승차시간'] = test['승차시간'].apply(clean_drop_off_time)
test['하차시간'] = test['하차시간'].apply(clean_drop_off_time)

# 날짜-시간 컬럼을 datetime 타입으로 변환
test['승차시간'] = pd.to_datetime(test['승차시간'])
test['하차시간'] = pd.to_datetime(test['하차시간'])

test.head()

Unnamed: 0,차량이름,승차시간,승차요일,승차X좌표,승차Y좌표,하차시간,하차X좌표,하차Y좌표,승차거리(m),할증여부,요금
0,eiihxy,2023-05-25 00:01:09,Thursday,127.390112,36.347852,2023-05-25 00:09:24,127.407985,36.333623,3647.76,1,6120
1,bfllzp,2023-05-25 00:01:52,Thursday,127.316568,36.364632,2023-05-25 00:10:53,127.301787,36.34358,4620.15,1,6960
2,zazemd,2023-05-25 00:05:45,Thursday,127.397139,36.342268,2023-05-25 00:11:13,127.392709,36.363743,3148.23,1,5520
3,eiihxy,2023-05-25 00:11:18,Thursday,127.406668,36.329063,2023-05-25 00:13:23,127.415765,36.335744,1151.5,1,3960
4,hzdykr,2023-05-25 00:04:28,Thursday,127.332634,36.34864,2023-05-25 00:13:17,127.301659,36.358536,5552.4,1,7800


In [4]:
# 각 택시별로 데이터를 정렬
test = test.sort_values(['차량이름', '승차시간'])

# 데이터가 처음 집계된 시점 계산
data_start_time = test['승차시간'].min()
data_start_time

Timestamp('2023-05-25 00:00:51')

In [3]:
# 휴식 시간을 계산하는 함수
def calculate_rest_time(group, data_start_time):
    rest_times = []
    
    # 첫 승차 전 휴식 시간 추가
    first_ride_time = group.iloc[0]['승차시간']
    if first_ride_time > data_start_time:
        rest_times.append({
            'name': group.iloc[0]['차량이름'],
            'start': data_start_time,
            'end': first_ride_time,
            'duration': first_ride_time - data_start_time
        })

    for i in range(1, len(group)):
        rest_start = group.iloc[i-1]['하차시간']
        rest_end = group.iloc[i]['승차시간']
        rest_duration = rest_end - rest_start
        if rest_duration > timedelta(hours=1):
            rest_times.append({
                'name': group.iloc[i]['차량이름'],
                'start': rest_start,
                'end': rest_end,
                # '휴식 시간(시간)': rest_duration.total_seconds() / 3600
                'duration': rest_duration
            })
    return pd.DataFrame(rest_times)

# 각 택시별로 휴식 시간 계산
rest_times = test.groupby('차량이름').apply(lambda x: calculate_rest_time(x, data_start_time)).reset_index(drop=True)
rest_times.head()

  rest_times = test.groupby('차량이름').apply(lambda x: calculate_rest_time(x, data_start_time)).reset_index(drop=True)


Unnamed: 0,name,start,end,duration
0,aewuuy,2023-05-25 00:00:51,2023-05-25 00:13:58,0 days 00:13:07
1,aewuuy,2023-05-25 01:39:14,2023-05-25 16:50:50,0 days 15:11:36
2,aewuuy,2023-05-26 01:21:15,2023-05-26 12:22:26,0 days 11:01:11
3,aewuuy,2023-05-26 12:33:27,2023-05-26 16:38:05,0 days 04:04:38
4,aewuuy,2023-05-26 16:42:46,2023-05-26 18:14:13,0 days 01:31:27


In [6]:
rest_times.to_csv("../metadata/taxi_rest_times.csv", index=False)