In [1]:
from datetime import datetime, timedelta

import pandas as pd

In [2]:
train_v1 = pd.read_csv("../data/raw/v1/train.csv", encoding="cp949")
train_v1 = train_v1[(train_v1["승차X좌표"] != 0) & (train_v1["승차Y좌표"] != 0)]
train_v1['승차시간'] = pd.to_datetime(train_v1['승차시간'])
train_v1.shape

(1082657, 5)

In [3]:
train_v1.head()

Unnamed: 0,차량번호,승차시간,승차X좌표,승차Y좌표,요일
44,sqonam,2023-04-01 05:15:37,127.377749,36.350133,Saturday
45,sqonam,2023-04-01 05:42:47,127.358606,36.353338,Saturday
46,sqonam,2023-04-01 06:02:58,127.38024,36.365618,Saturday
47,sqonam,2023-04-01 06:15:05,127.426666,36.351172,Saturday
48,sqonam,2023-04-01 06:28:36,127.408217,36.324442,Saturday


In [4]:
# 승/하차시간 열을 정제
def clean_drop_off_time(value):
    if isinstance(value, datetime):
        return value
    if isinstance(value, int):
        value = str(value)
    if isinstance(value, str):
        # 'YYYYMMDDHHMMSS' 형식인 경우
        if len(value) == 14 and value.isdigit():
            return datetime.strptime(value, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
        # 이미 'YYYY-MM-DD HH:MM:SS' 형식인 경우
        elif len(value) == 19 and value[4] == '-' and value[13] == ':':
            return value
    # 다른 모든 경우 (NaT 포함)
    return pd.NaT

train_v3 = pd.read_csv("../data/raw/v3/train.csv", encoding="cp949")
train_v3 = train_v3[(train_v3["승차X좌표"] != 0) & (train_v3["승차Y좌표"] != 0)]
train_v3['승차시간'] = train_v3['승차시간'].apply(clean_drop_off_time)
train_v3['하차시간'] = train_v3['하차시간'].apply(clean_drop_off_time)
train_v3['승차시간'] = pd.to_datetime(train_v3['승차시간'])
train_v3['하차시간'] = pd.to_datetime(train_v3['하차시간'])
train_v3['승차시간'].where(train_v3['승차시간'].dt.year != 1970, pd.NaT, inplace=True)
train_v3['하차시간'].where(train_v3['하차시간'].dt.year != 1970, pd.NaT, inplace=True)
train_v3.shape

  train_v3 = pd.read_csv("../data/raw/v3/train.csv", encoding="cp949")


(1151410, 11)

In [5]:
train_v3.head()

Unnamed: 0,차량이름,승차시간,승차요일,승차X좌표,승차Y좌표,하차시간,하차X좌표,하차Y좌표,승차거리(m),할증여부,요금
0,xqbxch,NaT,Tuesday,127.367251,36.349866,2023-06-20 11:08:21,127.367251,36.349866,0.0,미할증,3300
1,xqbxch,NaT,Tuesday,127.367263,36.349684,2023-06-20 11:11:00,127.367263,36.349684,0.0,미할증,3300
3,xqbxch,NaT,Tuesday,127.340568,36.298811,2023-06-20 22:27:20,127.350915,36.305854,1518.13,미할증,3300
4,xqbxch,NaT,Tuesday,127.351858,36.299839,2023-06-20 22:47:15,127.374955,36.345087,7045.65,미할증,8000
5,xqbxch,NaT,Tuesday,127.371055,36.346926,2023-06-20 22:54:43,127.376224,36.350242,738.03,미할증,3300


In [6]:
# 데이터 전처리: 컬럼명 통일 및 데이터 타입 변환
train_v1 = train_v1.rename(columns={'차량번호': '차량이름', '요일': '승차요일'})

# 병합을 위한 키 생성
train_v1['merge_key'] = train_v1['차량이름'] + train_v1['승차요일'] + train_v1['승차X좌표'].round(6).astype(str) + train_v1['승차Y좌표'].round(6).astype(str)
train_v3['merge_key'] = train_v3['차량이름'] + train_v3['승차요일'] + train_v3['승차X좌표'].round(6).astype(str) + train_v3['승차Y좌표'].round(6).astype(str)

# 데이터 병합
train = pd.merge(train_v3, train_v1[['merge_key', '승차시간']], on='merge_key', how='left', suffixes=('', '_train_v1'))

# 조건에 맞는 경우 승차시간 업데이트
train.loc[train['승차시간'].isna(), '승차시간'] = train.loc[train['승차시간'].isna(), '승차시간_train_v1']

# 불필요한 컬럼 제거
train = train.drop(['merge_key', '승차시간_train_v1'], axis=1)

In [7]:
train.dropna(inplace=True)

In [8]:
train.drop_duplicates(subset=['차량이름', '승차시간'], keep=False, inplace=True)

In [9]:
train = train[train['승차거리(m)'] != 0]

In [11]:
train.shape

(890645, 11)

In [12]:
train.head()

Unnamed: 0,차량이름,승차시간,승차요일,승차X좌표,승차Y좌표,하차시간,하차X좌표,하차Y좌표,승차거리(m),할증여부,요금
2,xqbxch,2023-06-20 22:23:00,Tuesday,127.340568,36.298811,2023-06-20 22:27:20,127.350915,36.305854,1518.13,미할증,3300
3,xqbxch,2023-06-20 22:33:00,Tuesday,127.351858,36.299839,2023-06-20 22:47:15,127.374955,36.345087,7045.65,미할증,8000
4,xqbxch,2023-06-20 22:50:00,Tuesday,127.371055,36.346926,2023-06-20 22:54:43,127.376224,36.350242,738.03,미할증,3300
5,xqbxch,2023-06-20 22:58:00,Tuesday,127.382028,36.349033,2023-06-20 23:04:50,127.388901,36.3521,928.7,미할증,3400
6,xqbxch,2023-06-20 23:17:00,Tuesday,127.376671,36.359815,2023-06-20 23:25:11,127.377479,36.361661,1259.61,미할증,3700


In [14]:
train_v1['승차일자']= train_v1['승차시간'].dt.date

In [15]:
train['승차일자']= train['승차시간'].dt.date

In [17]:
# find the difference between two dates
train['승차일자'] = pd.to_datetime(train['승차일자'])
train_v1['승차일자'] = pd.to_datetime(train_v1['승차일자'])

In [18]:
train

Unnamed: 0,차량이름,승차시간,승차요일,승차X좌표,승차Y좌표,하차시간,하차X좌표,하차Y좌표,승차거리(m),할증여부,요금,승차일자
2,xqbxch,2023-06-20 22:23:00,Tuesday,127.340568,36.298811,2023-06-20 22:27:20,127.350915,36.305854,1518.13,미할증,3300,2023-06-20
3,xqbxch,2023-06-20 22:33:00,Tuesday,127.351858,36.299839,2023-06-20 22:47:15,127.374955,36.345087,7045.65,미할증,8000,2023-06-20
4,xqbxch,2023-06-20 22:50:00,Tuesday,127.371055,36.346926,2023-06-20 22:54:43,127.376224,36.350242,738.03,미할증,3300,2023-06-20
5,xqbxch,2023-06-20 22:58:00,Tuesday,127.382028,36.349033,2023-06-20 23:04:50,127.388901,36.352100,928.70,미할증,3400,2023-06-20
6,xqbxch,2023-06-20 23:17:00,Tuesday,127.376671,36.359815,2023-06-20 23:25:11,127.377479,36.361661,1259.61,미할증,3700,2023-06-20
...,...,...,...,...,...,...,...,...,...,...,...,...
1169564,hkhvmu,2024-03-24 23:56:34,Sunday,127.378557,36.298425,2024-03-25 00:06:57,127.388942,36.317763,3406.73,1,7800,2024-03-24
1169565,tqxhbo,2024-03-24 23:25:58,Sunday,127.375799,36.298259,2024-03-25 00:09:45,127.266861,36.515996,30218.90,2,35860,2024-03-24
1169566,jxpqzc,2024-03-24 23:51:37,Sunday,127.423131,36.448187,2024-03-25 00:10:03,127.400651,36.339285,14331.62,1,17520,2024-03-24
1169567,teqsog,2024-03-24 23:59:41,Sunday,127.376398,36.348168,2024-03-25 00:12:45,127.440522,36.305813,8121.50,1,11760,2024-03-24


In [20]:
train['승차일자'].value_counts()

승차일자
2023-12-23    3650
2024-02-24    3622
2023-12-22    3564
2023-12-09    3522
2024-03-09    3507
              ... 
2023-05-02    2225
2024-01-01    2113
2024-02-10    1918
2023-05-01    1487
2023-03-31      37
Name: count, Length: 325, dtype: int64

In [21]:
train_v1['승차일자'].value_counts()

승차일자
2024-02-24    4461
2023-12-23    4459
2024-03-09    4341
2023-12-22    4307
2023-06-24    4290
              ... 
2024-03-26       2
2023-09-25       1
2024-01-26       1
2023-11-27       1
2023-11-30       1
Name: count, Length: 330, dtype: int64

In [25]:
# make a new column which has the difference of count between two dates
new = train['승차일자'].value_counts() - train_v1['승차일자'].value_counts()
new.sort

In [28]:
new.sort_values(key=abs)

승차일자
2023-03-31      9.0
2023-05-01   -261.0
2023-04-03   -384.0
2023-06-06   -394.0
2023-10-01   -407.0
              ...  
2023-09-25      NaN
2023-11-27      NaN
2023-11-30      NaN
2024-01-26      NaN
2024-03-26      NaN
Name: count, Length: 330, dtype: float64