# 대전 시내 택시 기사의 이동 위치 추천

라이브러리 호출

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import numpy as np
from datetime import datetime

데이터 불러오기

In [2]:
taxi_data = pd.read_csv('data/taxi/sample.csv', encoding='euc-kr')
grouped_means = pd.read_csv('data/taxi/group_means.csv', encoding='euc-kr')

display(taxi_data.head())

Unnamed: 0,차량번호,승차시간,승차X좌표,승차Y좌표,요일
0,zixghz,2023-04-01 00:01:25,127.378452,36.298087,Saturday
1,zixghz,2023-04-01 00:05:13,127.38464,36.297051,Saturday
2,zixghz,2023-04-01 00:17:24,127.369007,36.32985,Saturday
3,zixghz,2023-04-01 00:25:00,127.37727,36.351743,Saturday
4,zixghz,2023-04-01 00:42:32,127.395335,36.37661,Saturday


열 이름을 영어로 변경

In [3]:
columns = {
    '차량번호': 'car_num',
    '승차시간': 'date_time',
    '승차X좌표': 'x',
    '승차Y좌표': 'y',
    '요일': 'day'
}

taxi_data = taxi_data.rename(columns=columns)
taxi_data.head()

Unnamed: 0,car_num,date_time,x,y,day
0,zixghz,2023-04-01 00:01:25,127.378452,36.298087,Saturday
1,zixghz,2023-04-01 00:05:13,127.38464,36.297051,Saturday
2,zixghz,2023-04-01 00:17:24,127.369007,36.32985,Saturday
3,zixghz,2023-04-01 00:25:00,127.37727,36.351743,Saturday
4,zixghz,2023-04-01 00:42:32,127.395335,36.37661,Saturday


시간대 그룹화 & 날씨 적용

In [4]:
# 승차시간을 datetime 타입으로 변경
taxi_data['date_time'] = pd.to_datetime(taxi_data['date_time'])

# 6시간 단위로 시간 그룹화
def group_by_three_hours(dt):
    hour = (dt.hour // 6) * 6
    return dt.replace(hour=hour, minute=0, second=0, microsecond=0)

# '승차시간'을 3시간 단위로 그룹화
taxi_data['time_group'] = taxi_data['date_time'].apply(group_by_three_hours)

In [5]:
taxi_data['time_group'].unique()

<DatetimeArray>
['2023-04-01 00:00:00', '2023-04-01 12:00:00', '2023-04-01 18:00:00',
 '2023-04-02 00:00:00', '2023-04-02 12:00:00', '2023-04-02 18:00:00',
 '2023-04-03 00:00:00', '2023-04-04 00:00:00', '2023-04-04 06:00:00',
 '2023-04-04 12:00:00',
 ...
 '2024-03-28 18:00:00', '2024-03-29 00:00:00', '2024-03-29 06:00:00',
 '2024-03-29 12:00:00', '2024-03-29 18:00:00', '2024-03-30 00:00:00',
 '2024-03-30 06:00:00', '2024-03-30 12:00:00', '2024-03-30 18:00:00',
 '2024-03-31 00:00:00']
Length: 1166, dtype: datetime64[ns]

In [6]:
display(taxi_data.head())
grouped_means.head()

Unnamed: 0,car_num,date_time,x,y,day,time_group
0,zixghz,2023-04-01 00:01:25,127.378452,36.298087,Saturday,2023-04-01
1,zixghz,2023-04-01 00:05:13,127.38464,36.297051,Saturday,2023-04-01
2,zixghz,2023-04-01 00:17:24,127.369007,36.32985,Saturday,2023-04-01
3,zixghz,2023-04-01 00:25:00,127.37727,36.351743,Saturday,2023-04-01
4,zixghz,2023-04-01 00:42:32,127.395335,36.37661,Saturday,2023-04-01


Unnamed: 0,시간그룹,승차X좌표,승차Y좌표,타임스탬프,날씨
0,2023-04-01 00:00:00,127.389334,36.337359,1680307200,맑음
1,2023-04-01 12:00:00,127.410449,36.444332,1680350400,약간의 구름이 낀 하늘
2,2023-04-01 18:00:00,127.391457,36.374136,1680372000,맑음
3,2023-04-02 00:00:00,127.403348,36.363562,1680393600,맑음
4,2023-04-02 12:00:00,127.382176,36.432141,1680436800,맑음


In [7]:
grouped_means['시간그룹'] = pd.to_datetime(grouped_means['시간그룹'])

weather_map = dict(zip(grouped_means['시간그룹'], grouped_means['날씨']))
taxi_data['weather'] = taxi_data['time_group'].apply(lambda x: weather_map[x])

taxi_data.drop('time_group', axis=1, inplace=True)
display(taxi_data.head())
taxi_data['weather'].unique()

Unnamed: 0,car_num,date_time,x,y,day,weather
0,zixghz,2023-04-01 00:01:25,127.378452,36.298087,Saturday,맑음
1,zixghz,2023-04-01 00:05:13,127.38464,36.297051,Saturday,맑음
2,zixghz,2023-04-01 00:17:24,127.369007,36.32985,Saturday,맑음
3,zixghz,2023-04-01 00:25:00,127.37727,36.351743,Saturday,맑음
4,zixghz,2023-04-01 00:42:32,127.395335,36.37661,Saturday,맑음


array(['맑음', '약간의 구름이 낀 하늘', '온흐림', '보통 비', '실 비', '구름조금', '튼구름', '강한 비',
       '가벼운 눈', '안개', '박무'], dtype=object)

날씨 카테고리가 너무 많기에 간단하게 통합

In [8]:
def simple_weather(x):
    if x in ['맑음', '구름조금', '약간의 구름이 낀 하늘', '튼구름']:
        return '맑음'
    elif x == '온흐림':
        return '흐림'
    elif x in ['보통 비', '실 비', '강한 비']:
        return '비'
    elif x == '가벼운 눈':
        return '눈'
    else:
        return '안개'

taxi_data['weather'] = taxi_data['weather'].apply(lambda x: simple_weather(x))
taxi_data['weather'].unique()

array(['맑음', '흐림', '비', '눈', '안개'], dtype=object)

In [9]:
def get_period_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'fall'
    else:
        return 'winter'

# 시간 파생 변수 생성 (0=00시, 23=23시)
taxi_data['month'] = taxi_data['date_time'].dt.month
taxi_data['hour'] = taxi_data['date_time'].dt.hour

# 주기성을 반영한 시간, 월 데이터 변환
taxi_data.loc[:, 'hour_sin'] = np.sin(2 * np.pi * taxi_data['hour'] / 24)
taxi_data.loc[:, 'hour_cos'] = np.cos(2 * np.pi * taxi_data['hour'] / 24)

taxi_data.loc[:, 'month_sin'] = np.sin(2 * np.pi * taxi_data['month'] / 12)
taxi_data.loc[:, 'month_cos'] = np.cos(2 * np.pi * taxi_data['month'] / 12)

# 시간대 변수 생성 (아침, 오후, 저녁, 밤)
taxi_data['time_zone'] = taxi_data['hour'].apply(get_period_of_day)

# 계절 변수 생성
taxi_data['season'] = taxi_data['month'].apply(get_season)

# 주중/주말 구분 파생 변수 생성 (1=주말, 0=주중)
taxi_data['is_weekend'] = taxi_data['date_time'].dt.weekday.apply(lambda x: 1 if x >= 5 else 0)

taxi_data.drop(['hour', 'month'], axis=1, inplace=True)
taxi_data.head()

Unnamed: 0,car_num,date_time,x,y,day,weather,hour_sin,hour_cos,month_sin,month_cos,time_zone,season,is_weekend
0,zixghz,2023-04-01 00:01:25,127.378452,36.298087,Saturday,맑음,0.0,1.0,0.866025,-0.5,Night,spring,1
1,zixghz,2023-04-01 00:05:13,127.38464,36.297051,Saturday,맑음,0.0,1.0,0.866025,-0.5,Night,spring,1
2,zixghz,2023-04-01 00:17:24,127.369007,36.32985,Saturday,맑음,0.0,1.0,0.866025,-0.5,Night,spring,1
3,zixghz,2023-04-01 00:25:00,127.37727,36.351743,Saturday,맑음,0.0,1.0,0.866025,-0.5,Night,spring,1
4,zixghz,2023-04-01 00:42:32,127.395335,36.37661,Saturday,맑음,0.0,1.0,0.866025,-0.5,Night,spring,1


결측값이 있는지 확인해보자. 행 95개가 x,y 좌표 값이 0인 것을 확인할 수 있다.

In [10]:
display(len(taxi_data))
len(taxi_data[taxi_data['x'] == 0])

16427

95

자세히 확인해보면 1~2개 데이터를 제외하곤 2초 간격으로 발생한 비정상적인 데이터라는 것을 확인할 수 있다. 따라서 GPS 오류로 확인하고 0인 좌표를 제거하는 것이 옳아 보인다.

In [11]:
taxi_data.loc[4613:4650, :]

Unnamed: 0,car_num,date_time,x,y,day,weather,hour_sin,hour_cos,month_sin,month_cos,time_zone,season,is_weekend
4613,zixghz,2023-06-30 18:03:47,127.420445,36.34251,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4614,zixghz,2023-06-30 18:03:49,0.0,0.0,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4615,zixghz,2023-06-30 18:12:55,127.438636,36.349665,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4616,zixghz,2023-06-30 18:12:57,0.0,0.0,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4617,zixghz,2023-06-30 18:21:40,0.0,0.0,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4618,zixghz,2023-06-30 18:46:07,127.432788,36.331903,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4619,zixghz,2023-06-30 18:46:09,0.0,0.0,Friday,맑음,-1.0,-1.83697e-16,1.224647e-16,-1.0,Evening,summer,0
4620,zixghz,2023-06-30 19:18:56,0.0,0.0,Friday,맑음,-0.965926,0.258819,1.224647e-16,-1.0,Evening,summer,0
4621,zixghz,2023-06-30 19:18:56,127.375381,36.420369,Friday,맑음,-0.965926,0.258819,1.224647e-16,-1.0,Evening,summer,0
4622,zixghz,2023-06-30 19:26:26,127.386799,36.428189,Friday,맑음,-0.965926,0.258819,1.224647e-16,-1.0,Evening,summer,0


다시 확인해보면 좌표가 0인 행이 제거되었다.

In [12]:
taxi_data = taxi_data[(taxi_data['x'] != 0) & (taxi_data['y'] != 0)]

display(len(taxi_data))
len(taxi_data[taxi_data['x'] == 0])

16332

0

명목형 변수(요일, 날씨, 계절)에 원-핫 인코딩을 적용해보자.

In [13]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(taxi_data[['day', 'weather', 'season', 'time_zone']])
eocoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out())

# 데이터프레임 연결 시 인덱스 문제로 인해 초기화
taxi_data.reset_index(drop=True, inplace=True)
eocoded_df.reset_index(drop=True, inplace=True)

taxi_data = pd.concat([taxi_data, eocoded_df], axis=1)
taxi_data.drop(['date_time', 'day', 'weather', 'car_num', 'season', 'time_zone'], axis=1, inplace=True)
taxi_data.head()

Unnamed: 0,x,y,hour_sin,hour_cos,month_sin,month_cos,is_weekend,day_Monday,day_Saturday,day_Sunday,...,weather_맑음,weather_비,weather_안개,weather_흐림,season_spring,season_summer,season_winter,time_zone_Evening,time_zone_Morning,time_zone_Night
0,127.378452,36.298087,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,127.38464,36.297051,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,127.369007,36.32985,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,127.37727,36.351743,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,127.395335,36.37661,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [14]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=42)
taxi_data['cluster'] = kmeans.fit_predict(taxi_data)

taxi_data.head()

Unnamed: 0,x,y,hour_sin,hour_cos,month_sin,month_cos,is_weekend,day_Monday,day_Saturday,day_Sunday,...,weather_비,weather_안개,weather_흐림,season_spring,season_summer,season_winter,time_zone_Evening,time_zone_Morning,time_zone_Night,cluster
0,127.378452,36.298087,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
1,127.38464,36.297051,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
2,127.369007,36.32985,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
3,127.37727,36.351743,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
4,127.395335,36.37661,0.0,1.0,0.866025,-0.5,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3


In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

# Haversine 공식을 사용하여 두 좌표 사이의 거리를 계산하는 함수
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # 지구 반지름 (단위: km)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

# 데이터 준비
# 'x'와 'y'가 예측할 대상이고, 나머지 열들은 모델에 입력되는 피처
features = taxi_data.drop(columns=['x', 'y'])
targets = taxi_data[['x', 'y']]

# 데이터를 정규화 (LSTM은 정규화된 데이터를 더 잘 학습)
scaler = MinMaxScaler()
targets_scaled = scaler.fit_transform(targets)
targets_scaled = pd.DataFrame(targets_scaled, columns=scaler.get_feature_names_out())

# 시퀀스 데이터 생성 함수
def create_sequences(features, targets, time_steps=10):
    X, y = [], []
    for i in range(len(features) - time_steps):
        X.append(features[i:i + time_steps])
        y.append(targets.iloc[i + time_steps].values)
    return np.array(X), np.array(y)

# 시퀀스 데이터 생성
time_steps = 10  # 10개의 이전 시점을 사용해 예측
X_seq, y_seq = create_sequences(features, targets_scaled, time_steps)

# 훈련 및 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# LSTM 모델 구성
model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))  # Input 객체 사용
model.add(LSTM(64, activation='relu'))
model.add(Dense(2))  # X, Y 좌표 두 개의 값을 예측

# 모델 컴파일
model.compile(optimizer='adam', loss='mean_squared_error')

# 모델 학습
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# 예측
y_pred = model.predict(X_test)

# 예측값과 실제값을 역정규화 (실제 위도/경도 좌표로 복원)
y_test_original = scaler.inverse_transform(y_test)
y_pred_original = scaler.inverse_transform(y_pred)

# Haversine 공식을 사용하여 실제 거리 오차 계산
distances = haversine_distance(
    y_test_original[:, 1], y_test_original[:, 0],  # 실제 경도, 위도
    y_pred_original[:, 1], y_pred_original[:, 0]   # 예측 경도, 위도
)

# 평균 거리 오차 출력
mean_distance_error = np.mean(distances)
print(f"평균 거리 오차: {mean_distance_error:.2f} km")


Epoch 1/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.0547 - val_loss: 0.0077
Epoch 2/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0076 - val_loss: 0.0076
Epoch 3/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.0071 - val_loss: 0.0073
Epoch 4/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.0069 - val_loss: 0.0076
Epoch 5/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0073 - val_loss: 0.0073
Epoch 6/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0069 - val_loss: 0.0070
Epoch 7/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0067 - val_loss: 0.0073
Epoch 8/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0067 - val_loss: 0.0072
Epoch 9/20
[1m409/409[0m [32m━━━━━━━━

대전 내에서 오차 기준 설정:
* 매우 좋은 모델: 평균 오차가 100m 이하 (0.1km) – 매우 세밀한 위치 예측이 필요한 경우
* 좋은 모델: 평균 오차가 200~300m 이하 – 일반적인 위치 예측
* 보통 수준의 모델: 평균 오차가 500m 이하
* 오차가 큰 모델: 평균 오차가 1km 이상

In [16]:
# # 시간별 count 계산
# hourly_count = taxi_data.groupby(taxi_data['승차시간'].dt.hour)['차량번호'].count()

# # 요일별 count 계산
# weekly_count = taxi_data.groupby(taxi_data['승차시간'].dt.weekday)['차량번호'].count()

# # 시각화
# plt.figure(figsize=(10, 8))

# plt.subplot(2, 1, 1)
# hourly_count.plot(kind='bar', color='skyblue')
# plt.xlabel('Hour of Day')
# plt.ylabel('Count')

# plt.subplot(2, 1, 2)
# weekly_count.plot(kind='bar', color='orange')
# plt.xlabel('Day of Week')
# plt.ylabel('Count')

## 날씨 데이터 얻는 과정

In [17]:
# grouped_means = taxi_data.groupby('시간그룹')[['승차X좌표', '승차Y좌표']].mean().reset_index()
# grouped_means.head()

In [18]:
# def convert_timestamp(date_time):
#     timestamp = int(date_time.timestamp())
#     return timestamp

# grouped_means['타임스탬프'] = grouped_means['시간그룹'].apply(convert_timestamp)
# grouped_means['날씨'] = None
# grouped_means.head()

In [19]:
# API_key = '26b2690c05559a7396f701113f306e9f'
# lang = 'kr'

# for index, row in grouped_mean.iterrows():
#     lon = row['승차X좌표']
#     lat = row['승차Y좌표']
#     timestamp = row['타임스탬프']

#     url = f'https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={timestamp}&appid={API_key}&lang={lang}'
#     response = requests.get(url)

#     if response.status_code == 200:
#         weather_description = response.json()['data'][0]['weather'][0]['description']
#         grouped_means.loc[index, '날씨'] = weather_description
#     else:
#         grouped_means.loc[index, '날씨'] = None

In [20]:
# grouped_means.head()

In [21]:
# grouped_means.to_csv('data/taxi/group_means.csv', index=False, encoding='euc-kr')