Python 코드 최적화 공부
2024-01-08~

keyword: pandas,...


# 판다스 코드 속도 최적화
- 참고: https://aldente0630.github.io/data-science/2018/08/05/a-beginners-guide-to-optimizing-pandas-code-for-speed.html

- 파일다운로드: https://github.com/s-heisler/pycon2017-optimizing-pandas

In [5]:
import pandas as pd
import numpy as np
from math import *

In [6]:
# Haversine great-circle distance formula
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in miles.

    All four coordinates are given in degrees and converted to radians
    with ``np.deg2rad``. Only NumPy functions are applied to the inputs,
    so scalars and array-likes are handled element-wise.

    Parameters
    ----------
    lat1, lon1
        Latitude and longitude of the first point, in degrees.
    lat2, lon2
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    The haversine distance on a sphere of radius 3959 (miles).
    """
    MILES = 3959  # sphere radius used to scale the central angle
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make arcsin(sqrt(a)) evaluate to NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))  # central angle in radians
    return MILES * c

In [7]:
# Read data: load the hotels CSV into `df`, decoding the file as cp1252.
df = pd.read_csv(
    'new_york_hotels.csv',
    encoding='cp1252',
)

## 1. 단순 반복 코드

In [8]:
def haversine_looping(df):
    """Compute the haversine distance for every row with a plain index loop.

    Each row is looked up by position with ``df.iloc`` and its
    'latitude' / 'longitude' values are passed to ``haversine`` together
    with the fixed reference point (40.671, -73.985).

    Parameters
    ----------
    df
        DataFrame with 'latitude' and 'longitude' columns.

    Returns
    -------
    list
        One distance per row, in positional row order.
    """
    distances = []
    for pos in range(len(df)):
        # Two separate positional lookups per row, as in the simple-loop baseline.
        row_lat = df.iloc[pos]['latitude']
        row_lon = df.iloc[pos]['longitude']
        distances.append(haversine(40.671, -73.985, row_lat, row_lon))
    return distances

In [9]:
%%timeit

# Time the simple-loop version: run haversine_looping over df and store the
# returned list in the 'distance' column.
df['distance'] = haversine_looping(df)

102 ms ± 595 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 2. iterrows()를 사용한 반복

In [10]:
%%timeit

# Time the iterrows() version: apply the Haversine function to each row while
# iterating, using the same fixed reference point (40.671, -73.985).
haversine_series = []
for index, row in df.iterrows():  # `index` is not used in the loop body
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
# Store the collected distances in the 'distance' column.
df['distance'] = haversine_series

60.1 ms ± 399 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
